# 导入所需的库 (import the required libraries)
# One import statement per line; the original had all of them fused onto a
# single line (a syntax error) and listed nltk twice.
import jieba
import jieba.posseg
import nltk
import pandas as pd
# 清洗数据 (clean the data)
# Load the full text of the novel and delete the characters listed below
# (line breaks, spaces and punctuation) so that `hlm` holds one continuous
# string of text.
# NOTE(review): the comma, semicolon, colon, exclamation mark and question
# mark in this list are the ASCII forms; if the text file uses the
# full-width forms they will not be stripped -- confirm against the file.
_REMOVED_CHARS = "\n\u3000“”,。';:、!’?‘. 》《"

# `with` closes the file handle as soon as the text has been read.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm; the original
# relied on the platform default encoding.
with open('/home/bluejade/AI/ML/CODE/nlp/聊天机器人/《红楼梦》完整版.txt',
          encoding='utf-8') as _corpus_file:
    hlm = _corpus_file.read()

# A single translate() pass deletes every listed character; it replaces the
# chain of single-character replace() calls (which listed '’' twice).
hlm = hlm.translate(str.maketrans('', '', _REMOVED_CHARS))
# 词性标注 (part-of-speech tagging)
# Segment the cleaned text and tag every token. Each item yielded by
# jieba.posseg.cut unpacks into (token, tag); the tokens are collected in
# `words` and the matching tags, in the same order, in `worders`.
tagged = jieba.posseg.cut(hlm)
_token_tag_pairs = [(token, flag) for token, flag in tagged]
words = [token for token, _ in _token_tag_pairs]
worders = [flag for _, flag in _token_tag_pairs]
# 保存标记好的词性在本地 (save the tagged parts of speech locally)
# Build a two-column table -- '词' holds the tokens and '词性' the tag of each
# token -- and write it to an Excel workbook.
# The original created an empty frame and then assigned the columns, with a
# stray '?' token between the statements that made the cell a syntax error.
table = pd.DataFrame({'词': words, '词性': worders})

# index=True also writes the row index as the first column of the sheet.
table.to_excel('/home/bluejade/AI/ML/CODE/nlp/聊天机器人/《红楼梦》完整版词性标准.xlsx', index=True)
[object Object]