一、建立数据库连接
这里使用PostgreSQL
from sqlalchemy import create_engine
name = "postgres"
pw = "postgres"
host = "127.0.0.1"
port = "5432"
database = "postgres"
# 用sqlalchemy构建数据库链接engine
connect_info = "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
name, pw, host, port, database
)
database_info = create_engine(connect_info)
二、药品名称字典读取
import pandas as pd
drugs=pd.read_sql('SELECT DISTINCT "药品名称" FROM "表名";',database_info)
三、添加字典
import jieba
import jieba.posseg as pseg #词性标注
for drug in [drug[0] for drug in drugs.values]:
# print(drug)
jieba.add_word(drug, freq=42, tag='药品') #设置词频和词性
四、构造函数实现词性标记并过滤药品
# 集合
def get_drugs(sentence):
words = pseg.cut(sentence)
drugs=set()#集合
for word, flag in words:
# print("{0} {1}".format(word, flag))
if flag=='药品':
drugs.add(word)
return ';'.join(drugs)
五、函数测试
sentence='Nab-paclitaxel (260mg/m2) will be administered every 3 weeks'
get_drugs(sentence)
六、函数运用
df=pd.read_sql('SELECT DISTINCT "文本" FROM "表名" WHERE "文本" IS NOT NULL;',database_info)
df.head()
df['drug']=df.apply(lambda x : get_drugs(x['文本']),axis=1)
七、数据保存
df.to_sql('表名',database_info,if_exists='replace')