由于NER和RE是分开进行的,即以pipeline形式进行知识抽取,其过程中数据流的走向如下:
原始Word/Excel数据->筛选提取文本->分词标注生成ent_ann文件->生成BIO标注集训练数据(分词方案)->NER->模型预测->导出预测结果ent_ann文件->生成rel_ann文件->RE->模型预测->导出预测结果rel_ann文件
NER模型训练结果预测示例
NER预测的ann文件再做预处理追加relation标注
def get_relation_ann(ent_dir, rel_dir, relations=None, max_distance=50):
    """Build candidate-relation .ann files from entity-only .ann files.

    For every ``<name>.ann`` file in ``ent_dir`` a file with the same name is
    written to ``rel_dir``. It contains the original lines followed by one
    ``R<n>`` line for every pair of entities whose categories form a known
    relation and whose start offsets are less than ``max_distance`` apart.

    Args:
        ent_dir: Directory holding the entity annotation files.
        rel_dir: Existing output directory. Every file already in it is
            deleted first.
        relations: Collection of allowed relation labels of the form
            ``"<category>-<category>"``. Defaults to the module-level
            ``RELATIONS``.
        max_distance: Exclusive upper bound on the distance between the start
            offsets of the two entities of a candidate pair.

    Raises:
        ValueError: If a line starting with ``T`` does not have the
            ``id<TAB>category start ... end<TAB>text`` layout.
    """
    if relations is None:
        relations = RELATIONS

    # Start from an empty output directory so stale results never mix with
    # the files generated below.
    for stale in os.listdir(rel_dir):
        os.remove(os.path.join(rel_dir, stale))

    def parse_entity(line):
        """Return ``(entity_id, category, start_offset)`` for one entity line."""
        ent_id, label, _text = line.strip().split('\t')
        category, span = label.split(' ', 1)
        return ent_id, category, int(span.split(' ')[0])

    # Only .ann files are inputs; sorting makes the processing order stable.
    file_ids = sorted(
        os.path.splitext(name)[0]
        for name in os.listdir(ent_dir)
        if name.endswith('.ann')
    )

    for file_id in file_ids:
        src = os.path.join(ent_dir, file_id + '.ann')
        dst = os.path.join(rel_dir, file_id + '.ann')
        with open(src, 'r', encoding='utf8') as inp:
            lines = inp.readlines()

        # Parse each entity once instead of once per candidate pair; lines
        # that are not entity annotations (blank lines, notes) are ignored.
        entities = [parse_entity(line) for line in lines if line.startswith('T')]

        with open(dst, 'w', encoding='utf-8') as outp:
            # Copy the input, making sure the last line is terminated so the
            # first relation line does not get glued onto it.
            for line in lines:
                outp.write(line if line.endswith('\n') else line + '\n')

            count = 1
            for i, (id1, cat1, start1) in enumerate(entities):
                for id2, cat2, start2 in entities[i + 1:]:
                    if abs(start2 - start1) >= max_distance:
                        continue
                    forward = cat1 + '-' + cat2
                    backward = cat2 + '-' + cat1
                    if forward in relations:
                        relation, head, tail = forward, id1, id2
                    elif backward in relations:
                        # Only the reversed label is known: swap the arguments
                        # as well so Arg1/Arg2 agree with the label's order.
                        relation, head, tail = backward, id2, id1
                    else:
                        continue
                    outp.write('R{}\t{} Arg1:{} Arg2:{}\n'.format(
                        count, relation, head, tail))
                    count += 1
RE模型训练结果预测示例
最终结果
由实体关系抽取结果可以看到,英文、单个字符可以首先排除,其他部分很难做处理。
rels_开头的第一行是原始按序遍历的关系数目,第二行是去重后的关系数目。
导入图数据库neo4j
class MedicalGraph:
    """Import entity/relation annotation files into a Neo4j graph.

    ``read_nodes`` parses every file below ``self.rel_dir``;
    ``create_graphnodes`` and ``create_graphrels`` push the parsed entities
    and relations to the graph handle ``self.g``.
    """

    # (printed name, entity category) in the order read_nodes reports and
    # returns the node collections.
    _NODE_FIELDS = (
        ('drugs', 'drug'),
        ('operations', 'operation'),
        ('departments', 'department'),
        ('examinations', 'examination'),
        ('anatomy', 'anatomy'),
        ('symptoms', 'symptom'),
        ('diseases', 'disease'),
    )

    # (printed name, relation label) in the order read_nodes reports and
    # returns the relation lists.
    _REL_FIELDS = (
        ('rels_drug', 'disease-drug'),
        ('rels_examinations', 'disease-examination'),
        ('rels_symptom', 'disease-symptom'),
        ('rels_acompany', 'disease-disease'),
        ('rels_category', 'disease-department'),
        ('rels_anatomy', 'disease-anatomy'),
        ('rels_operations', 'disease-operation'),
    )

    def __init__(self):
        """Remember the relation result directory and open the graph handle."""
        self.rel_dir = rel_result
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration or the environment.
        self.g = Graph(
            host="127.0.0.1",  # IP address of the server running neo4j
            http_port=7474,  # port the neo4j server listens on
            user="neo4j",  # database user name ("neo4j" unless changed)
            password="1234")

    def read_nodes(self):
        """Parse all annotation files under ``self.rel_dir``.

        Entity texts of length <= 1 are ignored, both as nodes and as relation
        arguments. Entity IDs are resolved per file. Relations that reference
        an ID not defined in the same file, and lines that do not have the
        expected layout, are skipped.

        Returns:
            A 15-tuple: seven sets of entity texts (drugs, operations,
            departments, examinations, anatomy, symptoms, diseases), the
            (always empty) ``disease_infos`` list, and seven de-duplicated
            lists of ``[head_text, tail_text]`` pairs (drug, examination,
            symptom, accompanying disease, department, anatomy, operation).
        """
        nodes = {category: [] for _, category in self._NODE_FIELDS}
        rels = {category: [] for _, category in self._REL_FIELDS}
        disease_infos = []  # kept for interface compatibility; never filled

        for root, _dirs, filenames in os.walk(self.rel_dir):
            for filename in filenames:
                self._read_ann_file(os.path.join(root, filename), nodes, rels)

        for name, category in self._NODE_FIELDS:
            print(name + ':', len(nodes[category]))

        # For each relation kind print the raw count, then the count after
        # removing duplicates.
        unique_rels = []
        for name, category in self._REL_FIELDS:
            print(name + ':', len(rels[category]))
            deduped = self.shuffle(rels[category])
            print(name + ':', len(deduped))
            unique_rels.append(deduped)

        node_sets = tuple(set(nodes[category]) for _, category in self._NODE_FIELDS)
        return node_sets + (disease_infos,) + tuple(unique_rels)

    def _read_ann_file(self, path, nodes, rels):
        """Append the entities and relations of one file to ``nodes``/``rels``."""
        with open(path, 'r', encoding='utf8') as inp:
            lines = [line.strip() for line in inp]

        # Entity IDs (T1, T2, ...) are only meaningful inside one file, so the
        # lookup table is rebuilt for every file.
        entities = {}
        for line in lines:
            if not line.startswith('T'):
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                continue
            ent_id, label, text = fields
            category = label.split(' ', 1)[0]
            entities[ent_id] = (category, text)
            if len(text) > 1 and category in nodes:
                nodes[category].append(text)

        # Relations are handled in a second pass so that they may reference
        # entities defined anywhere in the file.
        for line in lines:
            if not line.startswith('R'):
                continue
            parsed = self._parse_relation(line)
            if parsed is None:
                continue
            category, arg1, arg2 = parsed
            if category not in rels or arg1 not in entities or arg2 not in entities:
                continue
            cat1, text1 = entities[arg1]
            cat2, text2 = entities[arg2]
            if len(text1) <= 1 or len(text2) <= 1:
                continue
            head_cat, _, tail_cat = category.partition('-')
            if head_cat != tail_cat and (cat1, cat2) == (tail_cat, head_cat):
                # Arguments are in the opposite order of the label (e.g. a
                # drug as Arg1 of "disease-drug"): put the head entity first,
                # otherwise create_relationship can never match the pair.
                text1, text2 = text2, text1
            if category == 'disease-disease' and text1 == text2:
                continue
            rels[category].append([text1, text2])

    @staticmethod
    def _parse_relation(line):
        """Return ``(label, arg1_id, arg2_id)`` or None for a malformed line."""
        fields = line.split('\t')
        if len(fields) != 2:
            return None
        parts = fields[1].split(' ')
        if len(parts) != 3:
            return None
        category, arg1, arg2 = parts
        if ':' not in arg1 or ':' not in arg2:
            return None
        return category, arg1.split(':')[1], arg2.split(':')[1]

    def shuffle(self, rel_list):
        """Return ``rel_list`` without duplicates, keeping first-seen order.

        Despite its name this method does not randomise anything. Items must
        be sequences of hashable values.
        """
        seen = set()
        unique = []
        for item in rel_list:
            key = tuple(item)
            if key not in seen:
                seen.add(key)
                unique.append(item)
        return unique

    def create_node(self, label, nodes):
        """Create one graph node with the given label per name in ``nodes``."""
        total = len(nodes)
        for count, node_name in enumerate(nodes, 1):
            self.g.create(Node(label, name=node_name))
            print(count, total)

    def create_diseases_nodes(self, disease_infos):
        """Create the central "Disease" nodes from attribute dictionaries.

        Every dictionary must provide the keys listed in ``fields`` below.
        """
        fields = ('name', 'desc', 'prevent', 'cause', 'easy_get',
                  'cure_lasttime', 'cure_department', 'cure_way', 'cured_prob')
        for count, disease_dict in enumerate(disease_infos, 1):
            properties = {field: disease_dict[field] for field in fields}
            self.g.create(Node("Disease", **properties))
            print(count)

    def create_graphnodes(self):
        """Create the nodes of every entity type found by ``read_nodes``."""
        (Drugs, Operations, Departments, Examinations, Anatomy, Symptoms,
         Diseases) = self.read_nodes()[:7]
        self.create_node('Disease', Diseases)
        self.create_node('Drug', Drugs)
        print(len(Drugs))
        self.create_node('Operation', Operations)
        print(len(Operations))
        self.create_node('Examination', Examinations)
        print(len(Examinations))
        self.create_node('Department', Departments)
        print(len(Departments))
        self.create_node('Anatomy', Anatomy)
        print(len(Anatomy))
        self.create_node('Symptom', Symptoms)

    def create_graphrels(self):
        """Create the edges of every relation type found by ``read_nodes``."""
        (rels_drug, rels_examinations, rels_symptom, rels_acompany,
         rels_category, rels_anatomy, rels_operations) = self.read_nodes()[8:]
        self.create_relationship('Disease', 'Drug', rels_drug, 'common_drug', '常用药品')
        self.create_relationship('Disease', 'Examination', rels_examinations, 'need_check', '诊断检查')
        self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
        self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
        self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')
        self.create_relationship('Disease', 'Anatomy', rels_anatomy, 'in_anatomy', '部位')
        self.create_relationship('Disease', 'Operation', rels_operations, 'need_operation', '手术')

    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        """Create a ``rel_type`` edge for every distinct ``(head, tail)`` pair.

        Args:
            start_node: Label of the node the edge starts from.
            end_node: Label of the node the edge points to.
            edges: Iterable of ``[head_name, tail_name]`` pairs.
            rel_type: Relationship type used in the graph.
            rel_name: Value stored in the edge's ``name`` property.

        Labels and the relationship type cannot be passed as Cypher
        parameters, so they are interpolated and must be trusted identifiers.
        Node names and ``rel_name`` are sent as query parameters, which keeps
        quotes inside entity text from breaking or altering the query.
        """
        # Order-preserving de-duplication of the pairs.
        unique_edges = list(dict.fromkeys((edge[0], edge[1]) for edge in edges))
        total = len(unique_edges)
        query = ("match(p:%s),(q:%s) where p.name=$p_name and q.name=$q_name "
                 "create (p)-[rel:%s{name:$rel_name}]->(q)") % (
                     start_node, end_node, rel_type)
        count = 0
        for p_name, q_name in unique_edges:
            try:
                self.g.run(query, p_name=p_name, q_name=q_name, rel_name=rel_name)
            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                print(e)
            else:
                count += 1
                print(rel_type, count, total)