Advertisement

医疗疾病知识图谱挖掘 实战 计算机毕设

阅读量:
复制代码
 # pip install annoy -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn

    
 # pip install gensim==4.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
    
 from torch import tensor
    
 from sklearn.metrics import f1_score
    
 from datetime import datetime
    
 import time
    
 from collections import Counter
    
 import re
    
 import jieba
    
  
    
 import pandas as pd
    
 import time
    
 import numpy as np
    
 from tqdm import tqdm
    
 import os
    
 import gensim
    
 from gensim.models import KeyedVectors
    
 from gensim.scripts.glove2word2vec import glove2word2vec
    
 from gensim.models import Word2Vec
    
 from gensim.models.word2vec import LineSentence
    
 from tqdm import tqdm
    
 import networkx as nx
    
 import matplotlib.pyplot as plt
    
 import random
    
 import numpy as np
    
 # Matplotlib font setup: SimHei so that Chinese labels render, and
 # axes.unicode_minus off so that the minus sign displays correctly.
 plt.rcParams.update({
     'font.sans-serif': ['SimHei'],
     'axes.unicode_minus': False,
 })
    
  
    
 import pandas as pd

 # Load the patient / disease / symptom sheet. Empty cells are replaced by the
 # sentinel value 1 so that they can be filtered out when edges are built.
 data = pd.read_excel('病人-疾病-症状.xlsx')
 data = data.fillna(1)
 print(data.head(5))
 data = data.values

 # Build (first column, other column value) pairs from the first 1000 rows.
 # The first column is presumably the patient id -- verify against the sheet.
 # Cells equal to the fill sentinel 1 or to a single blank are skipped.
 # NOTE: the original paste had lost the nesting of the inner `if`; it is
 # restored here so that the filter runs once per cell.
 truple_lists = [
     (row[0], value)
     for row in data[:1000]
     for value in row[1:]
     if value != 1 and value != ' '
 ]
 print(truple_lists)
    
 # Undirected graph whose edges are the pairs collected in truple_lists.
 g = nx.Graph(truple_lists)

 # print(g.edges())

 print('g.nodes()', g.nodes())
 # print(g.degree())

 # Optional: draw the graph with every node labelled by its string form.
 # labels = {node: str(node) for node in g.nodes()}
 # nx.draw(g, labels=labels)
 # plt.show()
    
  
    
 # Seed both random number generators so later results can be reproduced.
 random.seed(1)
 np.random.seed(0)

 num_walks = 10    # number of passes over the node set
 walk_length = 20  # maximum number of nodes in a single walk
 walks = []
 for _ in tqdm(range(num_walks)):
     for start in g.nodes():
         # Only string-labelled nodes are used as starting points.
         if not isinstance(start, str):
             continue
         walk = [start]
         current = start
         while len(walk) < walk_length:
             neighbors = list(g.neighbors(current))
             if not neighbors:
                 # Dead end, e.g. a node of a directed graph that has no
                 # outgoing neighbour: stop this walk early.
                 break
             # Pick a neighbour by index. np.random.choice(neighbors) would
             # first convert the list to a NumPy array, which turns a list of
             # mixed label types into strings and can return a label that is
             # not a node of the graph.
             current = neighbors[np.random.randint(len(neighbors))]
             walk.append(current)
         walks.append(walk)
 print(len(walks))
 print(np.array(walks).shape)
    
  
    
 # Write the walks as the Word2Vec training corpus: one walk per line, node
 # labels separated by single spaces. The corpus only needs to be produced
 # once, so the file is opened in "w" mode: appending ("a+") would duplicate
 # every walk on each re-run and inflate the token counts seen by training.
 with open("word2vec_txt.txt", "w", encoding='utf-8') as f:
     for walk in tqdm(walks):
         # str() keeps the join valid if a walk holds a non-string label.
         f.write(" ".join(map(str, walk)))
         f.write("\n")
    
 # Train 64-dimensional CBOW (sg=0) embeddings on the walk corpus.
 # gensim 4.x (the version pinned in the install note at the top of the
 # script) renamed the `size` argument to `vector_size`. LineSentence is given
 # the file path so that it opens and closes the corpus file itself instead of
 # receiving a handle that is never closed.
 model = Word2Vec(LineSentence('word2vec_txt.txt'), sg=0, vector_size=64, window=8, min_count=2, workers=4)

 # Persist the trained model.
 model.save('word2vec.model')

 # Load the vectors back from the saved model (recommended way).
 model_vec = gensim.models.Word2Vec.load('word2vec.model')

 # Vocabulary as a list of tokens; gensim 4.x exposes it as `index_to_key`
 # (the former `index2word` attribute was removed).
 dic = model_vec.wv.index_to_key
 # print(dic)
 print(len(dic))

 # Sanity checks: the raw vector of one node and the nearest neighbours of two
 # nodes. Each lookup raises KeyError if the token is not in the vocabulary.
 print(model_vec.wv['不稳定性心绞痛'])
 print(model_vec.wv.most_similar('不稳定性心绞痛', topn=2))
 print(model_vec.wv.most_similar('p4', topn=2))
    
  
    
  
    
 # Dimensionality reduction and visualisation of the patient embeddings.
 from gensim.models import Word2Vec
 from random import sample
 from sklearn.manifold import TSNE
 from pylab import mpl

 mpl.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese
 mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign displayable

 # Keep only the vocabulary entries whose token starts with 'p'
 # (presumably the patient ids -- verify against the data) with their vectors.
 # NOTE: the original paste had lost the nesting of this `if` body.
 word_list_vec = []
 for word in dic:
     if str(word).startswith('p'):
         word_list_vec.append([word, model.wv[word]])

 plt.figure(figsize=(15, 15))  # canvas size
 # Candidate colours; valid values for `c` include
 # ['c', 'b', 'g', 'r', 'm', 'y', 'k', 'w'].
 color = ['b', "r", "g", "k"]
 color_label = ['b', "r", "g", "k"]
 # Blank markers: only the text labels are meant to be visible.
 marker = [" ", " ", " ", " "]

 labels = [entry[0] for entry in word_list_vec]
 tokens = [entry[1] for entry in word_list_vec]  # the embedding vectors

 # perplexity: default 30, suggested range 5-50, larger for larger data sets.
 # n_components=2: dimension of the embedded (output) space.
 # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
 # confirm against the installed version before changing it.
 tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2500, random_state=23)
 print(len(tokens))
 # print(tokens)

 # Project the vectors into the 2-D embedded space.
 new_values = tsne_model.fit_transform(np.array(tokens))
 # print(new_values)

 x = new_values[:, 0]
 y = new_values[:, 1]

 # One scatter call places all points; every point is then annotated with
 # its token.
 plt.scatter(x, y, c=color[1], marker=marker[1])
 for x_pos, y_pos, label in zip(x, y, labels):
     plt.text(x_pos, y_pos, label, fontsize=10, color=color_label[1])

 plt.xticks(fontsize=20)
 plt.yticks(fontsize=20)

 plt.show()
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/Ctl9VZRwUirmaTIB3QO2AvpDWqL5.png)

全部评论 (0)

还没有任何评论哟~