Advertisement

基于 Python 机器学习的中风患者疾病分析与预测(完整数据和代码,可直接运行)

阅读量:

数据分析结果展示:

完整代码:

复制代码
 import pandas as pd

    
 import numpy as np
    
 import matplotlib.pyplot as plt
    
 import seaborn as sns
    
 plt.rcParams['font.sans-serif']='simhei'
    
 plt.rcParams['axes.unicode_minus']=False
    
 import warnings
    
 warnings.filterwarnings("ignore")
    
 from sklearn.preprocessing import StandardScaler
    
 from xgboost import XGBClassifier
    
 from sklearn.ensemble import StackingClassifier
    
 from sklearn.neural_network import MLPClassifier
    
  
    
  
    
 from sklearn.model_selection import train_test_split
    
 from sklearn.linear_model import LogisticRegression
    
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
  
    
 from sklearn.model_selection import train_test_split
    
 from sklearn.ensemble import RandomForestClassifier
    
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
  
    
 # Load the dataset; the CSV is read with the GB2312 encoding.
 df = pd.read_csv('brain_stroke.csv', encoding='GB2312')
 print(df.head(5))

 # DataFrame.info() writes its report to stdout itself and returns None, so it
 # is called directly; wrapping it in print() would emit an extra "None" line.
 df.info()
    
 # Label every bar of a plot with its share of the total number of rows.
 def annot_plot(ax, data):
     """Hide the top/right spines of *ax* and label each bar with a percentage.

     Every patch in ``ax.patches`` gets a text label, offset 10 points above
     the top centre of the bar, showing ``height * 100 / data.shape[0]``
     formatted with two decimals.

     Args:
         ax: Axes-like object exposing ``spines``, ``patches`` and ``annotate``.
         data: Object whose ``shape[0]`` is used as the percentage denominator.

     Returns:
         None. The axes object is modified in place.
     """
     ax.spines["top"].set_visible(False)
     ax.spines["right"].set_visible(False)

     total = data.shape[0]
     if total == 0:
         # No rows to compare against: skip the labels rather than divide by 0.
         return

     for p in ax.patches:
         height = p.get_height()
         ax.annotate(
             f"{height * 100 / total:.2f}%",
             (p.get_x() + p.get_width() / 2, height),
             ha="center", va="center", fontsize=10, color="black",
             rotation=0, xytext=(0, 10), textcoords="offset points",
         )
    
 # Column groups: categorical features are drawn as count plots, numeric
 # features as distribution plots.
 object_cols = ['性别', '是否患有高血压', '是否患有心脏病', '是否有过婚姻',
                '工作类型', '住宅类型', '吸烟状况']
 num_cols = ['年龄', '血糖水平', 'BMI']

 # Every panel is created with plt.subplot() below, so only an empty figure is
 # needed here; plt.subplots() would additionally create a full-figure Axes
 # that none of the panels use.
 fig = plt.figure(figsize=(15, 24))

 # Panels 1-7: category counts split by the stroke label; annot_plot() labels
 # each bar with its share of all rows.
 for idx, i in enumerate(object_cols, start=1):
     ax = plt.subplot(4, 3, idx)
     sns.countplot(x=i, data=df, hue='是否中风', ax=ax)
     annot_plot(ax, df)
     plt.xlabel(None)
     plt.ylabel("中风发病率")
     plt.title(i)
     ax.title.set_fontsize(12)
     plt.legend()

 # Panels 8-10: distributions of the numeric features.
 for idx, i in enumerate(num_cols, start=len(object_cols) + 1):
     # Bind ax to the panel that is actually being drawn. Reusing the Axes
     # left over from the previous loop would decorate the last count plot
     # again instead of this panel.
     ax = plt.subplot(4, 3, idx)
     # NOTE(review): distplot is deprecated in recent seaborn releases in
     # favour of histplot/displot - confirm the installed version.
     sns.distplot(df[i], ax=ax)
     # Only the frame is tidied here: annot_plot()'s labels are bar height
     # divided by the row count, which is not a meaningful figure for the
     # bars of a distribution plot.
     ax.spines["top"].set_visible(False)
     ax.spines["right"].set_visible(False)
     plt.ylabel(None)
     plt.title(i)
     plt.xlabel(None)

 plt.show()
    
 # Replace the text labels of the two-valued columns with integer codes.
 # Gender and residence type share a single lookup table.
 map_dict1 = {"男性": 1, "女性": 0, "城市": 1, "农村": 0}
 for col in ('性别', '住宅类型'):
     df[col] = df[col].map(map_dict1)
 print(df.head(5))

 # Distinct labels present in the smoking-status and work-type columns.
 b = df['吸烟状况'].unique()
 a = df['工作类型'].unique()

 # Integer codes for smoking status and work type, merged into one table that
 # is applied to both columns.
 smoking_codes = {"不详": 0, "从不吸烟": 1, "以前吸烟": 2, "吸烟": 3}
 work_codes = {"儿童": 0, "自雇人士": 1, "私人企业": 2, "政府部门": 3}
 map_dict2 = {**smoking_codes, **work_codes}
 for col in ('工作类型', '吸烟状况'):
     df[col] = df[col].map(map_dict2)
 print(df.head(5))
    
  
    
 # Bucket BMI into four classes. The first matching condition wins, so the
 # thresholds are checked in ascending order:
 #   0: BMI < 18.5   1: 18.5 <= BMI < 24   2: 24 <= BMI < 28   3: everything else
 bmi = df['BMI']
 bmi_category = np.select([bmi < 18.5, bmi < 24, bmi < 28], [0, 1, 2], default=3)
 df['BMI类别'] = bmi_category

 # Bucket the glucose level into three classes:
 #   0: below 70   2: above 100   1: everything else
 glucose = df['血糖水平']
 blood_sugar = np.select([glucose < 70, glucose > 100], [0, 2], default=1)
 df['血糖类别'] = blood_sugar

 # Subset holding the encoded/bucketed features together with the target.
 selected_columns = [
     '性别', '年龄', '是否患有高血压', '是否患有心脏病', '是否有过婚姻',
     '工作类型', '住宅类型', '血糖类别', 'BMI类别', '吸烟状况', '是否中风',
 ]
 data = df[selected_columns]
    
  
    
 # Correlation heatmap of the features.
 # Only numeric/boolean columns are passed to corr(): '是否有过婚姻' is listed
 # among the categorical columns and is never re-encoded in this script, so it
 # may still hold text, and DataFrame.corr() raises on non-numeric columns in
 # pandas >= 2.0 (older releases dropped them silently).
 corr = df.select_dtypes(include=['number', 'bool']).corr()

 plt.figure(figsize=(10, 8))
 sns.heatmap(corr, annot=True, cmap='coolwarm')
 plt.title('Correlation Heatmap')
 plt.show()
    
 # Feature matrix and target vector.
 # Columns deliberately left out of X: 'BMI', '是否有过婚姻', '血糖类别'.
 X = df[['性别', '年龄', '是否患有高血压', '是否患有心脏病',
         '工作类型', 'BMI类别', '吸烟状况', '血糖水平']]
 y = df['是否中风']

 # Pairwise interaction features (degree 2, no pure powers).
 # X_poly is only inspected here; the split below still uses X.
 from sklearn.preprocessing import PolynomialFeatures
 poly = PolynomialFeatures(degree=2, interaction_only=True)
 X_poly = poly.fit_transform(X)
 print(X_poly.shape)

 # Hold out 10% of the rows for testing. stratify=y keeps the class ratio of
 # the stroke label the same in both parts, so a small test set cannot end up
 # with too few positive cases for precision/recall to be meaningful.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.1, random_state=42, stratify=y)
    
 from sklearn.model_selection import train_test_split, GridSearchCV
    
 from sklearn.feature_selection import SelectKBest, f_classif
    
 from sklearn.pipeline import Pipeline
    
 # from lime import lime_tabular
    
 # import lime
    
  
    
 import matplotlib as mpl

 # Use the sans-serif family and put Microsoft YaHei first in its font list
 # for every figure drawn after this point.
 mpl.rcParams.update({
     'font.family': 'sans-serif',
     'font.sans-serif': ['Microsoft YaHei'],
 })
    
  
    
 # Baseline: logistic regression with default hyper-parameters.
 model = LogisticRegression()
 model.fit(X_train, y_train)

 # Predict the held-out rows and score the predictions.
 y_pred = model.predict(X_test)
 accuracy = accuracy_score(y_test, y_pred)
 precision = precision_score(y_test, y_pred)
 recall = recall_score(y_test, y_pred)
 f1 = f1_score(y_test, y_pred)

 # model.score() and accuracy_score() both report accuracy, so these two lines
 # are labelled 准确率 (accuracy) rather than 精确率 (precision).
 print('逻辑回归训练集准确率:%.3f' % model.score(X_train, y_train))
 print('逻辑回归测试集准确率:%.3f' % accuracy)
 # Report the remaining metrics as well; they were computed but never shown.
 print('逻辑回归测试集精确率:%.3f' % precision)
 print('逻辑回归测试集召回率:%.3f' % recall)
 print('逻辑回归测试集F1值:%.3f' % f1)
    
  
    
  
    
 # Candidate values for the regularisation parameter C, plus the accuracy
 # recorded for each of them on the training and the test split.
 C_values = [0.001, 0.01, 0.1, 1, 10, 100]
 train_acc_list, test_acc_list = [], []

 # Fit one model per C value. After the loop, `model` is the estimator that
 # was fitted with the last value in C_values.
 for C in C_values:
     model = LogisticRegression(C=C).fit(X_train, y_train)
     train_acc, test_acc = model.score(X_train, y_train), model.score(X_test, y_test)
     train_acc_list += [train_acc]
     test_acc_list += [test_acc]
    
  
    
 # Left panel: training and test accuracy against C on a logarithmic x axis.
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
 ax1.plot(C_values, train_acc_list, label='Train accuracy')
 ax1.plot(C_values, test_acc_list, label='Test accuracy')
 ax1.set_xscale('log')
 ax1.set_ylim([0.8, 1])
 ax1.set_xlabel('C')
 ax1.set_ylabel('Accuracy')
 ax1.legend(loc='best')

 # Right panel: one horizontal bar per feature with the coefficient of the
 # current `model` (the most recently fitted estimator).
 coef = model.coef_[0]
 # Use the column names unchanged as tick labels: running them through
 # str.title() lower-cases the tail of Latin words (for example 'BMI类别'
 # becomes 'Bmi类别'), and none of the names contain underscores to replace.
 feature_names = list(X.columns)
 y_pos = range(len(feature_names))
 ax2.barh(y_pos, coef)
 ax2.set_yticks(y_pos)
 ax2.set_yticklabels(feature_names)
 ax2.set_xlabel('Coefficient')

 plt.tight_layout()
 plt.show()
    
  
    
    
    
    
    代码解读

完整数据代码:<>

全部评论 (0)

还没有任何评论哟~