基于 Python 机器学习的中风患者疾病分析预测(完整数据与代码,可直接运行)
发布时间
阅读量:
阅读量
数据分析结果展示:



完整代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif']='simhei'
plt.rcParams['axes.unicode_minus']=False
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the stroke dataset; the CSV is read with the GB2312 encoding.
df = pd.read_csv('brain_stroke.csv', encoding='GB2312')
print(df.head(5))
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Annotate every bar with its percentage of the total row count.
def annot_plot(ax, data):
    """Hide the top/right spines of *ax* and label each bar with its row share.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate (for example the
            axes returned by ``sns.countplot``).
        data: Object whose ``shape[0]`` is the total number of rows; every bar
            height is expressed as a percentage of that total.

    Returns:
        None. The axes is modified in place.
    """
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    total = data.shape[0]
    if total == 0:
        # No rows to compare against; skip labelling instead of dividing by zero.
        return

    for p in ax.patches:
        height = p.get_height()
        if np.isnan(height):
            # A bar without a height has nothing meaningful to label.
            continue
        ax.annotate(
            f"{height * 100 / total:.2f}%",
            (p.get_x() + p.get_width() / 2, height),
            ha="center",
            va="center",
            fontsize=10,
            color="black",
            rotation=0,
            xytext=(0, 10),
            textcoords="offset points",
        )
# Columns drawn as grouped count plots (categorical) and as distributions (numeric).
object_cols = ['性别', '是否患有高血压', '是否患有心脏病', '是否有过婚姻', '工作类型', '住宅类型', '吸烟状况']
num_cols = ['年龄', '血糖水平', 'BMI']

# Only a bare figure is created here: each panel is added by plt.subplot below,
# so no full-figure axes is left behind underneath the 4x3 grid.
fig = plt.figure(figsize=(15, 24))
idx = 0

# Categorical features: counts split by stroke outcome, each bar labelled with
# its percentage of all rows.
for i in object_cols:
    ax = plt.subplot(4, 3, idx + 1)
    sns.countplot(x=i, data=df, hue='是否中风', ax=ax)
    annot_plot(ax, df)
    plt.xlabel(None)
    plt.ylabel("中风发病率")
    plt.title(i, fontsize=12)
    plt.legend()
    idx += 1

# Numeric features: histogram with a KDE overlay on a density scale
# (histplot is the replacement for the deprecated distplot).
for i in num_cols:
    ax = plt.subplot(4, 3, idx + 1)
    sns.histplot(df[i], kde=True, stat="density", ax=ax)
    # Style the panel that was just drawn. Percentage labels are not added
    # here because the bar heights are densities, not row counts.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    plt.ylabel(None)
    plt.title(i)
    plt.xlabel(None)
    idx += 1

plt.show()
# Convert the text categories into integer codes.
# Gender and residence type share one binary lookup table.
map_dict1 = {"男性": 1, "女性": 0, "城市": 1, "农村": 0}
for column in ('性别', '住宅类型'):
    df[column] = df[column].map(map_dict1)
print(df.head(5))

# Distinct categories of work type and smoking status, kept for inspection.
a = df['工作类型'].unique()
b = df['吸烟状况'].unique()

# Work type and smoking status share a second lookup table.
map_dict2 = {"不详": 0, "从不吸烟": 1, "以前吸烟": 2, "吸烟": 3, "儿童": 0, "自雇人士": 1, "私人企业": 2, "政府部门": 3}
for column in ('工作类型', '吸烟状况'):
    df[column] = df[column].map(map_dict2)
print(df.head(5))
# Bucket BMI into four classes: <18.5 -> 0, [18.5, 24) -> 1, [24, 28) -> 2,
# everything else -> 3. np.select takes the first matching condition, so the
# lower bounds are implied by the order of the tests.
bmi_values = df['BMI']
bmi_category = np.select(
    [bmi_values < 18.5, bmi_values < 24, bmi_values < 28],
    [0, 1, 2],
    default=3,
)
# Store the BMI class as a new column.
df['BMI类别'] = bmi_category

# Bucket the blood glucose level: <70 -> 0, >100 -> 2, everything else -> 1.
glucose_values = df['血糖水平']
blood_sugar = np.select(
    [glucose_values < 70, glucose_values > 100],
    [0, 2],
    default=1,
)
df['血糖类别'] = blood_sugar

# Subset holding the encoded features together with the stroke label.
selected_columns = [
    '性别', '年龄', '是否患有高血压', '是否患有心脏病', '是否有过婚姻',
    '工作类型', '住宅类型', '血糖类别', 'BMI类别', '吸烟状况', '是否中风',
]
data = df[selected_columns]
# Visualise the pairwise correlation between features.
# Only numeric/boolean columns are correlated: since pandas 2.0,
# DataFrame.corr() raises on text columns instead of silently dropping them,
# and not every column of df has been encoded as numbers at this point.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr = numeric_df.corr()

# Draw the correlation matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# Separate the model inputs from the target. BMI, marital status and the
# blood-sugar class are not part of the feature set.
feature_columns = [
    '性别', '年龄', '是否患有高血压', '是否患有心脏病',
    '工作类型', 'BMI类别', '吸烟状况', '血糖水平',
]
X = df[feature_columns]
y = df['是否中风']

# Build pairwise interaction features (degree 2, interactions only).
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)
# Show the size of the expanded matrix.
# NOTE: the split below is made on X, not on X_poly.
print(X_poly.shape)

# Hold out 10% of the rows as the test set, with a fixed seed.
splits = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = splits
# Additional scikit-learn imports; GridSearchCV, SelectKBest, f_classif and
# Pipeline are not referenced in the visible part of the script.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
# from lime import lime_tabular
# import lime
import matplotlib as mpl
# Use the sans-serif family with Microsoft YaHei as its font; this overrides
# the 'simhei' value assigned to font.sans-serif near the top of the script.
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Baseline: logistic regression with default hyper-parameters.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate on the test set. zero_division=0 makes precision/recall/F1 report 0
# instead of being undefined when the model predicts no positive case at all.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# model.score() and accuracy_score() both return accuracy, so those two lines
# are labelled as accuracy; the remaining computed metrics are reported too.
print('逻辑回归训练集准确率:%.3f' % model.score(X_train, y_train))
print('逻辑回归测试集准确率:%.3f' % accuracy)
print('逻辑回归测试集精确率:%.3f' % precision)
print('逻辑回归测试集召回率:%.3f' % recall)
print('逻辑回归测试集F1值:%.3f' % f1)
# Candidate values for the inverse regularisation strength C.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Accuracy on the training and test sets, one entry per value of C.
train_acc_list, test_acc_list = [], []

# Fit one model per C and record both accuracies. `model` is rebound on every
# pass, so after the loop it refers to the model fitted with the last C.
for C in C_values:
    model = LogisticRegression(C=C).fit(X_train, y_train)
    train_acc_list.append(model.score(X_train, y_train))
    test_acc_list.append(model.score(X_test, y_test))
# Left panel: train/test accuracy as a function of C (log-scaled x axis).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(C_values, train_acc_list, label='Train accuracy')
ax1.plot(C_values, test_acc_list, label='Test accuracy')
ax1.set_xscale('log')
ax1.set_ylim([0.8, 1])
ax1.set_xlabel('C')
ax1.set_ylabel('Accuracy')
ax1.set_title('Accuracy vs. C')
ax1.legend(loc='best')

# Right panel: coefficient of each feature for the model currently bound to
# `model`; its C value is shown in the panel title.
coef = model.coef_[0]
# Underscores become spaces, but the names are not title-cased: str.title()
# would rewrite 'BMI类别' as 'Bmi类别'.
feature_names = [name.replace('_', ' ') for name in X.columns]
y_pos = range(len(feature_names))
ax2.barh(y_pos, coef)
ax2.set_yticks(y_pos)
ax2.set_yticklabels(feature_names)
ax2.set_xlabel('Coefficient')
ax2.set_title(f'Coefficients (C={model.C})')

plt.tight_layout()
plt.show()
代码解读
完整数据代码:<>
全部评论 (0)
还没有任何评论哟~
