Advertisement

逻辑回归算法——乳腺癌检测

阅读量:

这个数据集总共从病灶造影图片中提取 以下 10 个关键属性。
• radius 半径 即病灶中心点离边界的平均距离
• texture 纹理,灰度值的标准偏差。
• perimeter 周长 即病灶的大小
• area 面积,也是反映病灶大小的一个指标。
• smoothness 平滑度 ,即半径的变化幅度。
• compactness:密实度,周长的平方除以面积的商 再减1
• concavity 凹度,凹陷部分轮廓的严重程度
• concave points 凹点 凹陷轮廓的数量。
• symmetry 对称性。
• fractal dimension 分形维度
对以上每个属性,数据集又分别计算了均值、标准误差和最大值三项统计量,因此实际共有 30 个特征。这类复合指标,是事物内在逻辑关系的体现。

复制代码
 # Load the breast cancer dataset that ships with scikit-learn.
 from sklearn.datasets import load_breast_cancer

 cancer = load_breast_cancer()
 X, y = cancer.data, cancer.target

 # Count the samples labelled 1 and the samples labelled 0 in the target.
 n_label_one = len(y[y == 1])
 n_label_zero = len(y[y == 0])
 print('data shape:{0};no. posttive:{1};no. negative:{2}'.format(
     X.shape, n_label_one, n_label_zero))
 # Output recorded by the author:
 # data shape:(569, 30);no. posttive:357;no. negative:212
    
    
    
    
复制代码
 from sklearn.model_selection import train_test_split

    
 from sklearn.linear_model import LogisticRegression
    
 import numpy as np
    
  
    
 # Hold out 20% of the samples for testing. No random_state is passed, so the
 # split (and therefore the scores below) differs from run to run.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 model = LogisticRegression()
 model.fit(X_train, y_train)

 # For a classifier, score() reports the mean accuracy on the given data
 # (see the scikit-learn documentation for LogisticRegression.score).
 train_score = model.score(X_train, y_train)
 test_score = model.score(X_test, y_test)
 print('train score:{0:.6f};test score:{1:.6f}'.format(train_score, test_score))

 # Check how many test predictions are correct.
 y_pred = model.predict(X_test)
 # Sum the element-wise comparison to count the correct predictions.
 # Taking .shape[0] of the comparison array would only give the number of
 # test samples and would always look like a perfect match.
 n_matches = np.equal(y_pred, y_test).sum()
 print('matches:{0}/{1}'.format(n_matches, y_test.shape[0]))
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/ACgRG5i47hbjIslDMqtmZuWVXa03.png)

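注意:统计预测正确的样本数应该用 `np.equal(y_pred,y_test).sum()`(或 `np.sum(y_pred==y_test)`);如果写成 `np.equal(y_pred,y_test).shape[0]`,得到的只是测试样本的总数,输出看起来就会像“全部预测正确”。对分类模型来说,sklearn 的 `model.score` 返回的是平均准确率(预测正确的样本数除以样本总数),并不是用预测概率计算出来的,所以测试评分小于 1 说明确实有样本预测错误。针对二元分类问题, Logistic Regression 模型会针对每个样本输出两个概率,即为0的概率和为1的概率, 哪个概率高就预测为哪个类别。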

复制代码
 # Prediction confidence of the model:
 # find the test predictions whose confidence is below 90%.
 y_pred_proba = model.predict_proba(X_test)

 # First keep the rows whose first-column probability exceeds 0.1 ...
 first_col_above = y_pred_proba[:, 0] > 0.1
 result = y_pred_proba[first_col_above]

 # ... then, among those, the rows whose second-column probability also
 # exceeds 0.1. The bare expression is what a notebook cell would display.
 second_col_above = result[:, 1] > 0.1
 result[second_col_above]
    
    
    
    

模型优化

复制代码
 #模型优化

    
 #用多项式特征
    
 from sklearn.linear_model import LogisticRegression
    
 from sklearn.preprocessing import PolynomialFeatures
    
 from sklearn.pipeline import Pipeline
    
 import time
    
  
    
 # By convention:
 # *args collects a variable number of positional arguments;
 # **kwarg collects a variable number of keyword arguments (as a dict).
 def polynomial_model(degree=1, **kwarg):
     """Build a pipeline of polynomial feature expansion + logistic regression.

     degree: polynomial degree passed to PolynomialFeatures.
     **kwarg: keyword arguments forwarded unchanged to LogisticRegression.

     Returns the (unfitted) Pipeline with the steps named
     'polynomial_features' and 'logistic_reg'.
     """
     # include_bias defaults to True, which adds the degree-0 (constant)
     # column; it is switched off here.
     steps = [
         ('polynomial_features',
          PolynomialFeatures(degree=degree, include_bias=False)),
         ('logistic_reg', LogisticRegression(**kwarg)),
     ]
     return Pipeline(steps)
    
  
    
 # Degree-2 polynomial features with an L1-penalised logistic regression.
 # The author's note: choosing the L1 norm constrains `dual` and `solver`
 # (dual must be False, and liblinear is used as the solver here).
 # With the L1 norm as the regulariser the parameters become sparse, i.e. the
 # model automatically picks out the features that actually matter.
 model = polynomial_model(degree=2, penalty='l1', solver='liblinear')

 start = time.perf_counter()
 model.fit(X_train, y_train)
 poly_train_score = model.score(X_train, y_train)
 poly_test_score = model.score(X_test, y_test)
 # The reported time covers fitting plus both scoring calls.
 elapsed = time.perf_counter() - start
 print('elaspe:{0:.6f};poly_train_score:{1:.6f};poly_test_score:{2:.6f}'.format(
     elapsed, poly_train_score, poly_test_score))
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/xS8VERmDCtzrWLI7yFYhNg64J2Up.png)
复制代码
 # Check how many features were kept, i.e. whose parameter theta is non-zero.
 # named_steps looks up a pipeline step by its name; coef_ holds the fitted
 # model parameters of that step.
 logistic_step = model.named_steps['logistic_reg']
 coefs = logistic_step.coef_
 nonzero_count = np.count_nonzero(coefs)
 print("总特征数:{0};非0特征数:{1}".format(coefs.shape, nonzero_count))
 # Author's note: the input grew from the original 30 features to 495, and
 # most of them were discarded -- only 86 effective features remained in the
 # author's run.
    
    
    
    

画学习曲线,找出使算法准确性最高的参数值

复制代码
 import numpy as np

    
 import matplotlib.pyplot as plt
    
 from sklearn.model_selection import learning_curve
    
  
    
 plt.rcParams['font.sans-serif'] = ['SimHei']# display Chinese text in plots (presumably requires the SimHei font to be installed)
    
  
    
 def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                         n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
     """Plot training and cross-validation score curves for an estimator.

     estimator: the model handed to sklearn's learning_curve.
     title: title of the figure.
     X, y: feature matrix and target vector.
     ylim: optional (low, high) pair for the y-axis limits; left alone if None.
     cv, n_jobs, train_sizes: forwarded unchanged to learning_curve.

     Returns the matplotlib.pyplot module so the caller can show or save
     the figure.
     """
     plt.figure()
     plt.title(title)
     if ylim is not None:
         plt.ylim(*ylim)
     plt.xlabel("Training examples")
     plt.ylabel("Score")

     train_sizes, train_scores, test_scores = learning_curve(
         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

     # Average (and spread) across the CV splits for each training size.
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)

     plt.grid()  # draw the grid

     # Shaded bands: mean +/- one standard deviation.
     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1, color="g")

     plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
     plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")

     plt.legend(loc="best")  # add the legend
     return plt
    
  
    
 from sklearn.model_selection import ShuffleSplit
    
 import time
    
  
    
 # Ten shuffled splits with 20% of the samples held out in each; random_state
 # is fixed at 0.
 cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

 # One learning curve per (title, degree, penalty, solver) combination.
 curve_settings = [
     ("Learning Curves (degree=1 , penalty=l1)", 1, 'l1', 'liblinear'),
     ("Learning Curves (degree=2 , penalty=l1)", 2, 'l1', 'liblinear'),
     ("Learning Curves (degree=1 , penalty=l2)", 1, 'l2', 'lbfgs'),
     ("Learning Curves (degree=2 , penalty=l2)", 2, 'l2', 'lbfgs'),
 ]
 for curve_title, curve_degree, curve_penalty, curve_solver in curve_settings:
     plot_learning_curve(
         polynomial_model(degree=curve_degree, penalty=curve_penalty,
                          solver=curve_solver),
         curve_title, X, y, ylim=(0.8, 1.01), cv=cv)
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/KJ079GLQqN6syAhOcRv8oZkCljDE.png)


针对大数据集 怎样高效的画学习曲线?

答案很简单, 我们可以从大数据集里选择部分数据来画学习曲线, 选择好最优的模型之后, 再使用全部的数据集来训练模型。有个地方需要注意,我们要尽量保持选择出来的这部分数据的标签分布与大数据集的标签分布相同,可以考虑分层抽样。

包有:

1、from sklearn.model_selection import StratifiedShuffleSplit

2、使用sklearn.model_selection.train_test_split,参数stratify即用来指定按照某一特征进行分层抽样,生成训练集和测试集。

全部评论 (0)

还没有任何评论哟~