
Breast Cancer Patient Classification


This post uses logistic regression to classify cases in the Breast Cancer Wisconsin (Diagnostic) Database as malignant or benign. The workflow consists of data preprocessing, feature selection, feature comparison, modeling, and evaluation.

Reading the Dataset

    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    import os
    os.chdir('/Users/zhaohaibo/Desktop')
    # Load the dataset
    data = load_breast_cancer()
    X = data.data
    y = data.target
    # Save the feature matrix to an Excel file
    df = pd.DataFrame(data.data)
    df.columns = data.feature_names
    writer = pd.ExcelWriter('output.xlsx')
    df.to_excel(writer,'Sheet1')
    writer.save()
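
A note on the Excel export: ExcelWriter.save() was deprecated and has been dropped in recent pandas releases, so on a newer pandas the context-manager form is safer. A minimal sketch, assuming the openpyxl engine is installed:

    # The context manager saves and closes the workbook automatically (no writer.save() needed).
    with pd.ExcelWriter('output.xlsx') as writer:
        df.to_excel(writer, sheet_name='Sheet1')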

Re-encoding the Labels

  • The dataset already ships with numerically encoded labels, so this step can be skipped.
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

Feature Selection

  • Feature selection uses RFECV; the cross-validation score peaks when the top three ranked features are kept.
  • Ranking the features puts worst radius, worst texture, and worst concave points at the top, so only these three features are retained.
    import matplotlib.pyplot as plt
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.tree import DecisionTreeClassifier

    # Recursive feature elimination with cross-validation, using a decision tree as the estimator
    dt = DecisionTreeClassifier()
    rfecv = RFECV(estimator=dt, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    print("Feature ranking (1 = selected): %s" % rfecv.ranking_)
    print("Selected feature names: %s" % data.feature_names[rfecv.ranking_ == 1])

    # Cross-validation score as a function of the number of selected features
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.savefig("feature.jpg")
    plt.show()

    # Keep only the columns ranked 1 by RFECV
    feature_list = []
    for i in range(len(rfecv.ranking_)):
        if rfecv.ranking_[i] == 1:
            feature_list.append(i)
    X = X[:, feature_list]
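
A compatibility note on the plot above: rfecv.grid_scores_ was deprecated and later removed in newer scikit-learn releases (around 1.0–1.2); there, the per-step mean scores live in cv_results_ instead. A hedged sketch of the equivalent plot:

    # cv_results_['mean_test_score'] replaces the removed grid_scores_ attribute.
    mean_scores = rfecv.cv_results_['mean_test_score']
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Mean cross-validation accuracy")
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()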
    
[Figure: cross-validation score versus number of features selected (feature.jpg)]

Feature Analysis

  • Visualize the selected features pairwise with seaborn.pairplot().
    XX = pd.DataFrame(X, columns=data.feature_names[feature_list])
    yy = []
    for i in range(len(y)):
        if y[i] == 0:
            yy.append("malignant")
        else:
            yy.append("benign")

    XX['label'] = yy

    import seaborn as sns
    sns.pairplot(XX, vars=["worst radius", "worst texture", "worst concave points"],
                 hue="label", palette="husl",
                 markers=["o", "x"],
                 diag_kind="kde")
    plt.savefig("duibi.jpg")
    
[Figure: seaborn pairplot of worst radius, worst texture, and worst concave points, colored by label (duibi.jpg)]

Splitting the Dataset

  • Use sklearn.model_selection.train_test_split.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=666, shuffle=False)  # shuffle defaults to True
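
Note that with shuffle=False the split is just a head/tail cut, and random_state has no effect. A hedged variant (not the settings used above) that shuffles and keeps the malignant/benign ratio similar in both halves:

    # Hypothetical alternative split: shuffled and stratified by class label.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=666, stratify=y)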

Data Normalization

  • Min-max normalization is used.
    from sklearn import preprocessing
    min_max_scaler = preprocessing.MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    # Only transform the test set with the scaler fitted on the training set,
    # so no test-set information leaks into the scaling parameters.
    X_test = min_max_scaler.transform(X_test)
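
As a side note, scaling and the classifier can also be bundled in a Pipeline, which guarantees the scaler is fitted on the training portion only. This is a minimal sketch, not part of the original workflow; it expects the raw, unscaled split from train_test_split (i.e., run it in place of, not after, the two scaling lines above):

    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression

    # MinMaxScaler is fitted on the training data only; test data is scaled with those parameters.
    pipe = Pipeline([
        ('scale', preprocessing.MinMaxScaler()),
        ('lr', LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(X_train, y_train)
    print("Pipeline test accuracy:", pipe.score(X_test, y_test))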

Grid Search for Hyperparameters (the original run had unresolved problems; likely fixes are commented in the code below)

  • (the model is logistic regression)
    # The original grid failed to run; the most likely causes are fixed below:
    # 'c' must be capitalized as 'C', 'balenced' should be 'balanced' (with None, not the
    # string 'None', as the other option), and 'liblinear' is dropped because it does not
    # support multi_class='multinomial'.
    param_grid = [
        {
            'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
            'multi_class': ['multinomial', 'ovr'],
            'C': [0.01, 0.1, 1, 10],
            'class_weight': ['balanced', None],
            'solver': ['sag', 'saga', 'newton-cg'],
            'max_iter': [10, 100, 1000, 10000, 100000]
        }
    ]
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    grid_search = GridSearchCV(LogisticRegression(), param_grid)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
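
If the search runs to completion, GridSearchCV refits the best combination on the whole training set by default (refit=True), so the tuned model can be evaluated directly; a small sketch:

    # best_params_ holds the winning combination; best_estimator_ is already refit on the training set.
    print(grid_search.best_params_)
    print("Tuned test accuracy:", grid_search.best_estimator_.score(X_test, y_test))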

Modeling with Logistic Regression

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(penalty='l2',dual=False, tol=0.001, 
                         C=1.0, fit_intercept=True, intercept_scaling=1, 
                         class_weight=None, random_state=0, solver='sag', 
                         max_iter=100, multi_class='multinomial', verbose=1, 
                         warm_start=False, n_jobs=-1)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    score = clf.score(X_test, y_test)
    print("The accuracy of Logistic Regression classifier:",score)
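
With only three features left, the fitted model is easy to interpret by inspecting its coefficients. A short sketch (it assumes feature_list from the feature-selection step is still in scope):

    # Each coefficient is the change in log-odds per unit of the min-max scaled feature.
    for name, coef in zip(data.feature_names[feature_list], clf.coef_[0]):
        print("%-22s %+.3f" % (name, coef))
    print("intercept: %+.3f" % clf.intercept_[0])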

Plotting the Confusion Matrix

    def plot_confusion_matrix(cm, classes, normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        Normalization can be applied by setting `normalize = True`.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print("Confusion matrix, without normalization")
        print(cm)

        thresh = cm.max() / 2
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.savefig("matrix.jpg")

    from sklearn.metrics import confusion_matrix
    import itertools
    import matplotlib.pyplot as plt

    prediction = clf.predict(X_test)
    cm = confusion_matrix(y_test, prediction)
    cm_plot_labels = ['malignant', 'benign']
    plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')
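
Newer scikit-learn releases (roughly 1.0 and later) can draw the same plot without a hand-written helper; a hedged sketch:

    from sklearn.metrics import ConfusionMatrixDisplay

    # Builds and draws the confusion matrix directly from true labels and predictions.
    ConfusionMatrixDisplay.from_predictions(y_test, prediction,
                                            display_labels=['malignant', 'benign'],
                                            cmap=plt.cm.Blues)
    plt.savefig("matrix_sklearn.jpg")  # hypothetical output file name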
    
[Figure: confusion matrix for the test set (matrix.jpg)]

Model Evaluation

    # accuracy_score
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    print("The accuracy of Logistic Regression classifier:",accuracy)
    #precision & recall & f1-score
    from sklearn.metrics import classification_report
    print(classification_report(y_true=y_test, y_pred=prediction))
    
                 Precision    Recall    F1-score    Support
    Healthy           0.93      0.98        0.95         43
    Cancer            0.99      0.98        0.99        145
    Avg/total         0.98      0.98        0.98        188
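
classification_report can also print explicit class names and return its numbers as a dictionary; a small sketch (target_names follows the dataset's own ordering, where label 0 is malignant and label 1 is benign):

    # target_names maps label 0 -> 'malignant' and 1 -> 'benign' in the printed report.
    print(classification_report(y_test, prediction,
                                target_names=['malignant', 'benign']))
    report = classification_report(y_test, prediction,
                                   target_names=['malignant', 'benign'],
                                   output_dict=True)
    print(report['malignant']['recall'])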

Plotting the ROC Curve
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, prediction)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
    plt.xlabel("FPR (False Positive Rate)")
    plt.ylabel("TPR (True Positive Rate)")
    plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)"% (roc_auc))
    plt.savefig("ROC.jpg")
    plt.show()
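
The curve above is built from hard 0/1 predictions, so roc_curve only sees one operating point and the plot is a single corner. Feeding it the predicted probability of the positive class (label 1, benign) traces the full curve; a small sketch:

    # Use the model's probability for class 1 instead of the hard predictions.
    y_score = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    print("AUC from predicted probabilities: %.3f" % auc(fpr, tpr))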
    
[Figure: ROC curve for the logistic regression classifier (ROC.jpg)]

Full code:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Thu Aug  9 23:39:32 2018

    @author: zhaohaibo
    """
    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.preprocessing import LabelEncoder
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    import itertools
    from sklearn import preprocessing
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_curve, auc
    import os
    os.chdir('/Users/zhaohaibo/Desktop')


    class Cancer(object):

        def Encode(self, y):
            # The labels are already 0/1, so this is a no-op kept for completeness.
            encoder = LabelEncoder()
            y = encoder.fit_transform(y)
            return y

        def RFECV(self, X, y):
            # Recursive feature elimination with cross-validation on a decision tree.
            dt = DecisionTreeClassifier()
            rfecv = RFECV(estimator=dt, step=1, cv=StratifiedKFold(2), scoring='accuracy')
            rfecv.fit(X, y)
            print("Optimal number of features : %d" % rfecv.n_features_)
            print("Feature ranking (1 = selected): %s" % rfecv.ranking_)
            plt.figure()
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of correct classifications)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            plt.savefig("feature.jpg")
            plt.show()
            feature_list = []
            for i in range(len(rfecv.ranking_)):
                if rfecv.ranking_[i] == 1:
                    feature_list.append(i)
            X = X[:, feature_list]
            print("Selected feature names: %s" % data.feature_names[feature_list])
            return X, feature_list

        def Seaborn(self, X, y, feature_list):
            XX = pd.DataFrame(X, columns=data.feature_names[feature_list])
            yy = []
            for i in range(len(y)):
                if y[i] == 0:
                    yy.append("malignant")
                else:
                    yy.append("benign")

            XX['label'] = yy

            import seaborn as sns
            sns.pairplot(XX, vars=["worst radius", "worst texture", "worst concave points"],
                         hue="label", palette="husl",
                         markers=["o", "x"],
                         diag_kind="kde")
            plt.savefig("duibi.jpg")

        def Train_Test_Split(self, X, y):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=666, shuffle=False)  # shuffle defaults to True
            return X_train, X_test, y_train, y_test

        def Scaler(self, X_train, X_test):
            min_max_scaler = preprocessing.MinMaxScaler()
            X_train = min_max_scaler.fit_transform(X_train)
            # Transform (not fit) the test set with the training-set scaler.
            X_test = min_max_scaler.transform(X_test)
            return X_train, X_test

        def GridSearch(self, X_train, y_train):
            # 'C' capitalized, 'balanced'/None spelled correctly, and 'liblinear' dropped
            # because it does not support multi_class='multinomial'.
            param_grid = [
                {
                    'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
                    'multi_class': ['multinomial', 'ovr'],
                    'C': [0.01, 0.1, 1, 10],
                    'class_weight': ['balanced', None],
                    'solver': ['sag', 'saga', 'newton-cg'],
                    'max_iter': [10, 100, 1000, 10000, 100000]
                }
            ]
            grid_search = GridSearchCV(LogisticRegression(), param_grid)
            grid_search.fit(X_train, y_train)
            print(grid_search.best_estimator_)
            print(grid_search.best_score_)

        def Model(self, X_train, y_train, X_test, y_test):
            clf = LogisticRegression(penalty='l2', dual=False, tol=0.001,
                                     C=1.0, fit_intercept=True, intercept_scaling=1,
                                     class_weight=None, random_state=0, solver='sag',
                                     max_iter=100, multi_class='multinomial', verbose=1,
                                     warm_start=False, n_jobs=-1)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            print("The accuracy of Logistic Regression classifier:", score)
            return clf

        def plot_confusion_matrix(self, cm, classes, normalize=False,
                                  title='Confusion matrix',
                                  cmap=plt.cm.Blues):
            """
            Normalization can be applied by setting `normalize = True`.
            """
            plt.figure()
            plt.imshow(cm, interpolation='nearest', cmap=cmap)
            plt.title(title)
            plt.colorbar()
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            if normalize:
                cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
                print("Normalized confusion matrix")
            else:
                print("Confusion matrix, without normalization")
            print(cm)

            thresh = cm.max() / 2
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                plt.text(j, i, cm[i, j],
                         horizontalalignment="center",
                         color="white" if cm[i, j] > thresh else "black")

            plt.tight_layout()
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.savefig("matrix.jpg")

        def Matrix(self, y_test, prediction):
            cm = confusion_matrix(y_test, prediction)
            cm_plot_labels = ['malignant', 'benign']
            self.plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

        def Metrics(self, y_test, prediction):
            accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
            print("The accuracy of Logistic Regression classifier:", accuracy)
            # precision & recall & f1-score
            print(classification_report(y_true=y_test, y_pred=prediction))
            # plot the ROC curve
            fpr, tpr, thresholds = roc_curve(y_test, prediction)
            roc_auc = auc(fpr, tpr)
            plt.figure()
            plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
            plt.xlabel("FPR (False Positive Rate)")
            plt.ylabel("TPR (True Positive Rate)")
            plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)" % (roc_auc))
            plt.savefig("ROC.jpg")
            plt.show()

        def SaveExl(self, data):
            # Save the raw feature matrix to an Excel file.
            df = pd.DataFrame(data.data)
            df.columns = data.feature_names
            writer = pd.ExcelWriter('output.xlsx')
            df.to_excel(writer, 'Sheet1')
            writer.save()

        def main(self, X, y):
            y = self.Encode(y)
            X, feature_list = self.RFECV(X, y)
            self.Seaborn(X, y, feature_list)
            X_train, X_test, y_train, y_test = self.Train_Test_Split(X, y)
            X_train, X_test = self.Scaler(X_train, X_test)
            # self.GridSearch(X_train, y_train)
            clf = self.Model(X_train, y_train, X_test, y_test)
            prediction = clf.predict(X_test)
            self.Matrix(y_test, prediction)
            self.Metrics(y_test, prediction)
            # self.SaveExl(data)


    if __name__ == '__main__':
        data = load_breast_cancer()
        X = data.data
        y = data.target
        cancer = Cancer()
        cancer.main(X, y)