
Support Vector Machines (SVM): Predicting Breast Cancer


The SVC class is used for classification tasks, while the SVR class is used for numeric regression tasks.
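
A minimal sketch of the difference, with toy arrays invented purely for illustration:

    from sklearn.svm import SVC, SVR

    X = [[0, 0], [1, 1], [2, 2], [3, 3]]   # toy data, invented for illustration
    SVC().fit(X, [0, 0, 1, 1])             # classification: discrete class labels
    SVR().fit(X, [0.0, 1.1, 1.9, 3.2])     # regression: continuous target values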

The kernel function is selected with the kernel parameter; constructor calls for all three are sketched below:

- Linear kernel: specify C, the penalty applied to samples that violate the maximum-margin rule
- Polynomial kernel: specify C and degree (the order of the polynomial)
- Gaussian (RBF) kernel: specify C and gamma
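
A minimal sketch of the corresponding constructor calls (the parameter values are arbitrary examples):

    from sklearn import svm

    svm.SVC(kernel='linear', C=1.0)          # C: penalty for margin violations
    svm.SVC(kernel='poly', C=1.0, degree=3)  # degree: order of the polynomial
    svm.SVC(kernel='rbf', C=1.0, gamma=0.5)  # gamma: width of the Gaussian; larger gamma, narrower peak

The helper below visualizes the decision regions such models produce.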

    # Plot the separating hyperplane
    import numpy as np
    import matplotlib.pyplot as plt

    def plot_hyperplane(clf, X, y, h=0.02, draw_sv=True, title='hyperplane'):
        # Create a mesh to plot in. meshgrid() turns the x and y ranges into a
        # grid of points; the contour plot adds a "height" (the predicted class)
        # on top of this grid.
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # np.arange() accepts one, two, or three arguments:
        # 1) one argument: the stop value; start defaults to 0, step to 1
        # 2) two arguments: start and stop; step defaults to 1
        # 3) three arguments: start, stop, and step; the step may be fractional

        plt.title(title)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())  # hide x-axis ticks
        plt.yticks(())  # hide y-axis ticks

        # Predict the class of every grid point to trace the decision boundary.
        # ravel() flattens a multi-dimensional array into 1-D.
        # np.r_ stacks two arrays vertically (by rows); column counts must match.
        # np.c_ stacks two arrays horizontally (by columns); row counts must match.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot. contour() draws only the contour
        # lines; contourf() also fills the regions between them.
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap='hot', alpha=0.5)  # filled contour regions

        markers = ['o', 's', '^']
        colors = ['b', 'r', 'c']
        labels = np.unique(y)
        for label in labels:
            plt.scatter(X[y == label][:, 0], X[y == label][:, 1],
                        c=colors[label], marker=markers[label])
        # Mark the support vectors
        if draw_sv:
            sv = clf.support_vectors_  # the support vectors found during fitting
            plt.scatter(sv[:, 0], sv[:, 1], c='y', marker='x')
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/GqPlZQhYgSH7VWdez4uCmib5noLf.png)
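
The mesh construction above is easier to follow with the array shapes spelled out; a tiny sketch (values invented for illustration):

    import numpy as np

    xx, yy = np.meshgrid(np.arange(0, 3), np.arange(0, 2))
    print(xx.shape)                       # (2, 3): one row of x values per y value
    grid = np.c_[xx.ravel(), yy.ravel()]  # flatten both grids, pair them column-wise
    print(grid.shape)                     # (6, 2): one (x, y) point per grid cell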

First, try the linear kernel:

    from sklearn import svm
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK text

    # Generate a toy dataset:
    # n_samples: number of samples
    # n_features: number of features per sample (defaults to 2)
    # centers: number of cluster centers, i.e. the number of label classes
    # random_state: random seed, fixes the generated data
    # cluster_std: standard deviation of each cluster
    X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=0.3)

    clf = svm.SVC(C=1.0, kernel='linear')
    clf.fit(X, y)
    plt.figure(figsize=(12, 4), dpi=144)
    plot_hyperplane(clf, X, y, h=0.1, title='Separating hyperplane')
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/TixkwKuysRV3vgGYonUt510WXQ9d.png)
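
The fitted model can also be inspected numerically; these are standard sklearn SVC attributes (coef_ is only defined for the linear kernel):

    print(clf.n_support_)              # number of support vectors per class
    print(clf.support_vectors_.shape)  # the support vectors themselves
    print(clf.coef_, clf.intercept_)   # w and b of the decision function w.x + b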

Compare the linear, polynomial, and Gaussian (RBF) kernels:

    from sklearn import svm
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt

    X, y = make_blobs(n_samples=100, centers=3, random_state=0, cluster_std=0.8)
    clf_linear = svm.SVC(C=1.0, kernel='linear')
    clf_poly = svm.SVC(C=1.0, kernel='poly', degree=3)
    clf_rbf = svm.SVC(C=1.0, kernel='rbf', gamma=0.5)
    clf_rbf2 = svm.SVC(C=1.0, kernel='rbf', gamma=0.1)

    plt.figure(figsize=(10, 10), dpi=144)

    clfs = [clf_linear, clf_poly, clf_rbf, clf_rbf2]
    titles = ['Linear Kernel', 'Polynomial Kernel with Degree=3',
              'Gaussian Kernel with gamma=0.5', 'Gaussian Kernel with gamma=0.1']

    for i, clf in enumerate(clfs):
        clf.fit(X, y)
        plt.subplot(2, 2, i + 1)
        plot_hyperplane(clf, X, y, h=0.1, title=titles[i])
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/NAiwcDjhgSvlxEoGnCke54dXZ6Im.png)

The points marked with an x are the support vectors. Why are points far from the separating hyperplane support vectors too, and not only the points closest to it?

Because the Gaussian kernel implicitly maps the input feature vectors into an infinite-dimensional space. A point that looks far from the boundary in the original plane can still lie on the margin in that high-dimensional space, so it is a support vector there.
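
You can check this by counting how many support vectors each fitted model kept, reusing the clfs and titles lists from the comparison above:

    for clf, title in zip(clfs, titles):
        print(title, ':', clf.n_support_.sum(), 'support vectors')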

The main task: predicting breast cancer

    from sklearn import svm
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target
    print("data shape:{0},no. positive:{1},no. negative:{2}".format(
        X.shape, y[y == 1].shape, y[y == 0].shape))

    # Hold out 20% of the samples as the test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/nAhtlsvo70wJdNEBDHuYOrk6IGac.png)
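
To see what the dataset actually contains, the loaded Bunch object exposes the feature and class names:

    print(cancer.feature_names)  # the 30 real-valued measurements of cell nuclei
    print(cancer.target_names)   # ['malignant' 'benign']; target 1 means benign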

Hypothesis: the dataset is small, so the Gaussian kernel probably won't perform well. Verify as follows:

    from sklearn.svm import SVC

    clf_rbf_cancer = SVC(C=1.0, kernel='rbf', gamma=0.1)
    clf_rbf_cancer.fit(X_train, y_train)
    # Note: score the newly fitted model, not the clf_rbf trained on the blob data
    rbf_cancer_train_score = clf_rbf_cancer.score(X_train, y_train)
    rbf_cancer_test_score = clf_rbf_cancer.score(X_test, y_test)
    print('rbf : train_score:{0},test_score:{1}'.format(rbf_cancer_train_score, rbf_cancer_test_score))
    
    
    
    

Grid search for the optimal parameter:

    from sklearn.model_selection import GridSearchCV
    import numpy as np

    gammas = np.linspace(0, 0.0003, 30)
    param_grid = {'gamma': gammas}
    # return_train_score=True keeps the training scores in cv_results_,
    # which the plotting function below relies on
    grid_clf = GridSearchCV(SVC(), param_grid=param_grid, cv=5, return_train_score=True)
    grid_clf.fit(X, y)
    print("best param:{0} best score:{1}".format(grid_clf.best_params_, grid_clf.best_score_))
    
    
    
    

Plot the training and cross-validation scores for each parameter value:

    def plot_curve(train_sizes, cv_results, xlabel):
        train_scores_mean = cv_results['mean_train_score']
        train_scores_std = cv_results['std_train_score']
        test_scores_mean = cv_results['mean_test_score']
        test_scores_std = cv_results['std_test_score']
        plt.figure(figsize=(10, 6), dpi=144)
        plt.title('parameter tuning')
        plt.grid()
        plt.xlabel(xlabel)
        plt.ylabel('score')
        # fill_between(x, lower, upper) shades the band between the lower and
        # upper curves over the whole x range; color sets the band's color and
        # alpha its opacity in [0, 1] (larger values are more opaque)
        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, '.--', color="r",
                 label="Training score")
        plt.plot(train_sizes, test_scores_mean, '.-', color="g",
                 label="Cross-validation score")
        plt.legend(loc="best")

    plot_curve(gammas, grid_clf.cv_results_, xlabel='rbf gammas')
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/eVjOPoDchIEpXgiyUMm1uWZLn9Ga.png)

Plot the learning curve for the Gaussian kernel with gamma=0.01:

    # Plot the learning curve
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import ShuffleSplit

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK text

    def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                            n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()  # draw the background grid
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
        plt.legend(loc="best")  # add the legend
        return plt

    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)  # 10 random train/test splits
    plot_learning_curve(SVC(C=1.0, kernel='rbf', gamma=0.01),
                        "RBF learning curve", X, y, cv=cv)
    
    
    
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-17/MgDvs5q9CJXNaEdlbyS84euVI30h.png)

The Gaussian kernel does not perform well; try a polynomial kernel instead:

    # Second-degree polynomial kernel
    from sklearn.svm import SVC

    clf_poly2 = SVC(C=1.0, kernel='poly', degree=2)
    clf_poly2.fit(X_train, y_train)
    poly2_train_score = clf_poly2.score(X_train, y_train)
    poly2_test_score = clf_poly2.score(X_test, y_test)
    print('poly : train_score:{0},test_score:{1}'.format(poly2_train_score, poly2_test_score))
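
For comparison, the same evaluation pattern with a linear kernel; this run is not in the original post, so treat it as a sketch whose scores depend on the random train/test split:

    clf_lin_cancer = SVC(C=1.0, kernel='linear')  # linear baseline, added for comparison
    clf_lin_cancer.fit(X_train, y_train)
    print('linear : train_score:{0},test_score:{1}'.format(
        clf_lin_cancer.score(X_train, y_train),
        clf_lin_cancer.score(X_test, y_test)))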
    
    
    
    
