Data Mining Study Notes

This facebook dataset contains records with a number of feature fields: x₁ through x₁₀ are known attributes, and y is the target variable to predict. Preprocessing starts, as usual, with importing the necessary libraries and loading the data for the analysis that follows.

    import numpy as np
    import pandas as pd
    from statsmodels.formula.api import ols
    import statsmodels.api as sm
    ## Load the data
    train_facebook = pd.read_csv(r"E:\wechart\train_facebook.csv")  ## complete data
    p_facebook = pd.read_csv(r"E:\wechart\train_facebook_p.csv")    ## incomplete data
    ## Split the data (take copies so later column edits don't touch the originals)
    train = p_facebook[0:10000].copy()      ## training set
    predict = p_facebook[10001:].copy()     ## rows to predict
    true_value = train_facebook[10001:]     ## ground truth for those rows
    predict = predict.iloc[:, :-1]          ## drop the y column from the prediction rows

Exploring the data shows that x₁, x₃, x₅, x₇, and x₉ take the letter values H, C, D, and S, while the remaining variables take integer values from 1 to 13.
The first idea is to fit a linear regression model to predict the target variable y.

    ## Create empty dummy columns for the suit variables (S is the reference level)
    for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
        for suit in ['H', 'C', 'D']:
            predict[col + '_' + suit] = np.nan
    for i in range(1, 6):
        for j in range(0, 8865):
            ## x_H = 1 if the suit column (x1, x3, x5, x7, x9) is 'H', else 0
            if predict.iloc[j, 2*i-1] == 'H':
                predict.iloc[j, 3*i+8] = 1
            else:
                predict.iloc[j, 3*i+8] = 0
            ## x_C = 1 if the suit column is 'C', else 0
            if predict.iloc[j, 2*i-1] == 'C':
                predict.iloc[j, 3*i+9] = 1
            else:
                predict.iloc[j, 3*i+9] = 0
            ## x_D = 1 if the suit column is 'D', else 0
            if predict.iloc[j, 2*i-1] == 'D':
                predict.iloc[j, 3*i+10] = 1
            else:
                predict.iloc[j, 3*i+10] = 0
    ## Same conversion for the training set (the column offsets shift by one
    ## because train still contains the y column)
    for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
        for suit in ['H', 'C', 'D']:
            train[col + '_' + suit] = np.nan
    for i in range(1, 6):
        for j in range(0, 10000):
            if train.iloc[j, 2*i-1] == 'H':
                train.iloc[j, 3*i+9] = 1
            else:
                train.iloc[j, 3*i+9] = 0
            if train.iloc[j, 2*i-1] == 'C':
                train.iloc[j, 3*i+10] = 1
            else:
                train.iloc[j, 3*i+10] = 0
            if train.iloc[j, 2*i-1] == 'D':
                train.iloc[j, 3*i+11] = 1
            else:
                train.iloc[j, 3*i+11] = 0
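
As an aside, pandas can build the same one-hot columns in a single vectorized call instead of the element-wise loops above. A minimal sketch with pd.get_dummies, rebuilt from the raw training slice (train_dummy is a hypothetical name; S stays the reference level, as above):

    ## Vectorized alternative to the loops above
    suit_cols = ['x1', 'x3', 'x5', 'x7', 'x9']
    dummies = pd.get_dummies(p_facebook[0:10000][suit_cols])       ## columns like x1_C, x1_D, x1_H, x1_S
    dummies = dummies.drop(columns=[c + '_S' for c in suit_cols])  ## keep S as the reference level
    train_dummy = pd.concat([p_facebook[0:10000], dummies.astype(int)], axis=1)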

Fit the regression and predict:

    lm_s = ols('y ~ x2 + x4 + x6 + x8 + x10 + x1_H + x1_C + x1_D + x3_H + x3_C + x3_D + x5_H + x5_C + x5_D + x7_H + x7_C + x7_D + x9_H + x9_C + x9_D', data=train).fit()
    print(lm_s.params)
    lm_s.summary()
    result = lm_s.predict(predict)
    ## y is discrete, so round the continuous predictions before scoring
    true_rate_ols = sum(result.round() == true_value['y']) / 8865
    true_rate_ols

The accuracy is a mere 2.5%. Linear regression is clearly a poor fit for this kind of discrete-choice problem: it does worse than random guessing, which would already be right about 10% of the time. So we turn to algorithms designed for classification.
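
For further calibration, a majority-class baseline is worth computing; a minimal sketch using the frames defined above:

    ## Baseline: always predict the most frequent training label
    majority = train['y'].mode()[0]
    baseline_acc = (true_value['y'] == majority).mean()
    baseline_acc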

KNN

    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt
    %matplotlib inline
    ## Map the suit letters in the raw data to numeric codes
    suit_map = {'D': 0, 'C': 1, 'H': 2, 'S': 3}
    knn_train_x = train.iloc[:, 1:11].copy()
    for i in range(0, 9, 2):
        knn_train_x.iloc[:, i] = knn_train_x.iloc[:, i].map(suit_map)
    knn_train_y = train.iloc[:, 11]
    knn_predict = predict.iloc[:, 1:11].copy()
    for i in range(0, 9, 2):
        knn_predict.iloc[:, i] = knn_predict.iloc[:, i].map(suit_map)
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(knn_train_x, knn_train_y)
    knn_result = knn.predict(knn_predict)
    true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
    true_rate_knn

The accuracy is 52.6%, which is not bad. Can we push it higher?

    ## Tune n_neighbors to improve accuracy
    result_list = []
    n = 40
    for i in range(1, n):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(knn_train_x, knn_train_y)
        knn_result = knn.predict(knn_predict)
        true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
        result_list.append(true_rate_knn)
    x = [i for i in range(1, n)]
    plt.figure(figsize=(12, 9))
    plt.plot(x, result_list)
    plt.xlabel('n_neighbors', size=20)
    plt.ylabel('Accuracy', size=20)
[Figure: accuracy vs. n_neighbors]

As n grows, accuracy climbs steadily; past n = 10 the gains flatten out and the curve just fluctuates around 56%.
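
To read the best n off the scan programmatically rather than from the plot, something like this works:

    ## Best n_neighbors found by the scan (+1 because the scan starts at 1)
    best_n = int(np.argmax(result_list)) + 1
    print(best_n, result_list[best_n - 1])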

    ## Same scan, but weight neighbors by inverse distance
    result_list = []
    n = 40
    for i in range(1, n):
        knn = KNeighborsClassifier(n_neighbors=i, weights='distance', n_jobs=-1)
        knn.fit(knn_train_x, knn_train_y)
        knn_result = knn.predict(knn_predict)
        true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
        result_list.append(true_rate_knn)
    x = [i for i in range(1, n)]
    plt.figure(figsize=(12, 9))
    plt.plot(x, result_list)
    plt.xlabel('n_neighbors', size=20)
    plt.ylabel('Accuracy', size=20)
[Figure: accuracy vs. n_neighbors with distance weighting]

Distance weighting lifts the accuracy a little, to about 57%.
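
KNN is distance-based, so the unscaled features (ranks run 1-13, encoded suits only 0-3) contribute unevenly to the metric, and standardizing them is worth a try. A minimal sketch with sklearn's StandardScaler (n_neighbors=10 is just an illustrative setting; whether scaling helps here is untested):

    from sklearn.preprocessing import StandardScaler

    ## Standardize so ranks (1-13) and suits (0-3) are on a comparable scale
    scaler = StandardScaler()
    knn_train_scaled = scaler.fit_transform(knn_train_x)
    knn_predict_scaled = scaler.transform(knn_predict)
    knn_scaled = KNeighborsClassifier(n_neighbors=10, weights='distance')
    knn_scaled.fit(knn_train_scaled, knn_train_y)
    (knn_scaled.predict(knn_predict_scaled) == true_value.iloc[:, -1]).mean()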

Logistic regression

Plain linear regression performed badly, so it is worth trying logistic regression, which is built for discrete-choice models.

    from sklearn.linear_model import LogisticRegression
    ## Map the suit letters to numeric codes, exactly as in the KNN step
    log_train_x = train.iloc[:, 1:11].copy()
    for i in range(0, 9, 2):
        log_train_x.iloc[:, i] = log_train_x.iloc[:, i].map(suit_map)
    log_train_y = train.iloc[:, 11]
    log_predict = predict.iloc[:, 1:11].copy()
    for i in range(0, 9, 2):
        log_predict.iloc[:, i] = log_predict.iloc[:, i].map(suit_map)
    lg = LogisticRegression()
    lg.fit(log_train_x, log_train_y)
    log_result = lg.predict(log_predict)
    true_rate_log = sum(log_result == true_value.iloc[:, -1]) / 8865
    true_rate_log
    ## Accuracy close to 50% -- mediocre
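
One caveat: the integer coding imposes an artificial ordering on the suits (D < C < H < S), which logistic regression takes literally. The dummy columns built for the linear model avoid that; a minimal sketch reusing them (max_iter raised because the default solver sometimes stops before converging):

    ## Refit on the one-hot suit columns instead of the integer codes
    dummy_cols = ['x2', 'x4', 'x6', 'x8', 'x10'] + \
                 [c + '_' + s for c in ['x1', 'x3', 'x5', 'x7', 'x9'] for s in ['H', 'C', 'D']]
    lg_dummy = LogisticRegression(max_iter=1000)
    lg_dummy.fit(train[dummy_cols], train['y'])
    (lg_dummy.predict(predict[dummy_cols]) == true_value['y']).mean()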

Decision tree

    from sklearn.tree import DecisionTreeClassifier
    from sklearn import tree
    tree_train_x = log_train_x
    tree_train_y = log_train_y
    tree_predict = log_predict
    clf = DecisionTreeClassifier(criterion='gini')
    clf.fit(tree_train_x, tree_train_y)
    true_rate_tree = (clf.predict(tree_predict) == true_value.iloc[:, -1]).mean()
    true_rate_tree
    ## Accuracy: 48.6%
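
A tree grown without limits memorizes the training set, so capping its depth is the usual first regularization step; a quick sketch (the depth values are illustrative):

    ## Scan a few depth caps to curb overfitting
    for depth in [5, 10, 15, 20]:
        clf_d = DecisionTreeClassifier(criterion='gini', max_depth=depth)
        clf_d.fit(tree_train_x, tree_train_y)
        print(depth, (clf_d.predict(tree_predict) == true_value.iloc[:, -1]).mean())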

Random forest

    from sklearn.ensemble import RandomForestClassifier
    forest_train_x = log_train_x
    forest_train_y = log_train_y
    forest_predict = log_predict
    forest = RandomForestClassifier(n_estimators=500)
    forest.fit(forest_train_x, forest_train_y)
    true_rate_forest = (forest.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_forest
    ## Accuracy: 59% -- an improvement
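
A random forest can also estimate its own generalization accuracy from the bootstrap rows each tree never saw, without touching the held-out answers; a minimal sketch with oob_score:

    ## Out-of-bag estimate: each tree is scored on the rows its bootstrap sample missed
    forest_oob = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1)
    forest_oob.fit(forest_train_x, forest_train_y)
    forest_oob.oob_score_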

Extra-Trees (extremely randomized trees)

    from sklearn.ensemble import ExtraTreesClassifier
    extraTree = ExtraTreesClassifier(n_estimators=100)
    extraTree.fit(forest_train_x, forest_train_y)
    true_rate_extraForest = (extraTree.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_extraForest
    ## Accuracy: 56% -- decent

AdaBoost

    from sklearn.ensemble import AdaBoostClassifier
    ada = AdaBoostClassifier(n_estimators=100)
    ada.fit(forest_train_x, forest_train_y)
    true_rate_Ada = (ada.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_Ada
    ## Accuracy: 48.6%

GBDT (gradient boosted trees)

    from sklearn.ensemble import GradientBoostingClassifier
    gbdt = GradientBoostingClassifier()
    gbdt.fit(forest_train_x, forest_train_y)
    true_rate_Gbdt = (gbdt.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_Gbdt
    ## Accuracy: 60.5% -- better still

LightGBM

    # pip install lightgbm
    from lightgbm import LGBMClassifier
    ## Convert the object-typed suit columns to integers
    for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
        forest_train_x[col] = pd.to_numeric(forest_train_x[col])
        forest_predict[col] = pd.to_numeric(forest_predict[col])
    lgbm = LGBMClassifier()
    lgbm.fit(forest_train_x, forest_train_y)
    true_rate_lgbm = (lgbm.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_lgbm
    ## Accuracy: 61.5% -- another gain
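
LightGBM can also treat the suits as true categoricals rather than ordered integers, by casting those columns to pandas' category dtype; a minimal sketch (whether it beats the integer coding here is untested):

    ## Let LightGBM split on suit categories directly instead of an imposed order
    cat_train = forest_train_x.copy()
    cat_predict = forest_predict.copy()
    for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
        cat_train[col] = cat_train[col].astype('category')
        cat_predict[col] = cat_predict[col].astype('category')
    lgbm_cat = LGBMClassifier()
    lgbm_cat.fit(cat_train, forest_train_y)
    (lgbm_cat.predict(cat_predict) == true_value.iloc[:, -1]).mean()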

XGBoost

    from xgboost import XGBClassifier
    xgb = XGBClassifier()
    xgb.fit(forest_train_x, forest_train_y)
    true_rate_xgb = (xgb.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_xgb
    ## Accuracy: 69% -- a big jump
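
Since XGBoost is in the lead, tuning it is the natural next step; a small grid-search sketch with sklearn's GridSearchCV (the grid values are illustrative, and the search cross-validates on the training rows only):

    from sklearn.model_selection import GridSearchCV

    ## Illustrative grid; widen or refine as compute allows
    param_grid = {
        'n_estimators': [100, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.05, 0.1, 0.3],
    }
    search = GridSearchCV(XGBClassifier(), param_grid, cv=3, n_jobs=-1)
    search.fit(forest_train_x, forest_train_y)
    print(search.best_params_, search.best_score_)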

Gaussian naive Bayes

    from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
    gnb = GaussianNB()
    gnb.fit(forest_train_x, forest_train_y)
    true_rate_gnb = (gnb.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_gnb
    ## Accuracy: 48.5%

Multinomial naive Bayes

    mnb = MultinomialNB()
    mnb.fit(forest_train_x, forest_train_y)
    true_rate_mnb = (mnb.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_mnb
    ## Accuracy: 48.4%

Bernoulli naive Bayes

    bnb = BernoulliNB()
    bnb.fit(forest_train_x, forest_train_y)
    true_rate_bnb = (bnb.predict(forest_predict) == true_value.iloc[:, -1]).mean()
    true_rate_bnb
    ## Accuracy: 49.8%

Having tried this range of algorithms, it is clear they are not equally suited to the problem. Linear regression, used here as a baseline experiment, performed worst. Most of the classifiers hovered around 50-60% accuracy, while XGBoost stood out at 69%, well ahead of the rest; the table sketched below collects the numbers.
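
A sketch of that comparison, built from the true_rate_* variables collected along the way:

    ## Gather all recorded accuracies into one sorted table
    scores = pd.Series({
        'OLS (rounded)': true_rate_ols,
        'KNN': true_rate_knn,
        'Logistic': true_rate_log,
        'Decision tree': true_rate_tree,
        'Random forest': true_rate_forest,
        'Extra trees': true_rate_extraForest,
        'AdaBoost': true_rate_Ada,
        'GBDT': true_rate_Gbdt,
        'LightGBM': true_rate_lgbm,
        'XGBoost': true_rate_xgb,
        'GaussianNB': true_rate_gnb,
        'MultinomialNB': true_rate_mnb,
        'BernoulliNB': true_rate_bnb,
    })
    scores.sort_values(ascending=False)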
