Advertisement

数据挖掘(Python)——利用sklearn进行数据挖掘,实现算法:svm、knn、C5.0、NaiveBayes

阅读量:
复制代码
    说明:下面程序将利用 Python 中的 sklearn 包,根据用户流失前的行为,对用户的流失进行预测

数据格式说明: lost为二元分类标记符,在数据分析中通常用1或0表示用户是否流失;其他行为特征则反映了用户的使用情况;以下展示了部分数据样本

复制代码
 import xlrd

    
 import string
    
 import sklearn
    
 from sklearn import svm
    
 from sklearn import neighbors
    
 from sklearn import cluster
    
 from sklearn import tree
    
 from sklearn import naive_bayes 
    
 import numpy as np
    
  
    
 #--------数据加载开始--------------
    
 def data_import(filepath):
     """Load the data rows of the first sheet of an Excel workbook.

     The first row of the sheet is treated as a header and is left out of
     the result.

     Args:
         filepath: Path of the workbook, handed to ``xlrd.open_workbook``.

     Returns:
         A list with one entry per non-header row, each entry being the
         list of cell values of that row. The list is empty when the sheet
         has no rows after the header (or no rows at all).
     """
     workbook = xlrd.open_workbook(filepath)
     table = workbook.sheet_by_index(0)
     # row_values(i) returns the cell values of row i in one call, so there
     # is no need to collect them cell by cell. Starting the range at 1
     # skips the header row and also makes an empty sheet yield [].
     return [table.row_values(i) for i in range(1, table.nrows)]
    
  
    
 # First workbook: column 0 of every row is taken as the label, the
 # remaining columns as the feature vector.
 filepath1=r'F:\data_retain.xls'
 result1=data_import(filepath1)
 count1=len(result1)
 data1=[row[1:] for row in result1]
 target1=[row[0] for row in result1]

 # Second workbook, split into features and labels the same way.
 filepath2=r'F:\data_lost.xls'
 result2=data_import(filepath2)
 count2=len(result2)
 data2=[row[1:] for row in result2]
 target2=[row[0] for row in result2]

 # The combined sample is the first len_1 rows of the first workbook
 # followed by the first len_2 rows of the second one. Rows are fetched by
 # index, so a workbook with fewer rows than requested raises IndexError.
 len_1=10000
 len_2=10000
 data=[data1[i] for i in range(len_1)]+[data2[i] for i in range(len_2)]
 target=[target1[i] for i in range(len_1)]+[target2[i] for i in range(len_2)]
    
 #--------数据加载完毕--------------
    
  
    
 class machine_learn:
     """Fit scikit-learn classifiers and report per-class precision/recall.

     Every public method takes the feature rows and their labels, fits one
     classifier on them and scores the predictions made for the same rows.

     NOTE(review): the scores are computed on the training data itself, so
     they describe the fit, not how well the model generalizes.
     """

     @staticmethod
     def _score(predict_target, target):
         """Return per-class precision and recall of the predictions.

         Args:
             predict_target: Predicted labels, one per sample.
             target: True labels, aligned with ``predict_target``.

         Returns:
             ``[[precision_0, recall_0], [precision_1, recall_1]]``. A
             ratio whose denominator is zero (a class never predicted, or
             never present) is reported as 0.0 instead of raising
             ZeroDivisionError.
         """
         # hits[p][t] counts the samples predicted as class p whose true
         # label is class t; any label other than 0 counts as class 1.
         hits = [[0, 0], [0, 0]]
         for predicted, actual in zip(predict_target, target):
             hits[0 if predicted == 0 else 1][0 if actual == 0 else 1] += 1

         def ratio(numerator, denominator):
             if denominator == 0:
                 return 0.0
             # float() keeps this a true division on interpreters where
             # "/" between two ints truncates.
             return numerator / float(denominator)

         return [
             [ratio(hits[0][0], hits[0][0] + hits[0][1]),
              ratio(hits[0][0], hits[0][0] + hits[1][0])],
             [ratio(hits[1][1], hits[1][0] + hits[1][1]),
              ratio(hits[1][1], hits[0][1] + hits[1][1])],
         ]

     def _fit_and_score(self, model, data, target):
         """Fit ``model`` on (data, target) and score it on the same data."""
         model.fit(data, target)
         return self._score(model.predict(data), target)

     def svm_(self, data, target):
         """Score a linear-kernel support vector classifier.

         Returns ``[[precision_0, recall_0], [precision_1, recall_1]]``.
         """
         return self._fit_and_score(svm.SVC(kernel='linear'), data, target)

     def knn_(self, data, target):
         """Score a k-nearest-neighbours classifier (library defaults).

         Returns ``[[precision_0, recall_0], [precision_1, recall_1]]``.
         """
         return self._fit_and_score(
             neighbors.KNeighborsClassifier(), data, target)

     def tree_(self, data, target):
         """Score a decision tree classifier (library defaults).

         Returns ``[[precision_0, recall_0], [precision_1, recall_1]]``.
         """
         return self._fit_and_score(
             tree.DecisionTreeClassifier(), data, target)

     def bayes_(self, data, target):
         """Score a Gaussian naive Bayes classifier.

         Returns ``[[precision_0, recall_0], [precision_1, recall_1]]``.
         """
         return self._fit_and_score(naive_bayes.GaussianNB(), data, target)
    
  
    
  
    
 # Fit each classifier on the combined sample and print the two scores of
 # its second result row (the class-1 row) right after it finishes.
 a=machine_learn()

 svm_method=a.svm_(data,target)
 print('svm 方法预测,它的准确率可以达到 %f, 覆盖率达到 %f' % tuple(svm_method[1]))

 knn_method=a.knn_(data,target)
 print('knn 方法预测,它的准确率可以达到 %f, 覆盖率达到 %f' % tuple(knn_method[1]))

 tree_method=a.tree_(data,target)
 print('决策树 方法预测,它的准确率可以达到 %f, 覆盖率达到 %f' % tuple(tree_method[1]))

 bayes_method=a.bayes_(data,target)
 print('NaiveBayes 方法预测,它的准确率可以达到 %f, 覆盖率达到 %f' % tuple(bayes_method[1]))
复制代码
复制代码
    1、当训练样本中,流失用户:留存用户调整为1:1时,决策树模型的前10个规则可以通过覆盖20%的频道用户来定位60%的流失用户
复制代码
    2、当训练样本中,流失用户:留存用户调整为1:3时,决策树模型的前10个规则可以通过覆盖25.3%的频道用户来定位65.6%的流失用户
复制代码
    3、当训练样本中,流失用户:留存用户调整为1:1时,增加部分变量,决策树模型的前10个规则可以通过覆盖15%的频道用户来定位47%的流失用户
复制代码

全部评论 (0)

还没有任何评论哟~