Advertisement

Python金融数据挖掘 第八章 复习思考题1

阅读量:

一、问题

医院礼品店累计完成了五笔交易,详细列示于表8-3中,请使用Apriori算法进行关联规则分析。

(1)使用数字代替商品完成第3列;

(2)计算购买“鲜花”的支持度;

(3)计算购买“慰问卡”的支持度;

(4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度;

(5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析。

{1:鲜花、2:慰问卡、3:苏打水、4:毛绒玩具熊、5:气球、6:糖果}

二、代码

(1)使用数字代替商品完成第3列;

表 8-3 某医院礼品店购买清单

交易序号 购买礼品清单 使用数字代替
1 鲜花、慰问卡、苏打水 1,2,3
2 毛绒玩具熊、鲜花、气球、糖果 4,1,5,6
3 慰问卡、糖果、鲜花 2,6,1
4 毛绒玩具熊、气球、苏打水 4,5,3
5 鲜花、慰问卡、苏打水 1,2,3
复制代码
 # (1) Replace each product with a numeric code to fill in column 3.
 # Print the code-to-product legend; the transactions below use these codes.
 print('{1:鲜花、2:慰问卡、3:苏打水、4:毛绒玩具熊、5:气球、6:糖果}')
    
 def loadDataSet():
     """Return the five gift-shop transactions as lists of numeric item codes.

     Each inner list is one transaction. A fresh list of lists is built on
     every call, so callers may mutate the result freely.
     """
     transactions = [
         [1, 2, 3],
         [4, 1, 5, 6],
         [2, 6, 1],
         [4, 5, 3],
         [1, 2, 3],
     ]
     return transactions
    
 # Build the transaction list once; later top-level statements reuse D.
 D=loadDataSet()
 # Show the encoded transactions.
 print(D)

(2)计算购买“鲜花”的支持度;

复制代码
 # (2)计算购买“鲜花”的支持度

    
 # Candidate 1-itemsets (C1).
 def createC1(dataSet):
     """Build the candidate 1-itemsets from a collection of transactions.

     Args:
         dataSet: iterable of transactions, each an iterable of hashable,
             mutually orderable items.

     Returns:
         A list with one single-element ``frozenset`` per distinct item,
         ordered by ascending item value. Empty input yields ``[]``.
     """
     # A set gives O(1) de-duplication instead of scanning a list per item.
     unique_items = set()
     for transaction in dataSet:
         unique_items.update(transaction)
     # frozenset (not set) so the itemsets can later be used as dict keys.
     return [frozenset([item]) for item in sorted(unique_items)]
    
  
    
 # Candidate 1-itemsets derived from the transactions in D.
 C1=createC1(D)
 # Debug aid: print('C1:',C1)
    
  
    
 # Returns the frequent itemsets and the support of every counted candidate.
 def scanD(D,Ck,minSupport):
     """Count candidate itemsets over the transactions and keep the frequent ones.

     Args:
         D: sequence of transactions (each an iterable of items).
         Ck: list of candidate itemsets (``frozenset`` objects).
         minSupport: minimum fraction of transactions an itemset must occur
             in to be reported as frequent.

     Returns:
         ``(retList, supportData)`` where ``retList`` holds the candidates
         whose support is at least ``minSupport`` (in reverse order of first
         occurrence) and ``supportData`` maps every candidate that occurred
         at least once to its support. Candidates that never occur are
         absent from ``supportData``.
     """
     ssCnt = {}
     for tid in D:
         for can in Ck:
             if can.issubset(tid):
                 ssCnt[can] = ssCnt.get(can, 0) + 1

     # ssCnt is empty whenever D is empty, so the division below is never
     # reached with numItems == 0.
     numItems = float(len(D))
     retList = []
     supportData = {}
     for key, count in ssCnt.items():
         support = count / numItems
         if support >= minSupport:
             retList.append(key)
         # Support is recorded for every counted candidate, frequent or not.
         supportData[key] = support
     # Same ordering the original obtained by inserting at the front.
     retList.reverse()
     return retList, supportData
    
  
    
 # Frequent 1-itemsets (ret1) and the support of every single item (suD).
 # D and C1 were already built above, so they are reused here instead of
 # reloading the dataset twice and regenerating the candidates.
 ret1, suD = scanD(D, C1, 0.22)
 # Debug aids: print('ret1:', ret1) / print('suD:', suD)
 # Item code 1 is "flowers".
 print('购买“鲜花”的支持度{}。'.format(suD[frozenset({1})]))

(3)计算购买“慰问卡”的支持度;

复制代码
 # (3)计算购买“慰问卡”的支持度

    
 # Candidate 1-itemsets (C1).
 def createC1(dataSet):
     """Build the candidate 1-itemsets from a collection of transactions.

     Args:
         dataSet: iterable of transactions, each an iterable of hashable,
             mutually orderable items.

     Returns:
         A list with one single-element ``frozenset`` per distinct item,
         ordered by ascending item value. Empty input yields ``[]``.
     """
     # A set gives O(1) de-duplication instead of scanning a list per item.
     unique_items = set()
     for transaction in dataSet:
         unique_items.update(transaction)
     # frozenset (not set) so the itemsets can later be used as dict keys.
     return [frozenset([item]) for item in sorted(unique_items)]
    
  
    
 # Candidate 1-itemsets derived from the transactions in D.
 C1=createC1(D)
 # Debug aid: print('C1:',C1)
    
  
    
 # Returns the frequent itemsets and the support of every counted candidate.
 def scanD(D,Ck,minSupport):
     """Count candidate itemsets over the transactions and keep the frequent ones.

     Args:
         D: sequence of transactions (each an iterable of items).
         Ck: list of candidate itemsets (``frozenset`` objects).
         minSupport: minimum fraction of transactions an itemset must occur
             in to be reported as frequent.

     Returns:
         ``(retList, supportData)`` where ``retList`` holds the candidates
         whose support is at least ``minSupport`` (in reverse order of first
         occurrence) and ``supportData`` maps every candidate that occurred
         at least once to its support. Candidates that never occur are
         absent from ``supportData``.
     """
     ssCnt = {}
     for tid in D:
         for can in Ck:
             if can.issubset(tid):
                 ssCnt[can] = ssCnt.get(can, 0) + 1

     # ssCnt is empty whenever D is empty, so the division below is never
     # reached with numItems == 0.
     numItems = float(len(D))
     retList = []
     supportData = {}
     for key, count in ssCnt.items():
         support = count / numItems
         if support >= minSupport:
             retList.append(key)
         # Support is recorded for every counted candidate, frequent or not.
         supportData[key] = support
     # Same ordering the original obtained by inserting at the front.
     retList.reverse()
     return retList, supportData
    
  
    
 # Frequent 1-itemsets (ret1) and the support of every single item (suD).
 # D and C1 were already built above, so they are reused here instead of
 # reloading the dataset twice and regenerating the candidates.
 ret1, suD = scanD(D, C1, 0.22)
 # Debug aids: print('ret1:', ret1) / print('suD:', suD)
 # Item code 2 is "greeting card".
 print('购买“慰问卡”的支持度{}。'.format(suD[frozenset({2})]))

(4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度;

复制代码
 # (4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度

    
 # Generate candidate k-itemsets.
 def aprioriGen(Ck,k):
     """Join itemsets that share their first ``k - 2`` items into k-item candidates.

     Args:
         Ck: list of ``frozenset`` itemsets, expected to hold ``k - 1`` items
             each, with mutually orderable items.
         k: size of the candidates to produce. With ``k == 2`` the shared
             prefix is empty, so every pair of itemsets is joined.

     Returns:
         A list with the union of every pair ``(Ck[i], Ck[j])``, ``i < j``,
         whose sorted ``k - 2`` item prefixes are equal.
     """
     # Sort BEFORE slicing: a frozenset has no defined iteration order, so
     # taking the first k-2 items of an unsorted listing compares arbitrary
     # items and can miss valid joins.
     prefixes = [sorted(itemset)[:k - 2] for itemset in Ck]
     retList = []
     lenCk = len(Ck)
     for i in range(lenCk):
         for j in range(i + 1, lenCk):
             if prefixes[i] == prefixes[j]:
                 retList.append(Ck[i] | Ck[j])
     return retList
    
  
    
 # Candidate 2-itemsets: with k=2 the compared prefix is empty, so this is
 # every pairwise union of the candidate 1-itemsets.
 ret2=aprioriGen(C1,2)
 # Debug aid: print('ret2:',ret2)
    
  
    
 # Frequent itemsets of every size plus the support table.
 def apriori(D,minSupport):
     """Run level-wise Apriori over the transactions.

     Args:
         D: sequence of transactions (each an iterable of items).
         minSupport: minimum support passed to ``scanD`` at each level.

     Returns:
         ``(L, suppData)``. ``L[i]`` is the list of frequent itemsets found
         at level ``i`` (level 0 comes from the 1-item candidates); the last
         entry of ``L`` is always an empty list because the loop stops only
         after a level produces nothing. ``suppData`` accumulates the support
         dictionaries returned by ``scanD`` for all levels.
     """
     C1 = createC1(D)
     L1, suppData = scanD(D, C1, minSupport)
     L = [L1]
     k = 2
     # L[k - 2] is the most recently appended level; stop once it is empty.
     while L[k - 2]:
         Ck = aprioriGen(L[k - 2], k)
         Lk, supK = scanD(D, Ck, minSupport)
         suppData.update(supK)
         L.append(Lk)
         k += 1
     return L, suppData
    
  
    
 # NOTE: despite its name, L1 receives the whole per-level list returned by
 # apriori() (not only the 1-itemsets); suD2 receives the support table.
 L1,suD2=apriori(D,0.22)
 # Debug aids: print('L1:',L1) / print('suD2:',suD2)
    
  
    
 # Compute rule confidence.
 def calcConf(freqSet,H,supportData,brl,minConf=0.7):
     """Score the rules ``(freqSet - conseq) -> conseq`` for each consequent in H.

     Args:
         freqSet: the frequent itemset (``frozenset``) the rules are drawn from.
         H: iterable of candidate consequents (``frozenset`` subsets of
             ``freqSet``).
         supportData: mapping from itemset to support; must contain
             ``freqSet`` and every antecedent ``freqSet - conseq``.
         brl: list that receives one ``(antecedent, consequent, confidence)``
             tuple per accepted rule (mutated in place).
         minConf: minimum confidence for a rule to be accepted.

     Returns:
         The consequents from ``H`` whose rule reached ``minConf``, in their
         original order.

     Raises:
         KeyError: if a required itemset is missing from ``supportData``.
     """
     prunedH = []
     for conseq in H:
         antecedent = freqSet - conseq
         # confidence = support(whole itemset) / support(antecedent)
         conf = supportData[freqSet] / supportData[antecedent]
         if conf >= minConf:
             brl.append((antecedent, conseq, conf))
             prunedH.append(conseq)
     return prunedH
    
  
    
 # Grow multi-item consequents for frequent itemsets with more than 2 items.
 def rulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):
     """Recursively build and score larger consequents for ``freqSet``.

     The consequents in ``H`` are assumed to share one size ``m`` (taken from
     ``H[0]``). When ``freqSet`` has more than ``m + 1`` items, the
     ``(m + 1)``-item consequents are generated with ``aprioriGen`` and scored
     with ``calcConf``, which appends accepted rules to ``brl``. The function
     recurses while more than one consequent survives. The consequents in
     ``H`` themselves are not scored here.

     Args:
         freqSet: frequent itemset (``frozenset``) the rules are drawn from.
         H: list of equally sized consequents (``frozenset`` objects).
         supportData: mapping from itemset to support.
         brl: list that receives accepted rules (mutated in place).
         minConf: minimum confidence for a rule to be accepted.
     """
     # Nothing to extend; also avoids IndexError on H[0].
     if not H:
         return
     m = len(H[0])
     if len(freqSet) > m + 1:
         Hmp1 = aprioriGen(H, m + 1)
         Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
         # At least two surviving consequents are needed to join further.
         if len(Hmp1) > 1:
             rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
    
         
    
 # Rules that meet the minimum confidence.
 def generateRules(L,supportData,minConf=0.7):
     """Derive association rules from the frequent itemsets.

     Args:
         L: per-level list of frequent itemsets; level 0 is skipped.
             Assumes the layout returned by ``apriori`` (``L[i]`` holds
             itemsets of ``i + 1`` items).
         supportData: mapping from itemset to support.
         minConf: minimum confidence for a rule to be kept.

     Returns:
         A list of ``(antecedent, consequent, confidence)`` tuples as
         appended by ``calcConf``.
     """
     bigRuleList = []
     for i in range(1, len(L)):
         for freqSet in L[i]:
             H1 = [frozenset([item]) for item in freqSet]
             # Score single-item consequents at EVERY level. Handing H1
             # straight to rulesFromConseq (as before) skipped them for
             # itemsets of three or more items, because that helper only
             # scores consequents one item larger than the ones it receives.
             H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
             # Larger consequents can only pass if all their single-item
             # parts passed, so extending the survivors loses no rule.
             if i > 1 and len(H1) > 1:
                 rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
     return bigRuleList
    
  
    
 # All rules with confidence of at least 0.1.
 bRlist = generateRules(L1, suD2, 0.1)
 # Item codes: 1 = flowers, 2 = greeting card.
 print('同时购买“鲜花”和购买“慰问卡”的支持度为{}。'.format(suD2[frozenset({1, 2})]))
 # This heading introduces confidences, so it is labelled 置信度 (it used to
 # repeat the 支持度 / support label).
 print('同时购买“鲜花”和购买“慰问卡”的置信度为:')
 # Print the rules from the computed list instead of hard-coded text, so the
 # output always matches the data.
 for antecedent, consequent, conf in bRlist:
     if antecedent | consequent == frozenset({1, 2}):
         print(antecedent, '-->', consequent, 'conf:', conf)

(5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析。

复制代码
 # (5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析

    
 # Generate candidate k-itemsets.
 def aprioriGen(Ck,k):
     """Join itemsets that share their first ``k - 2`` items into k-item candidates.

     Args:
         Ck: list of ``frozenset`` itemsets, expected to hold ``k - 1`` items
             each, with mutually orderable items.
         k: size of the candidates to produce. With ``k == 2`` the shared
             prefix is empty, so every pair of itemsets is joined.

     Returns:
         A list with the union of every pair ``(Ck[i], Ck[j])``, ``i < j``,
         whose sorted ``k - 2`` item prefixes are equal.
     """
     # Sort BEFORE slicing: a frozenset has no defined iteration order, so
     # taking the first k-2 items of an unsorted listing compares arbitrary
     # items and can miss valid joins.
     prefixes = [sorted(itemset)[:k - 2] for itemset in Ck]
     retList = []
     lenCk = len(Ck)
     for i in range(lenCk):
         for j in range(i + 1, lenCk):
             if prefixes[i] == prefixes[j]:
                 retList.append(Ck[i] | Ck[j])
     return retList
    
  
    
 # Candidate 2-itemsets: with k=2 the compared prefix is empty, so this is
 # every pairwise union of the candidate 1-itemsets.
 ret2=aprioriGen(C1,2)
 # Debug aid: print('ret2:',ret2)
    
  
    
 # Frequent itemsets of every size plus the support table.
 def apriori(D,minSupport):
     """Run level-wise Apriori over the transactions.

     Args:
         D: sequence of transactions (each an iterable of items).
         minSupport: minimum support passed to ``scanD`` at each level.

     Returns:
         ``(L, suppData)``. ``L[i]`` is the list of frequent itemsets found
         at level ``i`` (level 0 comes from the 1-item candidates); the last
         entry of ``L`` is always an empty list because the loop stops only
         after a level produces nothing. ``suppData`` accumulates the support
         dictionaries returned by ``scanD`` for all levels.
     """
     C1 = createC1(D)
     L1, suppData = scanD(D, C1, minSupport)
     L = [L1]
     k = 2
     # L[k - 2] is the most recently appended level; stop once it is empty.
     while L[k - 2]:
         Ck = aprioriGen(L[k - 2], k)
         Lk, supK = scanD(D, Ck, minSupport)
         suppData.update(supK)
         L.append(Lk)
         k += 1
     return L, suppData
    
  
    
 # NOTE: despite its name, L1 receives the whole per-level list returned by
 # apriori() (not only the 1-itemsets); suD2 receives the support table.
 L1,suD2=apriori(D,0.22)
 # Debug aids: print('L1:',L1) / print('suD2:',suD2)
    
  
    
 # Compute rule confidence.
 def calcConf(freqSet,H,supportData,brl,minConf=0.7):
     """Score the rules ``(freqSet - conseq) -> conseq`` for each consequent in H.

     Args:
         freqSet: the frequent itemset (``frozenset``) the rules are drawn from.
         H: iterable of candidate consequents (``frozenset`` subsets of
             ``freqSet``).
         supportData: mapping from itemset to support; must contain
             ``freqSet`` and every antecedent ``freqSet - conseq``.
         brl: list that receives one ``(antecedent, consequent, confidence)``
             tuple per accepted rule (mutated in place).
         minConf: minimum confidence for a rule to be accepted.

     Returns:
         The consequents from ``H`` whose rule reached ``minConf``, in their
         original order.

     Raises:
         KeyError: if a required itemset is missing from ``supportData``.
     """
     prunedH = []
     for conseq in H:
         antecedent = freqSet - conseq
         # confidence = support(whole itemset) / support(antecedent)
         conf = supportData[freqSet] / supportData[antecedent]
         if conf >= minConf:
             brl.append((antecedent, conseq, conf))
             prunedH.append(conseq)
     return prunedH
    
  
    
 # Grow multi-item consequents for frequent itemsets with more than 2 items.
 def rulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):
     """Recursively build and score larger consequents for ``freqSet``.

     The consequents in ``H`` are assumed to share one size ``m`` (taken from
     ``H[0]``). When ``freqSet`` has more than ``m + 1`` items, the
     ``(m + 1)``-item consequents are generated with ``aprioriGen`` and scored
     with ``calcConf``, which appends accepted rules to ``brl``. The function
     recurses while more than one consequent survives. The consequents in
     ``H`` themselves are not scored here.

     Args:
         freqSet: frequent itemset (``frozenset``) the rules are drawn from.
         H: list of equally sized consequents (``frozenset`` objects).
         supportData: mapping from itemset to support.
         brl: list that receives accepted rules (mutated in place).
         minConf: minimum confidence for a rule to be accepted.
     """
     # Nothing to extend; also avoids IndexError on H[0].
     if not H:
         return
     m = len(H[0])
     if len(freqSet) > m + 1:
         Hmp1 = aprioriGen(H, m + 1)
         Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
         # At least two surviving consequents are needed to join further.
         if len(Hmp1) > 1:
             rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
    
         
    
 # Rules that meet the minimum confidence.
 def generateRules(L,supportData,minConf=0.7):
     """Derive association rules from the frequent itemsets.

     Args:
         L: per-level list of frequent itemsets; level 0 is skipped.
             Assumes the layout returned by ``apriori`` (``L[i]`` holds
             itemsets of ``i + 1`` items).
         supportData: mapping from itemset to support.
         minConf: minimum confidence for a rule to be kept.

     Returns:
         A list of ``(antecedent, consequent, confidence)`` tuples as
         appended by ``calcConf``.
     """
     bigRuleList = []
     for i in range(1, len(L)):
         for freqSet in L[i]:
             H1 = [frozenset([item]) for item in freqSet]
             # Score single-item consequents at EVERY level. Handing H1
             # straight to rulesFromConseq (as before) skipped them for
             # itemsets of three or more items, because that helper only
             # scores consequents one item larger than the ones it receives.
             H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
             # Larger consequents can only pass if all their single-item
             # parts passed, so extending the survivors loses no rule.
             if i > 1 and len(H1) > 1:
                 rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
     return bigRuleList
    
  
    
 # Derive every rule whose confidence is at least 0.1 from the frequent
 # itemsets and support table computed above.
 bRlist=generateRules(L1,suD2,0.1)
 # Each entry is an (antecedent, consequent, confidence) tuple.
 print('bRlist:',bRlist)

全部评论 (0)

还没有任何评论哟~