Advertisement

数据挖掘实践指南读书笔记2

阅读量:
640?wx_fmt=png

1. 写在之前

本书所包含的源程序与数据资料均可通过以下链接获取:http://guidetodatamining.com/ 。该书的理论基础较为浅显,其中的错误也相对较少,并且实践练习部分较为丰富。若自行完成所有代码实现,则能获得较为显著的学习成果。综上所述,该书适合新手学习使用。如需转载,请注明出处;如发现内容中有任何问题,欢迎指正。
合集的具体位置如下:https://www.zybuluo.com/hainingwyx/note/559139

2. 基于物品的协同过滤

显式评级:用户显式地给出评级结果(如 YouTube 的点赞、点踩按钮)。隐式评级:网站点击轨迹等数据隐式地反映了用户的兴趣偏好。基于邻居(用户)的推荐系统计算量非常庞大,因此存在延迟性问题,此外还存在稀疏性问题。基于用户的过滤属于基于内存的方法,需要存储所有的评分记录才能进行推荐;而基于物品的过滤属于基于模型的方法,无需存储所有评分信息,而是事先构建一个表示各物品之间相似程度的模型。为了减少评分膨胀的影响(即避免余弦相似度因个别用户评分普遍偏高而失真),需要对余弦相似度进行调整,即在计算时先减去每个用户的平均评分:

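$$s(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_u)(R_{u,j}-\bar{R}_u)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_u)^2}\,\sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_u)^2}}$$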

U 表示所有同时对 i 和 j 进行过评级的用户集合,

$R_{u,i}$

表示用户 u 对物品 i 的评分,

$\bar{R}_u$

表示用户 u 对所有物品评分的平均值。对每一对物品计算该相似度,即可获得相似度矩阵。

复制代码
 # Sample ratings used by the adjusted-cosine example: user -> {band: rating}.
 users3 = {
     "David": {"Imagine Dragons": 3, "Daft Punk": 5, "Lorde": 4, "Fall Out Boy": 1},
     "Matt": {"Imagine Dragons": 3, "Daft Punk": 4, "Lorde": 4, "Fall Out Boy": 1},
     "Ben": {"Kacey Musgraves": 4, "Imagine Dragons": 3, "Lorde": 3, "Fall Out Boy": 1},
     "Chris": {
         "Kacey Musgraves": 4,
         "Imagine Dragons": 4,
         "Daft Punk": 4,
         "Lorde": 3,
         "Fall Out Boy": 1,
     },
     "Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4, "Daft Punk": 5, "Fall Out Boy": 3},
 }
    
  
    
 def computeSimilarity(band1, band2, userRatings):
     """Return the adjusted cosine similarity between two items.

     Only users who rated both ``band1`` and ``band2`` contribute. Each
     rating is first centred on that user's own average rating, which
     compensates for users who rate everything high or low.

     Args:
         band1: key of the first item.
         band2: key of the second item.
         userRatings: mapping ``user -> {item: numeric rating}``.

     Returns:
         The similarity as a float. ``0.0`` is returned when the value is
         undefined: no user rated both items, or every centred rating of
         one of the items is zero.
     """
     # Local import so the function works even when the surrounding
     # module did not import sqrt at the top level.
     from math import sqrt

     num = 0.0   # numerator: sum of products of centred ratings
     dem1 = 0.0  # first half of denominator: squared deviations for band1
     dem2 = 0.0  # second half of denominator: squared deviations for band2
     for ratings in userRatings.values():
         if band1 not in ratings or band2 not in ratings:
             continue
         # The average is only computed for co-raters, so a user with an
         # empty rating dict can no longer trigger a division by zero.
         avg = float(sum(ratings.values())) / len(ratings)
         dev1 = ratings[band1] - avg
         dev2 = ratings[band2] - avg
         num += dev1 * dev2
         dem1 += dev1 ** 2
         dem2 += dev2 ** 2

     denominator = sqrt(dem1) * sqrt(dem2)
     if denominator == 0:
         # Undefined similarity; report "no correlation" instead of raising.
         return 0.0
     return num / denominator

相似矩阵预测:

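$$p(u,i)=\frac{\sum_{N\in \mathrm{similarTo}(i)}\left(S_{i,N}\times R_{u,N}\right)}{\sum_{N\in \mathrm{similarTo}(i)}\left|S_{i,N}\right|}$$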

p(u,i)表示用户u对物品i的预测值

N 表示用户 u 已评级的物品中与 i 相似的每一个物品。

$S_{i,N}$

是 i 和 N 之间的相似度,

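$R_{u,N}$

是 u 给 N 的评级结果,应该在 [-1, 1] 之间取值,因此可能需要先做线性变换

$$NR_{u,N}=\frac{2\,(R_{u,N}-Min_R)-(Max_R-Min_R)}{Max_R-Min_R}$$

得到新的评级结果 $NR_{u,N}$;预测完成后,再通过逆变换还原到原始评分区间:

$$R_{u,N}=\frac{1}{2}\left((NR_{u,N}+1)\times(Max_R-Min_R)\right)+Min_R$$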

3. Slope One 算法

计算偏差

物品 i 到物品 j 的平均偏差为

$$dev_{i,j}=\sum_{u\in S_{i,j}(X)}\frac{u_i-u_j}{card(S_{i,j}(X))}$$

其中 $u_i$ 是用户 u 对物品 i 的评分,card(S) 是集合 S 中元素的个数,X 是整个评分集合,

$S_{i,j}(X)$

是所有同时对 i 和 j 进行过评分的用户集合。

复制代码
 def computeDeviations(self):
     """Build the Slope One deviation and frequency tables from self.data.

     Reads ``self.data`` (``user -> {item: rating}``) and stores two
     nested dicts on the instance:

     * ``self.frequencies[item][item2]``: number of users who rated both
       ``item`` and ``item2`` (the cardinality in the deviation formula).
     * ``self.deviations[item][item2]``: average of
       ``rating(item) - rating(item2)`` over those users.

     Both tables are rebuilt from scratch on every call, so calling the
     method again (for example after loading new data) never mixes
     averaged values with raw sums.
     """
     frequencies = {}
     deviations = {}

     # Pass 1: for every user, accumulate pairwise rating differences.
     for ratings in self.data.values():  # ratings: {item: value, ...}
         for (item, rating) in ratings.items():
             # An item gets an entry even if nobody co-rated it.
             itemFrequencies = frequencies.setdefault(item, {})
             itemDeviations = deviations.setdefault(item, {})
             for (item2, rating2) in ratings.items():
                 if item != item2:
                     itemFrequencies[item2] = itemFrequencies.get(item2, 0) + 1
                     # Running sum of differences; averaged in pass 2.
                     itemDeviations[item2] = (itemDeviations.get(item2, 0.0)
                                              + (rating - rating2))

     # Pass 2: turn the sums into averages. This must run once, after
     # all users have been processed.
     for (item, itemDeviations) in deviations.items():
         for item2 in itemDeviations:
             itemDeviations[item2] /= frequencies[item][item2]

     self.frequencies = frequencies
     self.deviations = deviations
    
 # test code for ComputeDeviations(self)
    
 #r = recommender(users2)
    
 #r.computeDeviations()
    
 #r.deviations

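加权 Slope One 预测

$$P^{wS1}(u)_j=\frac{\sum_{i\in S(u)-\{j\}}(dev_{j,i}+u_i)\,c_{j,i}}{\sum_{i\in S(u)-\{j\}}c_{j,i}}$$

$P^{wS1}(u)_j$ 表示加权 Slope One 算法给出的用户 u 对物品 j 的预测,其中 $c_{j,i}=card(S_{j,i}(X))$ 是同时对 j 和 i 评分的用户数,$S(u)$ 是用户 u 评过分的物品集合。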

复制代码
 def slopeOneRecommendations(self, userRatings):
     """Return weighted Slope One predictions for items the user has not rated.

     For every unrated item j the prediction is
     ``sum((dev[j][i] + u_i) * c[j][i]) / sum(c[j][i])`` over the items i
     the user rated, where ``dev`` is ``self.deviations`` and ``c`` is
     ``self.frequencies``.

     Args:
         userRatings: mapping ``item -> rating`` for a single user.

     Returns:
         A list of at most 50 ``(name, predicted rating)`` tuples sorted
         by predicted rating, highest first. Names come from
         ``self.convertProductID2name``.
     """
     recommendations = {}  # item j -> running numerator
     frequencies = {}      # item j -> running denominator

     # for every item and rating the user supplied (item i) ...
     for (userItem, userRating) in userRatings.items():
         # ... and every item in the dataset the user did not rate (item j)
         for (diffItem, diffRatings) in self.deviations.items():
             if diffItem not in userRatings and userItem in diffRatings:
                 freq = self.frequencies[diffItem][userItem]  # c_ji
                 recommendations.setdefault(diffItem, 0.0)
                 frequencies.setdefault(diffItem, 0)
                 # add to the running sum representing the numerator
                 recommendations[diffItem] += (diffRatings[userItem] +
                                               userRating) * freq
                 # keep a running sum of the frequency of diffItem
                 frequencies[diffItem] += freq

     # Only after all sums are complete: divide, rank and truncate.
     ranked = [(self.convertProductID2name(k), v / frequencies[k])
               for (k, v) in recommendations.items()]
     ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
     # only the first 50 recommendations are returned
     return ranked[:50]
    
        
    
 # test code for SlopeOneRecommendations
    
 #r = recommender(users2)
    
 #r.computeDeviations()
    
 #g = users2['Ben']
    
 #r.slopeOneRecommendations(g)
复制代码
 def loadMovieLens(self, path=''):
     """Load the MovieLens files u.data, u.item and u.user found under path.

     ``path`` is prepended verbatim to each file name, so it must end
     with a path separator (for example ``'ml-100k/'``).

     Side effects:
         * ``self.data`` is replaced with ``user id -> {movie id: rating}``
           read from the tab-separated ``u.data``.
         * ``self.productid2name`` gains ``movie id -> title`` entries from
           the ``|``-separated ``u.item``.
         * ``self.userid2name`` / ``self.username2id`` map each user id to
           and from the complete ``u.user`` line (the line itself is used
           as the "name").
         * The total number of lines read is printed.

     Every file is opened in a ``with`` block so it is closed even if a
     line fails to parse.
     """
     self.data = {}
     i = 0  # total number of lines read across the three files

     # First load movie ratings into self.data.
     with codecs.open(path + "u.data", 'r', 'ascii') as f:
         for line in f:
             i += 1
             # separate line into fields
             fields = line.split('\t')
             user = fields[0]
             movie = fields[1]
             rating = int(fields[2].strip().strip('"'))
             self.data.setdefault(user, {})[movie] = rating

     # Now load movie titles into self.productid2name. The file u.item
     # contains movie id, title, release date among other fields.
     with codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore') as f:
         for line in f:
             i += 1
             # separate line into fields
             fields = line.split('|')
             mid = fields[0].strip()
             title = fields[1].strip()
             self.productid2name[mid] = title

     # Now load user info into both self.userid2name and self.username2id.
     with open(path + "u.user") as f:
         for line in f:
             i += 1
             fields = line.split('|')
             userid = fields[0].strip('"')
             self.userid2name[userid] = line
             self.username2id[line] = userid

     print(i)
    
 # test code
    
 #r = recommender(0)
    
 #r.loadMovieLens('ml-100k/')
    
 #r.computeDeviations()
    
 #r.slopeOneRecommendations(r.data['1'])
    
 #r.slopeOneRecommendations(r.data['25'])

4. 我的公众号

640?wx_fmt=png

全部评论 (0)

还没有任何评论哟~