• Python推荐模块
    • 新的数据集
    • 项目实践

    Python推荐模块

    我将本章学到的内容都汇集成了一个Python类,虽然代码有些长,我还是贴在了这里:

    import codecs
    from math import sqrt

    # Sample data set: user name -> {band name -> rating}.
    users = {
        "Angelica": {
            "Blues Traveler": 3.5,
            "Broken Bells": 2.0,
            "Norah Jones": 4.5,
            "Phoenix": 5.0,
            "Slightly Stoopid": 1.5,
            "The Strokes": 2.5,
            "Vampire Weekend": 2.0,
        },
        "Bill": {
            "Blues Traveler": 2.0,
            "Broken Bells": 3.5,
            "Deadmau5": 4.0,
            "Phoenix": 2.0,
            "Slightly Stoopid": 3.5,
            "Vampire Weekend": 3.0,
        },
        "Chan": {
            "Blues Traveler": 5.0,
            "Broken Bells": 1.0,
            "Deadmau5": 1.0,
            "Norah Jones": 3.0,
            "Phoenix": 5,
            "Slightly Stoopid": 1.0,
        },
        "Dan": {
            "Blues Traveler": 3.0,
            "Broken Bells": 4.0,
            "Deadmau5": 4.5,
            "Phoenix": 3.0,
            "Slightly Stoopid": 4.5,
            "The Strokes": 4.0,
            "Vampire Weekend": 2.0,
        },
        "Hailey": {
            "Broken Bells": 4.0,
            "Deadmau5": 1.0,
            "Norah Jones": 4.0,
            "The Strokes": 4.0,
            "Vampire Weekend": 1.0,
        },
        "Jordyn": {
            "Broken Bells": 4.5,
            "Deadmau5": 4.0,
            "Norah Jones": 5.0,
            "Phoenix": 5.0,
            "Slightly Stoopid": 4.5,
            "The Strokes": 4.0,
            "Vampire Weekend": 4.0,
        },
        "Sam": {
            "Blues Traveler": 5.0,
            "Broken Bells": 2.0,
            "Norah Jones": 3.0,
            "Phoenix": 5.0,
            "Slightly Stoopid": 4.0,
            "The Strokes": 5.0,
        },
        "Veronica": {
            "Blues Traveler": 3.0,
            "Norah Jones": 5.0,
            "Phoenix": 4.0,
            "Slightly Stoopid": 2.5,
            "The Strokes": 3.0,
        },
    }
    class recommender:
        """User-based k-nearest-neighbour recommender.

        Ratings live in ``self.data`` as ``{user: {item: rating}}``. The
        optional lookup tables ``userid2name``, ``username2id`` and
        ``productid2name`` are only populated by :meth:`loadBookDB`.
        """

        def __init__(self, data, k=1, metric='pearson', n=5):
            """Initialise the recommender.

            data    training data; kept only when it is a dict, otherwise
                    ignored (ratings can be loaded later with loadBookDB)
            k       number of nearest neighbours to combine
            metric  name of the similarity function; only 'pearson' is
                    wired up here -- for any other value ``self.fn`` is left
                    unset and must be assigned before calling recommend()
            n       maximum number of recommendations to return
            """
            self.k = k
            self.n = n
            self.username2id = {}
            self.userid2name = {}
            self.productid2name = {}
            # Remember which similarity function was requested.
            self.metric = metric
            if self.metric == 'pearson':
                self.fn = self.pearson
            # Always define self.data so later calls fail with a clear
            # KeyError instead of an AttributeError on a missing attribute.
            self.data = data if isinstance(data, dict) else {}

        def convertProductID2name(self, id):
            """Return the display name for a product id, or the id itself
            when no name is known."""
            return self.productid2name.get(id, id)

        def userRatings(self, id, n):
            """Print the n highest-rated items of user ``id``.

            Raises KeyError when ``id`` is not present in ``self.data``.
            """
            # Fall back to the raw id: userid2name is empty unless
            # loadBookDB() has been called.
            print("Ratings for " + self.userid2name.get(id, str(id)))
            ratings = self.data[id]
            print(len(ratings))
            ranked = [(self.convertProductID2name(item), score)
                      for item, score in ratings.items()]
            # Highest rating first.
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            for name, score in ranked[:n]:
                # %g keeps fractional ratings (4.5) that %i would truncate,
                # while whole-number ratings still print without decimals.
                print("%s\t%g" % (name, score))

        def loadBookDB(self, path=''):
            """Load the Book-Crossing CSV files found under ``path``.

            ``path`` is prepended verbatim to the file names, so it must end
            with a path separator. Replaces ``self.data`` and fills the
            lookup tables, then prints the total number of lines read.
            """
            self.data = {}
            i = 0
            #
            # Ratings: user id; ISBN; rating  ->  self.data
            #
            with codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    user = fields[0].strip('"')
                    book = fields[1].strip('"')
                    rating = int(fields[2].strip().strip('"'))
                    self.data.setdefault(user, {})[book] = rating
            #
            # Books: ISBN; title; author; ...  ->  self.productid2name
            #
            with codecs.open(path + "BX-Books.csv", 'r', 'utf8') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    isbn = fields[0].strip('"')
                    title = fields[1].strip('"')
                    author = fields[2].strip().strip('"')
                    self.productid2name[isbn] = title + ' by ' + author
            #
            # Users: id; location; age  ->  self.userid2name / username2id
            #
            with codecs.open(path + "BX-Users.csv", 'r', 'utf8') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    userid = fields[0].strip('"')
                    location = fields[1].strip('"')
                    # NOTE(review): the age is only read when a line has MORE
                    # than three fields; a plain three-field line is treated
                    # as having no age. Possibly an off-by-one -- confirm
                    # against the data files before changing.
                    if len(fields) > 3:
                        age = fields[2].strip().strip('"')
                    else:
                        age = 'NULL'
                    if age != 'NULL':
                        value = location + ' (age: ' + age + ')'
                    else:
                        value = location
                    self.userid2name[userid] = value
                    self.username2id[location] = userid
            print(i)

        def pearson(self, rating1, rating2):
            """Return the Pearson correlation of two rating dicts, computed
            over the keys they share.

            Returns 0 when there are no shared keys or when either side has
            zero variance over the shared keys.
            """
            sum_xy = 0
            sum_x = 0
            sum_y = 0
            sum_x2 = 0
            sum_y2 = 0
            n = 0
            for key in rating1:
                if key in rating2:
                    n += 1
                    x = rating1[key]
                    y = rating2[key]
                    sum_xy += x * y
                    sum_x += x
                    sum_y += y
                    sum_x2 += pow(x, 2)
                    sum_y2 += pow(y, 2)
            if n == 0:
                return 0
            # Rounding can push a zero variance slightly below zero, which
            # would make sqrt() raise; clamp at zero.
            spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
            spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
            denominator = sqrt(spread_x) * sqrt(spread_y)
            if denominator == 0:
                return 0
            return (sum_xy - (sum_x * sum_y) / n) / denominator

        def computeNearestNeighbor(self, username):
            """Return ``(user, score)`` for every other user, ordered by
            descending ``self.fn`` score (most similar first)."""
            own = self.data[username]
            distances = [(other, self.fn(own, self.data[other]))
                         for other in self.data if other != username]
            distances.sort(key=lambda pair: pair[1], reverse=True)
            return distances

        def recommend(self, user):
            """Return up to ``self.n`` ``(item, score)`` recommendations.

            The ratings of the ``self.k`` nearest neighbours are combined,
            each weighted by its share of the neighbours' total score; items
            the user already rated are skipped. Returns an empty list when
            there are no neighbours or their scores sum to zero.
            """
            nearest = self.computeNearestNeighbor(user)
            userRatings = self.data[user]
            # Slicing tolerates k larger than the number of other users;
            # max() keeps a negative k from selecting from the tail.
            neighbors = nearest[:max(self.k, 0)]
            totalDistance = 0.0
            for _, score in neighbors:
                totalDistance += score
            if totalDistance == 0:
                # No usable similarity signal; the weights are undefined.
                return []
            recommendations = {}
            for name, score in neighbors:
                # This neighbour's slice of the pie.
                weight = score / totalDistance
                for artist, rating in self.data[name].items():
                    if artist in userRatings:
                        continue
                    if artist in recommendations:
                        recommendations[artist] += rating * weight
                    else:
                        recommendations[artist] = rating * weight
            ranked = [(self.convertProductID2name(item), score)
                      for item, score in recommendations.items()]
            # Best score first, then keep the top n.
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            return ranked[:self.n]

    运行示例

    首先构建一个推荐类,然后获取推荐结果:

    1. >>> r = recommender(users)
    2. >>> r.recommend('Jordyn')
    3. [('Blues Traveler', 5.0)]
    4. >>> r.recommend('Hailey')
    5. [('Phoenix', 5.0), ('Slightly Stoopid', 4.5)]

    新的数据集

    现在让我们使用一个更为真实的数据集。Cai-Nicolas Ziegler从图书漂流站(BookCrossing)收集了超过100万条评价数据——278,858位用户为271,379本书打了分。

    这份数据(匿名)可以从这个地址获得,有SQL和CSV两种格式。由于特殊符号的关系,这些数据无法直接加载到Python里。

    我做了一些清洗,可以从这里下载。

    CSV文件包含了三张表:

    • 用户表,包括用户ID、位置、年龄等信息。其中用户的姓名已经隐去;
    • 书籍表,包括ISBN号、标题、作者、出版日期、出版社等;
    • 评分表,包括用户ID、书籍ISBN号、以及评分(0-10分)。

    上文Python代码中的loadBookDB方法可以加载这些数据,用法如下:

    1. >>> r.loadBookDB('/Users/raz/Downloads/BX-Dump/')
    2. 1700018
    3. >>> r.recommend('171118')

    注意:由于数据集比较大,加载和查询大约需要几十秒的时间。

    项目实践

    只有运行调试过书中的代码后才能真正掌握这些方法,以下是一些实践建议:

    1. 实现一个计算曼哈顿距离和欧几里得距离的方法;
    2. 本书的网站上有一个包含25部电影评价的数据集,实现一个推荐算法。