推荐系统学习(一)--电影推荐系统搭建
1. 推薦系統的架構
本文采用的數據集來源于Netflix用戶電影評分數據,實現一個簡單的基于用戶的協同過濾推薦系統,其中采用皮爾遜系數衡量兩個用戶之間的相似度。
數據集地址
使用到的數據文件:
2. 數據的預處理
由于數據量過大,這里僅選擇原數據集中的1000個用戶及其評分數據進行推薦算法的簡單實現,否則在單機上難以運行(僅1000個用戶數據的處理時間已經達到了數十分鐘)。
首先選擇1000個用戶:
然后加載評分數據信息并分割訓練集和測試集:
# Load the rating data and split it into training and test sets.
def _load_and_split_data(self):
    """Return ``(train, test)`` rating dictionaries for the selected users.

    Both results map ``userID -> {movieID: rating}``. If the cache files
    ``data/train.json`` and ``data/test.json`` both exist they are loaded
    and returned as-is. Otherwise every file under ``self.file_path`` is
    parsed, ratings from users in ``self.some_users`` are kept, and each
    kept rating goes to the test set with probability 1/50 (otherwise to
    the training set). The result is then written to the two cache files.

    Expected file layout, as parsed below: a ``<movieID>:`` header line
    followed by ``<userID>,<rating>,<date>`` lines; further headers may
    appear later in the same file.

    Reads ``self.seed``, ``self.file_path`` and ``self.some_users``.
    """
    train_path = "data/train.json"
    test_path = "data/test.json"

    if os.path.exists(train_path) and os.path.exists(test_path):
        print("從文件中加載數據集")
        with open(train_path, encoding="utf-8") as fp:
            train = json.load(fp)
        with open(test_path, encoding="utf-8") as fp:
            test = json.load(fp)
        print("數據加載完成")
        return train, test

    train = {}
    test = {}
    # A set gives O(1) membership tests; this check runs once per rating line.
    selected = set(self.some_users)
    random.seed(self.seed)
    # os.listdir order is arbitrary; sorting keeps the seeded split reproducible.
    for file in sorted(os.listdir(self.file_path)):
        one_path = "{}/{}".format(self.file_path, file)
        print("{}".format(one_path))
        with open(one_path, "r", encoding="utf-8") as fp:
            # The first line of every file is taken to be a movie header.
            movieID = fp.readline().split(":")[0]
            print("movie ID:" + movieID)
            for line in fp:
                record = line.strip()
                if not record:
                    # Tolerate blank lines instead of failing on the unpack.
                    continue
                if record.endswith(":"):
                    movieID = record.split(":")[0]
                    print("movie ID:" + movieID)
                    continue
                userID, rate, _ = record.split(",")
                if userID not in selected:
                    continue
                # Roughly 2% of the kept ratings are held out for testing.
                bucket = test if random.randint(1, 50) == 1 else train
                bucket.setdefault(userID, {})[movieID] = int(rate)

    print("加載數據到 data/train.json data/test.json")
    os.makedirs("data", exist_ok=True)
    with open(train_path, "w", encoding="utf-8") as fp:
        json.dump(train, fp)
    with open(test_path, "w", encoding="utf-8") as fp:
        json.dump(test, fp)
    print("數據加載完成")
    return train, test
# Article heading that follows this snippet: "3. 計算用戶相似度" (3. Computing user similarity).
這里采用皮爾遜系數進行計算,采用其近似計算如下:
$$r'=\frac{\sum_{i=1}^{n}x_iy_i-\frac{\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{n}}{\sqrt{\sum_{i=1}^{n}x_i^2-\frac{\left(\sum_{i=1}^{n}x_i\right)^2}{n}}\;\sqrt{\sum_{i=1}^{n}y_i^2-\frac{\left(\sum_{i=1}^{n}y_i\right)^2}{n}}}$$
4. 基于協同過濾進行推薦
即采用KNN尋找用戶user的k個近鄰并進行排序,并選擇其中評分較高的n個電影,推薦給當前用戶即可。
def recommend(self, userID):
    """Rank movies for ``userID`` from the ratings of similar users.

    Every other user in ``self.train`` is scored against ``userID`` with
    ``self.pearson``. The ``self.k`` most similar users are kept, and each
    movie they rated accumulates ``similarity * rating``.

    Movies that ``userID`` has already rated in the training data are
    skipped, because recommending something the user has already rated is
    of no use. A user with no training ratings is treated as having an
    empty rating dictionary rather than raising ``KeyError``.

    Returns:
        A list of ``(movieID, score)`` tuples sorted by descending score.
    """
    target_ratings = self.train.get(userID, {})

    similarities = {
        other: self.pearson(target_ratings, ratings)
        for other, ratings in self.train.items()
        if other != userID
    }
    nearest = sorted(
        similarities.items(), key=lambda pair: pair[1], reverse=True
    )[:self.k]

    scores = {}
    for neighbor, sim in nearest:
        for movieID, rating in self.train[neighbor].items():
            if movieID in target_ratings:
                continue
            scores[movieID] = scores.get(movieID, 0) + sim * rating

    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
# Article heading that follows this snippet: "代碼附錄:" (Code appendix).
import json
import math
import os
import random


class FisrtRec:
    """Minimal user-based collaborative-filtering movie recommender.

    Ratings are read from text files made of ``<movieID>:`` header lines
    followed by ``<userID>,<rating>,<date>`` lines. A random subset of
    users is kept, their ratings are split into training and test sets
    (cached as JSON under ``data/``), user similarity is measured with the
    Pearson correlation coefficient, and recommendations are drawn from
    the ``k`` most similar users.

    Args:
        file_path: Directory containing the rating files.
        seed: Seed for user sampling and for the train/test split.
        k: Number of nearest neighbours used for a recommendation.
        n_items: Number of movies recommended per user during evaluation.
        n_users: Number of users to sample from the data set (default 1000).
    """

    TRAIN_CACHE = "data/train.json"
    TEST_CACHE = "data/test.json"

    def __init__(self, file_path, seed, k, n_items, n_users=1000):
        self.file_path = file_path
        self.seed = seed
        self.k = k
        self.n_items = n_items
        self.n_users = n_users
        self.some_users = self.__selectSomeUsers()
        self.train, self.test = self._load_and_split_data()

    def _cache_exists(self):
        """Return True when both cached split files are present."""
        return os.path.exists(self.TRAIN_CACHE) and os.path.exists(self.TEST_CACHE)

    def __selectSomeUsers(self):
        """Pick ``self.n_users`` user IDs at random from the rating files.

        Returns an empty list when the cached split already exists, since
        the selection is then not needed. If the data holds fewer users
        than requested, all of them are returned.
        """
        print("隨機選擇{}個用戶".format(self.n_users))
        if self._cache_exists():
            return list()

        users = set()
        for file in os.listdir(self.file_path):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r", encoding="utf-8") as fp:
                for line in fp:
                    record = line.strip()
                    # Skip blank lines and "<movieID>:" headers.
                    if not record or record.endswith(":"):
                        continue
                    users.add(record.split(",", 1)[0])

        # Seed before sampling and sample from a sorted list: set iteration
        # order of strings varies between interpreter runs, which would make
        # the selection irreproducible even with a fixed seed.
        random.seed(self.seed)
        pool = sorted(users)
        some_users = random.sample(pool, min(self.n_users, len(pool)))
        print(some_users)
        return some_users

    # Load the rating data and split it into training and test sets.
    def _load_and_split_data(self):
        """Return ``(train, test)`` dictionaries of ``userID -> {movieID: rating}``.

        Loads the cached JSON split when both cache files exist. Otherwise
        parses every file under ``self.file_path``, keeps ratings from users
        in ``self.some_users``, sends each kept rating to the test set with
        probability 1/50 (else to the training set), and writes both
        dictionaries to the cache files.
        """
        if self._cache_exists():
            print("從文件中加載數據集")
            with open(self.TRAIN_CACHE, encoding="utf-8") as fp:
                train = json.load(fp)
            with open(self.TEST_CACHE, encoding="utf-8") as fp:
                test = json.load(fp)
            print("數據加載完成")
            return train, test

        train = {}
        test = {}
        # A set gives O(1) membership tests; this runs once per rating line.
        selected = set(self.some_users)
        random.seed(self.seed)
        # os.listdir order is arbitrary; sorting keeps the split reproducible.
        for file in sorted(os.listdir(self.file_path)):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r", encoding="utf-8") as fp:
                # The first line of every file is taken to be a movie header.
                movieID = fp.readline().split(":")[0]
                print("movie ID:" + movieID)
                for line in fp:
                    record = line.strip()
                    if not record:
                        continue
                    if record.endswith(":"):
                        movieID = record.split(":")[0]
                        print("movie ID:" + movieID)
                        continue
                    userID, rate, _ = record.split(",")
                    if userID not in selected:
                        continue
                    bucket = test if random.randint(1, 50) == 1 else train
                    bucket.setdefault(userID, {})[movieID] = int(rate)

        print("加載數據到 data/train.json data/test.json")
        os.makedirs(os.path.dirname(self.TRAIN_CACHE), exist_ok=True)
        with open(self.TRAIN_CACHE, "w", encoding="utf-8") as fp:
            json.dump(train, fp)
        with open(self.TEST_CACHE, "w", encoding="utf-8") as fp:
            json.dump(test, fp)
        print("數據加載完成")
        return train, test

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users over co-rated movies.

        Uses the single-pass computational form of the coefficient. Returns
        0 when the users share no movie or when either user's shared
        ratings have zero variance (the coefficient is undefined there).
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        num = 0
        for key, x in rating1.items():
            if key not in rating2:
                continue
            y = rating2[key]
            num += 1
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x * x
            sum_y2 += y * y
        if num == 0:
            return 0

        # Clamp at zero: rounding error on float ratings could otherwise make
        # a variance term slightly negative and break math.sqrt.
        var_x = max(sum_x2 - sum_x * sum_x / num, 0)
        var_y = max(sum_y2 - sum_y * sum_y / num, 0)
        denominator = math.sqrt(var_x * var_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / num) / denominator

    def recommend(self, userID):
        """Rank movies for ``userID`` from the ratings of similar users.

        The ``self.k`` users most similar to ``userID`` (by ``pearson``) are
        kept, and each movie they rated accumulates ``similarity * rating``.
        Movies ``userID`` already rated in the training data are skipped. A
        user with no training ratings is treated as having rated nothing
        rather than raising ``KeyError``.

        Returns:
            A list of ``(movieID, score)`` tuples sorted by descending score.
        """
        target_ratings = self.train.get(userID, {})

        similarities = {
            other: self.pearson(target_ratings, ratings)
            for other, ratings in self.train.items()
            if other != userID
        }
        nearest = sorted(
            similarities.items(), key=lambda pair: pair[1], reverse=True
        )[:self.k]

        scores = {}
        for neighbor, sim in nearest:
            for movieID, rating in self.train[neighbor].items():
                if movieID in target_ratings:
                    continue
                scores[movieID] = scores.get(movieID, 0) + sim * rating

        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    def evaluate(self, num=100):
        """Return the mean precision of the top ``n_items`` recommendations.

        Up to ``num`` users are sampled from the test set with a fixed seed.
        A user's precision is the share of the ``self.n_items`` slots filled
        by a movie present in that user's held-out ratings. Returns 0.0 when
        there is nothing to evaluate.
        """
        print("評估準確率")
        # random.sample needs a sequence; a dict keys view is rejected on
        # Python 3.11+.
        candidates = list(self.test)
        if not candidates or self.n_items <= 0:
            return 0.0

        random.seed(10)
        precisions = []
        for userID in random.sample(candidates, min(num, len(candidates))):
            held_out = self.test[userID]
            top = self.recommend(userID)[:self.n_items]
            hit = sum(1 for movieID, _ in top if movieID in held_out)
            precisions.append(hit / self.n_items)

        if not precisions:
            return 0.0
        return sum(precisions) / len(precisions)


if __name__ == "__main__":
    file_path = "C:\\Users\\Mr.Throne\\Desktop\\推薦系統\\archive\\data"
    seed = 30
    k = 10
    n_items = 5
    f_rec = FisrtRec(file_path, seed, k, n_items)
    # A single user's ranking can be obtained with f_rec.recommend("968796").
    print("算法推薦準確率:{}".format(f_rec.evaluate()))
# Article heading that follows this listing: "總結" (Summary).
以上是生活随笔為你收集整理的推荐系统学习(一)--电影推荐系统搭建的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 告别花瓶:2015年智能电视路在何方?
- 下一篇: jquery动画 -- 1.加载指示器