电影推荐_亲和性分析_规则提取(数据挖掘入门与实践-实验6)
生活随笔
收集整理的这篇文章主要介绍了
电影推荐_亲和性分析_规则提取(数据挖掘入门与实践-实验6)
，小编觉得挺不错的，现在分享给大家，供大家参考。
# --- Data loading ---
import os
import pandas as pd

# MovieLens 100k ratings file: tab-separated, no header row.
all_ratings = pd.read_csv("ml-100k/u.data", delimiter="\t", header=None,
                          names=["UserID", "MovieID", "Rating", "Datetime"])
# The raw Datetime column holds Unix timestamps (seconds); convert in place.
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit='s')
all_ratings = all_ratings[["UserID", "MovieID", "Rating", "Datetime"]]

# A rating above 3 counts as a "favorable" review.
all_ratings["Favorable"] = all_ratings["Rating"] > 3

# Training subset: users with UserID < 200.
ratings_200 = all_ratings[all_ratings["UserID"].isin(range(200))]

# Keep only favorable reviews, then collect each user's liked movies
# as a frozenset (hashable, fast subset tests during Apriori).
favorable_ratings = ratings_200[ratings_200["Favorable"]]
favorable_reviews_by_users = dict(
    (k, frozenset(v.values))
    for k, v in favorable_ratings.groupby("UserID")["MovieID"]
)

# Favorable-review count per movie (booleans sum as 0/1).
nums_favorable_by_movie = ratings_200[["MovieID", "Favorable"]].groupby("MovieID").sum()

# --- Apriori ---
# frequent_itemsets maps itemset length -> {frozenset(movie_ids): support count}
frequent_itemsets = {}
min_support = 50

# Length-1 frequent itemsets: movies with more than min_support favorable reviews.
frequent_itemsets[1] = dict(
    (frozenset((movie_id,)), row["Favorable"])
    for movie_id, row in nums_favorable_by_movie.iterrows()
    if row["Favorable"] > min_support
)

from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users,k_l_itemsets, min_support):counts=defaultdict(int)for user,reviews in favorable_reviews_by_users.items():for itemset in k_l_itemsets:if itemset.issubset(reviews):for other_reviewed_movie in reviews - itemset: current_superset = itemset | frozenset((other_reviewed_movie,)) counts[current_superset] += 1return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])#循環創建,運行Apriori
import sys

# Run Apriori: build frequent itemsets of increasing length until a pass
# yields nothing (or we hit the length-20 safety cap).
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        # print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        # print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()

# Length-1 itemsets cannot form rules (empty premise), so drop them.
del frequent_itemsets[1]
# --- Rule generation ---
# Every frequent itemset yields one candidate rule per member movie:
# premise = itemset minus that movie, conclusion = that movie.
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
candidate_rules[:5]
# --- Rule confidence on the training users ---
# For every rule, count the users whose liked set contains the premise,
# split by whether they also liked the conclusion.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# confidence = correct / (correct + incorrect) for each candidate rule.
rule_confidence = {
    candidate_rule: correct_counts[candidate_rule] /
    float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
    for candidate_rule in candidate_rules
}

from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

# Top five rules by confidence (movie ids only; titles come later).
for index in range(5):
    # print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    # print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    # print("")
# --- Movie titles: load ml-100k/u.item ---
# Pipe-separated, no header; mac-roman encoding matches the original file.
movie_name_data = pd.read_csv("ml-100k/u.item", delimiter="|", header=None,
                              encoding="mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release",
                           "IMDB", "<UNK>", "Action", "Adventure", "Animation",
                           "Children's", "Comedy", "Crime", "Documentary",
                           "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                           "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
                           "Western"]


def get_movie_name(movie_id):
    """Return the title for movie_id, looked up in movie_name_data."""
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title


# Re-show the top five rules, now with movie titles instead of ids.
for index in range(5):
    # print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    # print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
# print("")

# --- Evaluation on held-out users (UserID >= 200) ---
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict(
    (k, frozenset(v.values))
    for k, v in test_favorable.groupby("UserID")["MovieID"]
)

# Re-count rule outcomes, this time over the test users only.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

test_confidence = {
    candidate_rule: correct_counts[candidate_rule] /
    float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
    for candidate_rule in rule_confidence
}

# Compare train vs test confidence for the five most confident rules.
for index in range(5):
    # print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    # print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    # print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    # print("")
總結
以上是生活随笔为你收集整理的《电影推荐_亲和性分析_规则提取(数据挖掘入门与实践-实验6)》的全部内容，希望文章能够帮你解决所遇到的问题。
- 上一篇: Leetcode题库 119.杨辉三角(
- 下一篇: 机器学习 KNN算法_0_丐版_鸢尾花集