【知识发现】天池平台新浪微博互动预测-ItemCF推荐方法
生活随笔
收集整理的這篇文章主要介紹了
【知识发现】天池平台新浪微博互动预测-ItemCF推荐方法
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1、比賽內容:
https://tianchi.aliyun.com/getStart/introduction.htm?spm=5176.100066.0.0.62a0c916DZRdDr&raceId=231574
2、解題思路:將轉發、評論、贊次數作為物品推薦給博文(預先用結巴分詞抽取關鍵詞作為博文標簽)
3、參考代碼:
1)主程序
# -*- coding: utf-8 -*-
'''
Created on 2017年10月31日

Tianchi Sina Weibo interaction prediction, ItemCF-based.
Pipeline: load the data, extract features (hour-of-day bucket 'tid' and
jieba keyword tags 'fid' per post), train one ItemCF model per
interaction type (forward / comment / like), evaluate on a held-out
split, then predict.

@author: Jason.F
'''
import time

import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split
import jieba
import jieba.analyse

from ItemCF import ItemCF


class cWeibo:

    # Part-of-speech tags kept by the jieba keyword extractor.
    _ALLOW_POS = ('n', 'nr', 'ns', 'nt', 'nz', 'a', 'ad', 'an',
                  'f', 's', 'i', 't', 'v', 'vd', 'vn')

    def __init__(self, path):
        # Directory holding weibo_train_data.txt / weibo_predict_data.txt.
        self.path = path

    def importData(self):
        """Load the labelled sample (kept whole for training, 20% held out
        for offline evaluation) and the unlabelled prediction set."""
        path = self.path
        # 導入樣本集 (labelled sample)
        data = pd.read_csv(path + '\\weibo_train_data.txt', encoding='utf8', sep='\t',
                           names=['luid', 'mid', 'time', 'fcs', 'ccs', 'lcs', 'cont'])
        data['fcs'] = data['fcs'].astype('int')  # forwards one week after posting, weight 0.5
        data['ccs'] = data['ccs'].astype('int')  # comments one week after posting, weight 0.25
        data['lcs'] = data['lcs'].astype('int')  # likes one week after posting, weight 0.25
        train, test = train_test_split(data, test_size=0.2)
        self.traindata = pd.DataFrame(data)  # train on the FULL sample
        self.testdata = pd.DataFrame(test)   # 20% held-out rows for evaluation
        print('訓練集,有:', self.traindata.shape[0], '行', self.traindata.shape[1], '列')
        print('測試集,有:', self.testdata.shape[0], '行', self.testdata.shape[1], '列')
        # 導入預測集 (prediction set: no interaction counts yet)
        data = pd.read_csv(path + '\\weibo_predict_data.txt', encoding='utf8', sep='\t',
                           names=['luid', 'mid', 'time', 'cont'])
        self.predata = data
        print('預測集,有:', self.predata.shape[0], '行', self.predata.shape[1], '列')

    @staticmethod
    def _hour_bucket(df):
        """Replace 'time' with its hour of day (0-23), renamed to 'tid'.
        Special holidays could be handled here later if needed."""
        df['time'] = df.apply(
            lambda x: time.strptime(x['time'], "%Y-%m-%d %H:%M:%S").tm_hour, axis=1)
        df.rename(columns=lambda c: c.replace('time', 'tid'), inplace=True)
        return df

    @classmethod
    def _explode_tags(cls, df):
        """Extract up to 50 jieba keywords from 'cont' and explode the frame
        to one row per (post, tag) pair; 'cont' is replaced by 'tag'.
        Words with special meaning (@mentions, red packets) are not yet
        handled specially."""
        df['cont'] = df.apply(
            lambda x: ",".join(jieba.analyse.extract_tags(
                x['cont'], topK=50, allowPOS=cls._ALLOW_POS)), axis=1)
        return df.drop('cont', axis=1).join(
            df['cont'].str.split(',', expand=True).stack()
            .reset_index(level=1, drop=True).rename('tag'))

    @staticmethod
    def _attach_fid(df, df_ft, cols):
        """Left-join the numeric tag id 'fid', keep `cols`, drop rows whose
        tag has no id."""
        df = pd.merge(df, df_ft, on=['tag'], how='left')
        df = df[cols].dropna(axis=0, how='any')
        df['fid'] = df['fid'].astype('int')
        return df

    def ETL(self):
        """Feature extraction: hour bucket, keyword tags, tag-id mapping."""
        # time -> 0-23 hour bucket; cont -> one tag per row.
        self.traindata = self._explode_tags(self._hour_bucket(self.traindata))
        self.testdata = self._explode_tags(self._hour_bucket(self.testdata))
        self.predata = self._explode_tags(self._hour_bucket(self.predata))
        # 生成標簽表: number every distinct tag.
        # BUG FIX: the original used symmetric_difference(), which DROPS any
        # tag occurring in BOTH the train and predict sets; the stated intent
        # ("取并集并去重") requires union(). sorted() makes the fid numbering
        # deterministic across runs.
        ft = sorted(set(self.traindata['tag']).union(self.predata['tag']))
        df_ft = pd.DataFrame(ft, columns=['tag'])
        df_ft['fid'] = df_ft.index
        self.traindata = self._attach_fid(
            self.traindata, df_ft, ['luid', 'mid', 'tid', 'fid', 'fcs', 'ccs', 'lcs'])
        self.testdata = self._attach_fid(
            self.testdata, df_ft, ['luid', 'mid', 'tid', 'fid', 'fcs', 'ccs', 'lcs'])
        self.predata = self._attach_fid(
            self.predata, df_ft, ['luid', 'mid', 'tid', 'fid'])

    @staticmethod
    def _fit(frame):
        """Build and train one ItemCF model on (fid, count, tid) triples."""
        model = ItemCF(frame)
        model.user_item()       # 轉化成dict
        model.ItemSimilarity()  # 生成物品相似度矩陣
        return model

    @staticmethod
    def _top1(model, fid):
        """Highest-scored recommendation for tag `fid`, or 0 when the model
        has nothing to suggest.  recommend() returns a score-sorted dict,
        so the first key is the winner.  The original code called
        recommend() twice per row (once for len(), once for the value);
        here it is called exactly once."""
        rec = model.recommend(fid)
        return next(iter(rec)) if rec else 0

    def callItemCF(self):
        """Train one ItemCF model per interaction type, then score the
        held-out test set with the competition precision metric."""
        data = self.traindata
        # 訓練轉發/評論/點贊三個推薦模型
        self.ic_f = self._fit(data[['fid', 'fcs', 'tid']])
        self.ic_c = self._fit(data[['fid', 'ccs', 'tid']])
        self.ic_l = self._fit(data[['fid', 'lcs', 'tid']])
        test = self.testdata
        # .copy() so the predicted columns are not written into a view.
        test_f = test[['mid', 'fid', 'tid', 'fcs']].copy()
        test_c = test[['mid', 'fid', 'tid', 'ccs']].copy()
        test_l = test[['mid', 'fid', 'tid', 'lcs']].copy()
        test_f['pfcs'] = test_f.apply(lambda x: self._top1(self.ic_f, x['fid']), axis=1)
        test_c['pccs'] = test_c.apply(lambda x: self._top1(self.ic_c, x['fid']), axis=1)
        test_l['plcs'] = test_l.apply(lambda x: self._top1(self.ic_l, x['fid']), axis=1)
        # 計算準確率
        precision = self.precision(test_f[['mid', 'fcs', 'pfcs']],
                                   test_c[['mid', 'ccs', 'pccs']],
                                   test_l[['mid', 'lcs', 'plcs']])
        print(precision)

    def precision(self, test_f, test_c, test_l):
        """Competition metric: per-post weighted relative deviation of the
        three predicted counts (forward 0.5, comment 0.25, like 0.25),
        aggregated with a per-post interaction weight capped at 100."""
        # Average per post (mid), then relative deviation per type.
        test_f = test_f.groupby('mid').mean()
        test_c = test_c.groupby('mid').mean()
        test_l = test_l.groupby('mid').mean()
        test_f['dev_f'] = test_f.apply(
            lambda x: abs(x['fcs'] - int(x['pfcs'])) / (x['fcs'] + 5), axis=1)
        test_c['dev_c'] = test_c.apply(
            lambda x: abs(x['ccs'] - int(x['pccs'])) / (x['ccs'] + 3), axis=1)
        test_l['dev_l'] = test_l.apply(
            lambda x: abs(x['lcs'] - int(x['plcs'])) / (x['lcs'] + 3), axis=1)
        test = test_f
        test = pd.merge(test, test_c, left_index=True, right_index=True)  # join on mid index
        test = pd.merge(test, test_l, left_index=True, right_index=True)
        test['prec'] = 1 - 0.5 * test['dev_f'] - 0.25 * test['dev_c'] - 0.25 * test['dev_l']
        # Interaction count, capped at 100 per the competition rules.
        test['count'] = test.apply(
            lambda x: min(x['pfcs'] + x['pccs'] + x['plcs'], 100), axis=1)
        # A post counts as a hit when its precision exceeds 0.8.
        test['sgn'] = (test['prec'] > 0.8).astype(int)
        on = ((test['count'] + 1) * test['sgn']).sum()
        down = (test['count'] + 1).sum()
        return on / down

    def predict(self):
        '''
        文件命名:weibo_result_data.txt
        uid、mid、forward_count字段以tab鍵分隔
        forward_count、comment_count、like_count字段間以逗號分隔
        '''
        # .copy() avoids writing the predictions into a view of self.predata.
        predata = self.predata[['luid', 'mid', 'tid', 'fid']].copy()
        predata['pfcs'] = predata.apply(lambda x: self._top1(self.ic_f, x['fid']), axis=1)
        predata['pccs'] = predata.apply(lambda x: self._top1(self.ic_c, x['fid']), axis=1)
        predata['plcs'] = predata.apply(lambda x: self._top1(self.ic_l, x['fid']), axis=1)
        print(predata)


if __name__ == "__main__":
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
    start = time.perf_counter()
    wb = cWeibo('D:\\CVTE\\weibo')
    wb.importData()   # 導入數據
    wb.ETL()          # 特征抽取
    wb.callItemCF()   # 推薦算法
    wb.predict()      # 預測
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))

# 2)基于時間上下文的ItemCF推薦算法:time設置為1,可以自己處理后傳入

# -*- coding: utf-8 -*-
'''
Created on 2017年10月31日
基于時間上下文的物品協同過濾推薦算法
(time-context-aware item-based collaborative filtering)
@author: Administrator
'''
import math
import time

import numpy as np


class ItemCF:

    def __init__(self, data):
        # data: (user, item, time-bucket) triples. In the main program the
        # "user" is a tag fid, the "item" is an interaction count and the
        # time bucket is the hour of day 'tid'.
        self.data = data

    def user_item(self):
        """Convert the raw triples into a {user: {item: 1}} implicit-feedback
        dict stored as self.traindata."""
        self.traindata = {}
        # Loop variable renamed from 'time' so the time module is not shadowed.
        for user, item, ts in np.array(self.data).tolist():
            self.traindata.setdefault(user, {})
            # Implicit feedback only; ts could be stored here instead of 1
            # to enable a real time decay in ItemSimilarity.
            self.traindata[user][item] = 1

    def ItemSimilarity(self, alpha=1):
        """Build the item-item similarity matrix self.itemSim.

        C[i][j] accumulates the co-occurrence weight 1/(1+alpha*|tui-tuj|)
        (time-decayed co-rating); W[i][j] = C[i][j]/sqrt(N[i]*N[j]) is the
        popularity-normalised, cosine-style similarity.
        """
        train = self.traindata
        C = dict()  # co-rated weight between items
        N = dict()  # item popularity
        for u, items in train.items():
            for i, tui in items.items():
                N.setdefault(i, 0)
                N[i] += 1
                for j, tuj in items.items():
                    C.setdefault(i, {})
                    if i == j:
                        continue
                    C[i].setdefault(j, 0)
                    C[i][j] += 1 / (1 + alpha * abs(tui - tuj))
        # Normalise into the final similarity matrix W.
        W = dict()
        for i, related_items in C.items():
            W.setdefault(i, {})
            for j, cij in related_items.items():
                W[i][j] = cij / math.sqrt(N[i] * N[j] * 1.0)
        self.itemSim = W

    def recommend(self, user, tid=1, k=1, beta=1):
        """Return the top-`k` unseen items for `user` as a {item: score}
        dict sorted by descending score.  Scores decay with the distance
        between the query time bucket `tid` and the interaction time,
        controlled by `beta`."""
        W = self.itemSim
        train = self.traindata
        rank = dict()
        ru = train.get(user, {})
        for i, tui in ru.items():
            for j, wj in W[i].items():
                if j in ru:  # skip items the user already interacted with
                    continue
                rank.setdefault(j, 0)
                rank[j] += wj / (1 + beta * abs(tid - tui))
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:k])


if __name__ == "__main__":
    start = time.perf_counter()
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
預測準確率比較低,主要是:
1)博文標簽通過結巴分詞標簽化后,博文之間的關聯性降低了,考慮用textrank或lsa方法;
2)對時間元素如何在推薦算法中體現需要進一步考慮;
3)用戶在這里沒有參與模型,這個缺失比較嚴重。
重點是研發基于時間上下文的ItemCF算法,其他思路待考慮。
總結
以上是生活随笔為你收集整理的【知识发现】天池平台新浪微博互动预测-ItemCF推荐方法的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【数据平台】pandas将一列中的文本拆
- 下一篇: 【数据平台】python语言NLP库Ge