# Build the inverted index `result`: keyword -> list of row positions in
# `file_txt` whose 'cut_review' text contains that keyword.
# 'cut_review' holds space-separated tokens, so a plain split() recovers them.
result = {}
for left, cut_review in enumerate(file_txt['cut_review']):
    # `left` is the 0-based row position, i.e. what `file_txt.iloc[left]` selects.
    for right in cut_review.split():
        # A token repeated inside one question records that row once per
        # occurrence; duplicates are removed later when candidates are collected.
        result.setdefault(right, []).append(left)
# Preview only: build the same keyword -> row-position index for the rows
# returned by `file_txt.head()` and print it for inspection.
# A separate name is used so this small preview does not overwrite the full
# index stored in `result`, which the lookup steps further down rely on.
result_head = {}
for left, cut_review in enumerate(file_txt.head()['cut_review']):
    for right in cut_review.split():
        result_head.setdefault(right, []).append(left)
print(result_head)
# The user's question used for this walkthrough.
sentence = "中性點接地方式有哪些"

# Strip punctuation from the question.
clean_reviewyonghu = remove_punctuation(sentence)

# Tokenise with jieba, then drop stopwords and single-character tokens.
cut_reviewyonghu = [
    w for w in jieba.cut(clean_reviewyonghu)
    if w not in stopwords and len(w) > 1
]
# Original author's note on the expected tokens: ['中性點', '接地', '方式']

# Collect every row position that shares at least one keyword with the question.
Problem_Id = []
for j in cut_reviewyonghu:
    # Keywords absent from the inverted index contribute nothing.
    Problem_Id.extend(result.get(j, []))

# De-duplicated candidate positions.
# NOTE: this name shadows the builtin `id`; it is kept because the later
# similarity step reads `id`.
id = list(set(Problem_Id))
print(id)
# Cosine-similarity helper.
def cosine_similarity(sentence1: str, sentence2: str) -> float:
    """Return a similarity score for two sentences based on word counts.

    Both sentences are tokenised with jieba and tokens found in the
    module-level ``stopwords`` collection are discarded. Each sentence is
    turned into a count vector over the union of the remaining tokens, and the
    cosine of the two vectors is rescaled as ``0.5 + 0.5 * cos``.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :return: the rescaled similarity; ``0.5`` when either sentence has no
        tokens left after stopword removal (the cosine is treated as 0
        instead of dividing by zero)
    """
    seg1 = [word for word in jieba.cut(sentence1) if word not in stopwords]
    seg2 = [word for word in jieba.cut(sentence2) if word not in stopwords]

    # Shared vocabulary; its order is irrelevant to the dot product and norms.
    word_list = list(set(seg1 + seg2))

    # Occurrences of each vocabulary word in sentence 1 and in sentence 2.
    vec_1 = np.array([seg1.count(word) for word in word_list])
    vec_2 = np.array([seg2.count(word) for word in word_list])

    # Cosine formula.
    num = vec_1.dot(vec_2)
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    # A zero norm means one sentence is empty after filtering; without this
    # guard the division would yield NaN and poison the later ranking.
    cos = num / denom if denom else 0.0
    sim = 0.5 + 0.5 * cos
    return float(sim)


# The question asked by the user.
str1 = sentence
# Score every candidate row against the user's question.
# `similarity` maps row position -> similarity score.
similarity = {}
if not id:
    # No stored question shares a keyword with the user's question.
    # NOTE(review): the message below contains what looks like transliteration
    # residue ("(shù)", "(jù)"); left unchanged here - confirm the intended text.
    print('數(shù)據(jù)庫里沒有該問題,請重新提問')
else:
    for i in id:
        str2 = file_txt.iloc[i]['問題']
        sim1 = cosine_similarity(str1, str2)  # cosine-based similarity
        print('用戶所提問題和問題{0}的相似度是{1}'.format(i, sim1))
        similarity[i] = sim1
print(similarity)
第八步:給出答案。將第七步得到的 similarity 字典按相似度降序排序,輸出相似度最高的 2 個問題的答案。
# Rank candidates by similarity (highest first) and keep the top two.
jieguo = sorted(similarity.items(), key=lambda d: d[1], reverse=True)[:2]
print(jieguo)
print('用戶所提的問題是:', sentence)
for i, _score in jieguo:
    # `i` is the row position of the matched question; print its stored answer.
    # NOTE(review): the format string contains what looks like transliteration
    # residue ("(shù)", "(jù)"); left unchanged here - confirm the intended text.
    print('數(shù)據(jù)庫相似的問題是{0} 答案是{1}'.format(i, file_txt.iloc[i]['答案']))
答案如下:可以發現,問題33的答案正是我們要找的答案。
完美!整理后的總代碼如下:
import jieba
import numpy as np
import pandas as pd
import re

# Matches every character that is NOT an ASCII letter, a digit, or a code
# point in the range U+4E00-U+9FA5. Compiled once instead of on every call.
_NON_WORD_RE = re.compile(u"[^a-zA-Z0-9\u4E00-\u9FA5]")


def remove_punctuation(line):
    """Remove every character except ASCII letters, digits and U+4E00-U+9FA5.

    :param line: any value; it is converted with ``str()`` first
    :return: the filtered string, or ``''`` when the input is blank
    """
    line = str(line)
    if line.strip() == '':
        return ''
    return _NON_WORD_RE.sub('', line)


def stopwordslist(filepath):
    """Load stopwords from a GBK-encoded text file, one entry per line.

    :param filepath: path of the stopword file
    :return: list with one whitespace-stripped string per line of the file
    """
    # The context manager closes the handle even if decoding fails.
    with open(filepath, 'r', encoding='gbk') as handle:
        return [line.strip() for line in handle]


def cosine_similarity(sentence1: str, sentence2: str, stopwords) -> float:
    """Return a similarity score for two sentences based on word counts.

    Both sentences are tokenised with jieba and tokens contained in
    ``stopwords`` are discarded. Each sentence becomes a count vector over the
    union of the remaining tokens; the cosine of the two vectors is rescaled
    as ``0.5 + 0.5 * cos``.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :param stopwords: container of tokens to ignore (tested with ``in``)
    :return: the rescaled similarity; ``0.5`` when either sentence has no
        tokens left after filtering (the cosine is treated as 0 instead of
        dividing by zero)
    """
    seg1 = [word for word in jieba.cut(sentence1) if word not in stopwords]
    seg2 = [word for word in jieba.cut(sentence2) if word not in stopwords]

    # Shared vocabulary; its order is irrelevant to the dot product and norms.
    word_list = list(set(seg1 + seg2))

    # Occurrences of each vocabulary word in sentence 1 and in sentence 2.
    vec_1 = np.array([seg1.count(word) for word in word_list])
    vec_2 = np.array([seg2.count(word) for word in word_list])

    # Cosine formula.
    num = vec_1.dot(vec_2)
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    # A zero norm would make the division produce NaN and break the ranking.
    cos = num / denom if denom else 0.0
    sim = 0.5 + 0.5 * cos
    return float(sim)


def _keywords(text, stopwords):
    """Tokenise ``text`` with jieba, dropping stopwords and one-character tokens."""
    return [w for w in jieba.cut(text) if w not in stopwords and len(w) > 1]


def _build_inverted_index(cut_reviews):
    """Map each token to the positions of the space-joined entries containing it."""
    index = {}
    for position, cut_review in enumerate(cut_reviews):
        for token in cut_review.split():
            index.setdefault(token, []).append(position)
    return index


def main():
    """Answer one question typed by the user from the CSV question/answer table.

    Reads the table and the stopword list, builds a keyword -> row-position
    inverted index over the stored questions, prompts for a question, scores
    every stored question that shares a keyword with it, and prints the
    answers of the two highest-scoring rows.
    """
    # Load the data.
    # NOTE(review): this file name contains what looks like transliteration
    # residue ("(diào)"); left unchanged - confirm the real name on disk.
    csv = '電力調(diào)度問答.csv'
    file_txt = pd.read_csv(csv, header=0, encoding='gbk')  # author's note: 205 rows x 2 columns
    file_txt = file_txt.dropna()  # drop rows with missing values

    # Load stopwords; a set gives constant-time membership tests.
    stopwords = set(stopwordslist("停用詞.txt"))

    # Strip punctuation, then store each question as space-joined keywords.
    file_txt['clean_review'] = file_txt['問題'].apply(remove_punctuation)
    file_txt['cut_review'] = file_txt['clean_review'].apply(
        lambda x: " ".join(_keywords(x, stopwords))
    )

    # Inverted index over all stored questions. Positions are 0-based row
    # positions, matching `file_txt.iloc`.
    result = _build_inverted_index(file_txt['cut_review'])

    # The user's question, reduced to keywords the same way.
    sentence = input('請輸入問題:')
    clean_reviewyonghu = remove_punctuation(sentence)
    cut_reviewyonghu = _keywords(clean_reviewyonghu, stopwords)

    # Row positions sharing at least one keyword with the question.
    # Sorted so that equal scores are ranked in a reproducible order.
    matched = set()
    for keyword in cut_reviewyonghu:
        matched.update(result.get(keyword, []))
    candidate_ids = sorted(matched)

    # Score each candidate against the raw user question.
    similarity = {}
    if not candidate_ids:
        # NOTE(review): message contains apparent transliteration residue
        # ("(shù)", "(jù)"); left unchanged - confirm the intended text.
        print('數(shù)據(jù)庫里沒有該問題,請重新提問')
    else:
        for i in candidate_ids:
            str2 = file_txt.iloc[i]['問題']
            similarity[i] = cosine_similarity(sentence, str2, stopwords)

    # Print the answers of the two most similar stored questions.
    jieguo = sorted(similarity.items(), key=lambda d: d[1], reverse=True)[:2]
    print(jieguo)
    print('用戶所提的問題是:', sentence)
    for i, _score in jieguo:
        print('數(shù)據(jù)庫相似的問題是{0} 答案是{1}'.format(i, file_txt.iloc[i]['答案']))


if __name__ == '__main__':
    main()