# 方法1 (Method 1):
import jieba
f = "紅樓夢.txt"
sf = "停用詞.txt"

# Nicknames, forms of address and name+"道" tokens that are counted under a
# single canonical character name.
CHARACTER_ALIASES = {
    '鳳姐兒': '鳳姐',
    '鳳丫頭': '鳳姐',
    '二爺': '寶玉',
    '寶二爺': '寶玉',
    '顰兒': '黛玉',
    '林妹妹': '黛玉',
    '黛玉道': '黛玉',
    '寶丫頭': '寶釵',
    '老祖宗': '賈母',
    '襲人道': '襲人',
    '賈政道': '賈政',
    '璉二爺': '賈璉',
}


def parse_stop_words(text):
    """Return the set of stop words contained in *text*, one word per line.

    ``str.splitlines`` is used instead of slicing off the last character of
    every line, so a final line without a trailing newline keeps its last
    character.  A set makes the later membership tests constant-time.
    """
    return set(text.splitlines())


def count_words(tokens, stop_words):
    """Count how often each word occurs in *tokens*.

    Tokens that are stop words, or that are at most one character long
    (punctuation, single characters), are skipped.  Remaining tokens are
    mapped through ``CHARACTER_ALIASES`` before counting, so every alias is
    added to the total of its canonical name.

    Args:
        tokens: iterable of word strings (e.g. the list from ``jieba.lcut``).
        stop_words: container of words to ignore; the raw token is tested,
            not its canonical name.

    Returns:
        dict mapping word -> occurrence count, in first-seen order.
    """
    counts = {}
    for word in tokens:
        if len(word) <= 1 or word in stop_words:
            continue
        name = CHARACTER_ALIASES.get(word, word)
        counts[name] = counts.get(name, 0) + 1
    return counts


def rank_words(counts, min_count=40):
    """Return ``(word, count)`` pairs with ``count >= min_count``.

    Pairs are ordered by descending count; the sort is stable, so words with
    equal counts keep the order in which they appear in *counts*.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [(word, count) for word, count in ranked if count >= min_count]


def run_method1(text_path=f, stop_path=sf, out_path=r'result.csv'):
    """Segment the novel, count words, then print and save the frequent ones.

    Reads *text_path* and *stop_path* as UTF-8, and appends one
    ``word,count`` line per frequent word to *out_path* (UTF-8).

    NOTE: the output file is opened in append mode, as in the original
    script, so running the script repeatedly accumulates rows.
    """
    with open(text_path, 'r', encoding='utf-8') as novel:
        tokens = jieba.lcut(novel.read())
    with open(stop_path, 'r', encoding='utf-8') as stop_file:
        stop_words = parse_stop_words(stop_file.read())

    rows = rank_words(count_words(tokens, stop_words))

    with open(out_path, 'a', encoding='utf-8') as fo:
        for word, count in rows:
            print("{},{}".format(word, count))
            fo.write("{},{}\n".format(word, count))


if __name__ == "__main__":
    run_method1()
# 方法2 (Method 2):
import jieba
f = "紅樓夢.txt"
sf = "停用詞.txt"

# Alternate spellings of a character's name -> the name they are counted as.
NAME_MAP = {
    '鳳姐兒': '鳳姐',
    '鳳丫頭': '鳳姐',
    '二爺': '寶玉',
    '寶二爺': '寶玉',
    '顰兒': '黛玉',
    '林妹妹': '黛玉',
    '黛玉道': '黛玉',
    '寶丫頭': '寶釵',
    '老祖宗': '賈母',
    '襲人道': '襲人',
    '賈政道': '賈政',
    '璉二爺': '賈璉',
}


def tally_words(words, stop_words):
    """Build a word -> frequency dict from segmented *words*.

    Stop words and one-character tokens (punctuation and single characters)
    are ignored.  Every other token is looked up in ``NAME_MAP`` so that
    aliases are tallied under the canonical name.

    Args:
        words: iterable of token strings.
        stop_words: iterable of words to drop; it is converted to a set once
            so that each membership test is constant-time.

    Returns:
        dict of word -> count, in first-seen order.
    """
    blocked = set(stop_words)
    counts = {}
    for word in words:
        if word in blocked or len(word) == 1:
            continue
        key = NAME_MAP.get(word, word)
        counts[key] = counts.get(key, 0) + 1
    return counts


def frequent_rows(counts, threshold=40):
    """Return ``"word,count"`` strings for words seen at least *threshold* times.

    Rows are ordered by descending count.  Because the items are sorted,
    iteration stops at the first count below the threshold.
    """
    rows = []
    by_count = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for key, value in by_count:
        if value < threshold:
            break
        rows.append(key + ',' + str(value))  # value is an int
    return rows


def run_method2(text_path=f, stop_path=sf, out_path='result.csv'):
    """Segment the novel, tally words, then save and print the frequent ones.

    Both inputs are read as UTF-8 inside ``with`` blocks, so the files are
    closed automatically.  ``jieba.lcut`` returns the segmentation result as
    a list; ``str.splitlines`` yields the stop words without line endings.

    NOTE: the output is appended to *out_path* using the GBK encoding, as in
    the original script; re-running adds further rows to the same file.
    """
    with open(text_path, 'r', encoding='utf-8') as novel:
        words = jieba.lcut(novel.read())
    with open(stop_path, 'r', encoding='utf-8') as stop_file:
        stop_words = stop_file.read().splitlines()

    rows = frequent_rows(tally_words(words, stop_words))

    with open(out_path, 'a', encoding='gbk') as out:
        for row in rows:
            out.write(row + '\n')
            print(row)


if __name__ == "__main__":
    run_method2()