Python 计算 idf
#-*- encoding:utf-8 -*-
import jieba
import jieba.analyse
import json
import codecs
import math
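'''
Compute the idf file.
Steps for computing idf:
1. Segment every document and remove stop words; collect the results in a list that holds one set of words per document.
2. Get the number of documents n and build the set of all words.
3. For each word w compute: idf = log(n / docs(w, D))
'''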
def loadData(path):
    """Load a UTF-8 encoded JSON file and return the parsed data.

    :param path: path of the JSON file to read.
    :return: whatever ``json.loads`` produces for the file content.
    """
    # The context manager closes the handle even when reading or parsing
    # raises; the previous version never closed the file.
    with codecs.open(path, 'r', 'utf8') as f:
        json_str = f.read()
    return json.loads(json_str)
def seg(content, stopwords):
    """Segment ``content`` with jieba and drop the stop words.

    :param content: text to segment; passed to ``jieba.cut`` with
        ``cut_all=True``.
    :param stopwords: iterable of words to exclude from the result.
    :return: set of UTF-8 encoded tokens that are not in ``stopwords``.
    """
    tokens = set()
    for word in jieba.cut(content, cut_all=True):
        # Pay special attention to this conversion: every token is encoded
        # to UTF-8 before the comparison (presumably so it matches the
        # representation of the stop-word entries - confirm with the caller).
        tokens.add(word.encode('utf8'))
    return tokens.difference(stopwords)
def docs(w, D):
    """Return the number of documents in ``D`` that contain ``w``.

    :param w: the word to look for.
    :param D: iterable of documents, each supporting the ``in`` operator.
    :return: count of documents for which ``w in document`` is true.
    """
    return sum(1 for document in D if w in document)
def save(idf_dict, path):
    """Write the idf table to ``path``, one ``word idf`` pair per line.

    Any previous content of the file is discarded.

    :param idf_dict: mapping of word to idf value.
    :param path: path of the output file.
    """
    # Mode "w" truncates on open, replacing the former ``file(path, "a+")``
    # plus ``truncate()`` combination: ``file`` does not exist on Python 3 and
    # truncating at the current position of an append-mode handle is fragile.
    # The context manager also closes the file if a write fails.
    with open(path, "w") as f:
        for key, value in idf_dict.items():
            f.write(str(key) + " " + str(value) + "\n")
def compute_idf(json_data, stopwords):
    """Compute the inverse document frequency of every word in the corpus.

    The ``"prevue"`` text of each item is segmented with ``seg`` and
    ``idf(w) = log(n / df(w))`` is computed, where ``n`` is the number of
    documents and ``df(w)`` is the number of documents containing ``w``.

    :param json_data: sequence of mappings, each with a ``"prevue"`` entry
        holding the raw text of one document.
    :param stopwords: stop words, passed through to ``seg``.
    :return: dict mapping each word to its idf value (float); empty when
        ``json_data`` is empty.
    """
    # Document frequency of every word, built in one pass over the corpus.
    doc_freq = {}
    # Number of documents. The previous version used the vocabulary size
    # here, which contradicts the idf definition and shifts every value by
    # the same constant.
    n = 0
    for item in json_data:
        n += 1
        # seg() returns a set, so a word counts at most once per document.
        for w in seg(item["prevue"], stopwords):
            doc_freq[w] = doc_freq.get(w, 0) + 1

    idf_dict = {}
    for w, df in doc_freq.items():
        # float() keeps true division on Python 2 as well.
        idf_dict[w] = math.log(float(n) / df)
    return idf_dict
# Load the input corpus (JSON).
path = 'D:/dev_data/yuetu_tag/news_tag/id_tag_news'
json_data = loadData(path)
# Load the stop words, one per line; the context manager closes the file,
# which the previous bare open() call never did.
with open("../extra_dict/stop_words.txt") as stop_file:
    stopwords = {}.fromkeys([line.rstrip() for line in stop_file])
# Build the word -> idf dictionary.
idf_dict = compute_idf(json_data, stopwords)
# Store the result.
path = "../extra_dict/idf.txt"
save(idf_dict, path)
總結
以上是生活随笔為你收集整理的python idf_python计算idf的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python列透视_python – 在
- 下一篇: 弹性碰撞后速度方向_找准模型,快速求解碰