Random Forest for Chinese Text Classification
Adapted, as a working note, from this blog post:
http://blog.csdn.net/github_36326955/article/details/54891204
Run the four scripts in order: 1.py, 2.py, 3.py, 4.py.
1.py (corpus_segment.py)
import sys
import os
import jieba

reload(sys)
sys.setdefaultencoding('utf-8')

def savefile(savepath, content):
    # write content to a file
    with open(savepath, "wb") as fp:
        fp.write(content)

def readfile(path):
    # read a file's raw content
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    # segment every file under corpus_path with jieba and mirror the
    # directory layout under seg_path
    catelist = os.listdir(corpus_path)  # one subdirectory per category

    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # category directory in the raw corpus
        seg_dir = seg_path + mydir + "/"        # matching directory for segmented output

        if not os.path.exists(seg_dir):
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace("\r\n", "")  # drop line breaks
            content = content.replace(" ", "")     # drop spaces
            content_seg = jieba.cut(content)       # segment with jieba
            savefile(seg_dir + file_path, " ".join(content_seg))

    print "Chinese corpus segmentation finished!"

if __name__ == "__main__":
    # segment the training corpus
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)

    # segment the test corpus
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
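For reference, a minimal sketch of what jieba.cut produces and why the script joins the tokens with spaces (the sample sentence is made up for illustration):

# -*- coding: UTF-8 -*-
import jieba

sample = "随机森林是一种集成学习方法"  # hypothetical sample sentence
seg = jieba.cut(sample)
print " ".join(seg)  # prints the tokens joined by spaces, exactly the format savefile stores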
2.py (corpus2Bunch.py)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os
import cPickle as pickle
from sklearn.datasets.base import Bunch  # container: target_name, label, filenames, contents

def _readfile(path):
    # read a file's raw content
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    # pack the segmented corpus under seg_path into a Bunch and pickle it to wordbag_path
    catelist = os.listdir(seg_path)  # one subdirectory per category
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))

    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Bunch object built!"

if __name__ == "__main__":
    # training set
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)

    # test set
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
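To sanity-check what 2.py wrote, the pickled Bunch can be loaded back and inspected. A minimal sketch (the relative path is illustrative; adjust it to your layout):

import cPickle as pickle
from collections import Counter

with open("train_word_bag/train_set.dat", "rb") as fp:  # produced by 2.py
    bunch = pickle.load(fp)

print len(bunch.contents)   # number of documents packed
print Counter(bunch.label)  # documents per category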
3.py (TFIDF_space.py)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    # build a TF-IDF space for the Bunch at bunch_path and pickle it to space_path;
    # when train_tfidf_path is given (test set), reuse the training vocabulary
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # test set: vectorize with the vocabulary learned from the training set
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # training set: learn the vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print "TF-IDF vector space created!"

if __name__ == '__main__':
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"

    # training set
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    vector_space(stopword_path, train_bunch_path, space_path)

    # test set, reusing the training vocabulary
    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
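The key detail in vector_space is that the test corpus is vectorized with the vocabulary learned from the training corpus, so both TF-IDF matrices share the same columns and the classifier in 4.py sees a consistent feature space. A minimal sketch of that idea on toy, pre-segmented data (both tiny corpora are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["文本 分類 方法", "隨機 森林 分類"]  # toy segmented documents
test_docs = ["森林 文本"]

v_train = TfidfVectorizer()
X_train = v_train.fit_transform(train_docs)  # learns the vocabulary

v_test = TfidfVectorizer(vocabulary=v_train.vocabulary_)
X_test = v_test.fit_transform(test_docs)     # reuses it, as 3.py does

print X_train.shape[1] == X_test.shape[1]    # True: identical feature columns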
4.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.ensemble import RandomForestClassifier

# read a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# load the training set
trainpath = "../train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# load the test set
testpath = "../test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# previous classifier: multinomial Naive Bayes on the TF-IDF vectors and labels
# (the original post notes that a smaller alpha tends to give higher accuracy)
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)

print '************************* Random Forest classifier ***********************'
clf = RandomForestClassifier(oob_score=True, random_state=10)
clf.fit(train_set.tdm, train_set.label)

# predict the test set
print '************************* starting prediction ************************'
predicted = clf.predict(test_set.tdm)

# list the misclassified documents
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual:", flabel, " --> predicted:", expct_cate
print "Prediction finished!"

# classification metrics
from sklearn import metrics

def metrics_result(actual, predict):
    print 'precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall: {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
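Because the classifier is built with oob_score=True, the forest's out-of-bag estimate is available after fitting, giving a rough validation score without touching the test set. A minimal sketch, assuming the fitted clf from 4.py:

# accuracy estimated on the samples each tree did not see in its bootstrap draw
print 'OOB accuracy: {0:.3f}'.format(clf.oob_score_)

In older scikit-learn versions n_estimators defaults to 10; raising it (e.g. n_estimators=200, a hypothetical tweak not in the original post) typically improves both the OOB and the test scores.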
As in the original post, this uses the Fudan University news corpus.
Sample output (excerpt):
../test_corpus_seg/C37-Military/C37-Military008.txt : actual: C37-Military --> predicted: C31-Enviornment
../test_corpus_seg/C37-Military/C37-Military031.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military105.txt : actual: C37-Military --> predicted: C39-Sports
../test_corpus_seg/C37-Military/C37-Military101.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military006.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military125.txt : actual: C37-Military --> predicted: C38-Politics
Prediction finished!
precision: 0.786
recall: 0.790
f1-score: 0.773
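For a per-class breakdown instead of the weighted averages above, scikit-learn also provides classification_report. A minimal sketch, assuming test_set and predicted from 4.py:

from sklearn import metrics

# one row of precision / recall / f1 / support per category
print metrics.classification_report(test_set.label, predicted)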
Summary
The pipeline above covers the full flow: jieba segmentation (1.py), packing the corpus into Bunch objects (2.py), TF-IDF vectorization with a shared vocabulary (3.py), and Random Forest training and evaluation (4.py), reaching a weighted f1-score of about 0.773 on the Fudan news corpus.