Random Forest for Chinese Text Classification
Adapted, as a working note, from this blog post:
http://blog.csdn.net/github_36326955/article/details/54891204
Run the four scripts in order: 1.py, 2.py, 3.py, 4.py.
1.py (corpus_segment.py)
import sys
import os
import jieba

reload(sys)
sys.setdefaultencoding('utf-8')

def savefile(savepath, content):
    # write content to a file
    with open(savepath, "wb") as fp:
        fp.write(content)

def readfile(path):
    # read a file's raw content
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    # segment every file under corpus_path with jieba and mirror the
    # directory layout under seg_path
    catelist = os.listdir(corpus_path)  # one subdirectory per category

    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # category directory in the raw corpus
        seg_dir = seg_path + mydir + "/"        # matching directory for segmented output

        if not os.path.exists(seg_dir):
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace("\r\n", "")  # drop line breaks
            content = content.replace(" ", "")     # drop spaces
            content_seg = jieba.cut(content)       # segment with jieba
            savefile(seg_dir + file_path, " ".join(content_seg))

    print "Chinese corpus segmentation finished!"

if __name__ == "__main__":
    # segment the training corpus
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)

    # segment the test corpus
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
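For reference, a minimal sketch of what jieba.cut produces and why the script joins the tokens with spaces (the sample sentence is made up for illustration):

# -*- coding: UTF-8 -*-
import jieba

sample = "随机森林是一种集成学习方法"  # hypothetical sample sentence
seg = jieba.cut(sample)
print " ".join(seg)  # prints the tokens joined by spaces, exactly the format savefile stores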
2.py (corpus2Bunch.py)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os
import cPickle as pickle
from sklearn.datasets.base import Bunch  # container: target_name, label, filenames, contents

def _readfile(path):
    # read a file's raw content
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    # pack the segmented corpus under seg_path into a Bunch and pickle it to wordbag_path
    catelist = os.listdir(seg_path)  # one subdirectory per category
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))

    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Bunch object built!"

if __name__ == "__main__":
    # training set
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)

    # test set
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
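To sanity-check what 2.py wrote, the pickled Bunch can be loaded back and inspected. A minimal sketch (the relative path is illustrative; adjust it to your layout):

import cPickle as pickle
from collections import Counter

with open("train_word_bag/train_set.dat", "rb") as fp:  # produced by 2.py
    bunch = pickle.load(fp)

print len(bunch.contents)   # number of documents packed
print Counter(bunch.label)  # documents per category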
3.py (TFIDF_space.py)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    # build a TF-IDF space for the Bunch at bunch_path and pickle it to space_path;
    # when train_tfidf_path is given (test set), reuse the training vocabulary
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # test set: vectorize with the vocabulary learned from the training set
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # training set: learn the vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print "TF-IDF vector space created!"

if __name__ == '__main__':
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"

    # training set
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    vector_space(stopword_path, train_bunch_path, space_path)

    # test set, reusing the training vocabulary
    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
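The key detail in vector_space is that the test corpus is vectorized with the vocabulary learned from the training corpus, so both TF-IDF matrices share the same columns and the classifier in 4.py sees a consistent feature space. A minimal sketch of that idea on toy, pre-segmented data (both tiny corpora are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["文本 分類 方法", "隨機 森林 分類"]  # toy segmented documents
test_docs = ["森林 文本"]

v_train = TfidfVectorizer()
X_train = v_train.fit_transform(train_docs)  # learns the vocabulary

v_test = TfidfVectorizer(vocabulary=v_train.vocabulary_)
X_test = v_test.fit_transform(test_docs)     # reuses it, as 3.py does

print X_train.shape[1] == X_test.shape[1]    # True: identical feature columns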
4.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.ensemble import RandomForestClassifier

# read a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# load the training set
trainpath = "../train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# load the test set
testpath = "../test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# previous classifier: multinomial Naive Bayes on the TF-IDF vectors and labels
# (the original post notes that a smaller alpha tends to give higher accuracy)
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)

print '************************* Random Forest classifier ***********************'
clf = RandomForestClassifier(oob_score=True, random_state=10)
clf.fit(train_set.tdm, train_set.label)

# predict the test set
print '************************* starting prediction ************************'
predicted = clf.predict(test_set.tdm)

# list the misclassified documents
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual:", flabel, " --> predicted:", expct_cate
print "Prediction finished!"

# classification metrics
from sklearn import metrics

def metrics_result(actual, predict):
    print 'precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall: {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
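Because the classifier is built with oob_score=True, the forest's out-of-bag estimate is available after fitting, giving a rough validation score without touching the test set. A minimal sketch, assuming the fitted clf from 4.py:

# accuracy estimated on the samples each tree did not see in its bootstrap draw
print 'OOB accuracy: {0:.3f}'.format(clf.oob_score_)

In older scikit-learn versions n_estimators defaults to 10; raising it (e.g. n_estimators=200, a hypothetical tweak not in the original post) typically improves both the OOB and the test scores.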
As in the original post, this uses the Fudan University news corpus.
Sample output (excerpt):
../test_corpus_seg/C37-Military/C37-Military008.txt : actual: C37-Military --> predicted: C31-Enviornment
../test_corpus_seg/C37-Military/C37-Military031.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military105.txt : actual: C37-Military --> predicted: C39-Sports
../test_corpus_seg/C37-Military/C37-Military101.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military006.txt : actual: C37-Military --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military125.txt : actual: C37-Military --> predicted: C38-Politics
Prediction finished!
precision: 0.786
recall: 0.790
f1-score: 0.773
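For a per-class breakdown instead of the weighted averages above, scikit-learn also provides classification_report. A minimal sketch, assuming test_set and predicted from 4.py:

from sklearn import metrics

# one row of precision / recall / f1 / support per category
print metrics.classification_report(test_set.label, predicted)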
Summary
The pipeline above covers the full flow: jieba segmentation (1.py), packing the corpus into Bunch objects (2.py), TF-IDF vectorization with a shared vocabulary (3.py), and Random Forest training and evaluation (4.py), reaching a weighted f1-score of about 0.773 on the Fudan news corpus.