生活随笔
收集整理的這篇文章主要介紹了
LogisticRegression针对中文文本分类
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
改編自博客:
http://blog.csdn.net/github_36326955/article/details/54891204
做個筆記
代碼按照1 2 3 4的順序進行即可:
1.py(corpus_segment.py)
（Python 代碼如下）
# -*- coding: utf-8 -*-
"""Segment a categorized Chinese corpus with jieba.

Walks every category subdirectory of ``corpus_path``, cuts each text
file into tokens with jieba, and writes the space-joined tokens to a
mirrored directory layout under ``seg_path``.

Ported from Python 2: the ``reload(sys)`` / ``sys.setdefaultencoding``
hack was removed (it no longer exists and was always an anti-pattern).
"""
import os

import jieba


def savefile(savepath, content):
    """Write *content* to *savepath*, overwriting any existing file.

    ``str`` content is encoded as UTF-8; ``bytes`` is written verbatim.
    """
    if isinstance(content, str):
        content = content.encode("utf-8")
    with open(savepath, "wb") as fp:
        fp.write(content)


def readfile(path):
    """Return the raw bytes of the file at *path*."""
    with open(path, "rb") as fp:
        content = fp.read()
    return content


def corpus_segment(corpus_path, seg_path):
    """Segment every file under *corpus_path*, mirroring into *seg_path*.

    *corpus_path* must contain one subdirectory per category; the same
    category layout is created under *seg_path*.
    """
    catelist = os.listdir(corpus_path)  # one entry per category directory

    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # source category dir
        seg_dir = seg_path + mydir + "/"        # mirrored output dir

        if not os.path.exists(seg_dir):
            os.makedirs(seg_dir)

        for file_path in os.listdir(class_path):
            fullname = class_path + file_path
            # NOTE(review): the Fudan corpus encoding is assumed UTF-8
            # here — confirm; errors="ignore" matches the original
            # script's permissive behavior under setdefaultencoding.
            content = readfile(fullname).decode("utf-8", errors="ignore")
            # Strip line breaks and spaces before cutting, as the
            # original did.
            content = content.replace("\r\n", "")
            content = content.replace(" ", "")
            content_seg = jieba.cut(content)
            # Join tokens with a single space so downstream
            # vectorizers can split on whitespace.
            savefile(seg_dir + file_path, " ".join(content_seg))

    print("中文語料分詞結束!!!")


if __name__ == "__main__":
    # Segment the training corpus, then the test corpus.
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)

    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
2.py(corpus2Bunch.py)
（Python 代碼如下）
# -*- coding: utf-8 -*-
"""Bundle a segmented corpus into a single pickled Bunch object.

Ported from Python 2: ``cPickle`` became ``pickle`` and the
``reload(sys)`` / ``sys.setdefaultencoding`` hack was removed.
"""
import os
import pickle

# NOTE(review): modern scikit-learn moved this to sklearn.utils.Bunch;
# the old path is kept so existing pickles keep loading — confirm the
# installed sklearn version still provides it.
from sklearn.datasets.base import Bunch


def _readfile(path):
    """Return the raw bytes of the file at *path*."""
    with open(path, "rb") as fp:
        content = fp.read()
    return content


def corpus2Bunch(wordbag_path, seg_path):
    """Collect every segmented file under *seg_path* into one Bunch.

    The Bunch carries the category names (``target_name``), one label
    and filename per document, and the raw file contents; it is pickled
    to *wordbag_path*.
    """
    catelist = os.listdir(seg_path)  # category subdirectories = labels

    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        for file_path in os.listdir(class_path):
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))

    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("構建文本對象結束!!!")


if __name__ == "__main__":
    # Build the training-set bunch, then the test-set bunch.
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)

    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
3.py(TFIDF_space.py)
（Python 代碼如下）
# -*- coding: utf-8 -*-
"""Build TF-IDF vector spaces for the train and test bunches.

The training run learns a vocabulary; the test run reuses the training
vocabulary (passed via *train_tfidf_path*) so both matrices share the
same feature columns.

Ported from Python 2: ``cPickle`` became ``pickle`` and the
``reload(sys)`` / ``sys.setdefaultencoding`` hack was removed.
"""
import pickle

# NOTE(review): modern scikit-learn moved this to sklearn.utils.Bunch;
# old path kept for pickle compatibility — confirm against the
# installed version.
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer


def _readfile(path):
    """Return the raw bytes of the file at *path*."""
    with open(path, "rb") as fp:
        content = fp.read()
    return content


def _readbunchobj(path):
    """Unpickle and return the Bunch stored at *path*."""
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


def _writebunchobj(path, bunchobj):
    """Pickle *bunchobj* to *path*."""
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    """Vectorize the bunch at *bunch_path* and pickle the TF-IDF space.

    stopword_path   -- text file with one stop word per line
    bunch_path      -- pickled input Bunch (from corpus2Bunch)
    space_path      -- output path for the pickled TF-IDF space
    train_tfidf_path-- when given, reuse that space's vocabulary so the
                       test matrix aligns with the training matrix
    """
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # Test set: lock the vocabulary to the training space so
        # feature columns match the trained model.
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # Training set: learn the vocabulary from the corpus itself.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print("tf-idf詞向量空間實例創建成功!!!")


if __name__ == '__main__':
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"

    # Training space: vocabulary is learned here.
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    vector_space(stopword_path, train_bunch_path, space_path)

    # Test space: reuses the training vocabulary.
    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
4.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Train a LogisticRegression classifier on the pickled TF-IDF training
space, predict the test space, list misclassified documents, and print
weighted precision / recall / F1.

@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: NBayes_Predict.py

Ported from Python 2: ``cPickle`` became ``pickle``; the
``reload(sys)`` / ``sys.setdefaultencoding`` hack was removed; the
deprecated private module ``sklearn.linear_model.logistic`` was
replaced by the public ``sklearn.linear_model``; the unused
``MultinomialNB`` import was dropped.
"""
import pickle

from sklearn import metrics
from sklearn.linear_model import LogisticRegression


def _readbunchobj(path):
    """Unpickle and return the Bunch stored at *path*."""
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 for the predictions."""
    print('精度:{0:.3f}'.format(
        metrics.precision_score(actual, predict, average='weighted')))
    print('召回:{0:0.3f}'.format(
        metrics.recall_score(actual, predict, average='weighted')))
    print('f1-score:{0:.3f}'.format(
        metrics.f1_score(actual, predict, average='weighted')))


if __name__ == "__main__":
    # Load the training TF-IDF space.
    trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
    train_set = _readbunchobj(trainpath)

    # Load the test TF-IDF space (built with the training vocabulary).
    testpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/testspace.dat"
    test_set = _readbunchobj(testpath)

    clf = LogisticRegression()
    clf.fit(train_set.tdm, train_set.label)

    # Predict and report every misclassified document.
    predicted = clf.predict(test_set.tdm)
    for flabel, file_name, expct_cate in zip(test_set.label,
                                             test_set.filenames, predicted):
        if flabel != expct_cate:
            print(file_name, ": 實際類別:", flabel, " -->預測類別:", expct_cate)
    print("預測完畢!!!")

    # Overall classification quality.
    metrics_result(test_set.label, predicted)
依然使用復旦大學的新聞數據集
運行結果(這里復制一部分):
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics13.txt : 實際類別: C16-Electronics? -->預測類別: C19-Computer /home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics50.txt : 實際類別: C16-Electronics? -->預測類別: C34-Economy /home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics48.txt : 實際類別: C16-Electronics? -->預測類別: C34-Economy 預測完畢!!! /home/appleyuchi/.virtualenvs/python2.7/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. ? 'precision', 'predicted', average, warn_for) 精度:0.860 召回:0.885 f1-score:0.862 /home/appleyuchi/.virtualenvs/python2.7/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples. ? 'precision', 'predicted', average, warn_for) Process finished with exit code 0
總結
以上是生活随笔 為你收集整理的LogisticRegression针对中文文本分类 的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔 網站內容還不錯,歡迎將生活随笔 推薦給好友。