SVM for Chinese Text Classification
Adapted from this blog post:
http://blog.csdn.net/github_36326955/article/details/54891204
These are my working notes on it.
Run the four scripts below in order: 1, 2, 3, 4.
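For reference, this is the directory layout the scripts assume, reconstructed from the paths that appear in the code (the raw Fudan corpus keeps one subdirectory per category):

chinese_text_classification-master/
├── train/               # raw training corpus, one subdirectory per category
├── answer/              # raw test corpus, one subdirectory per category
├── train_corpus_seg/    # segmented training corpus, written by 1.py
├── test_corpus_seg/     # segmented test corpus, written by 1.py
├── train_word_bag/      # train_set.dat, tfidfspace.dat, hlt_stop_words.txt
└── test_word_bag/       # test_set.dat, testspace.dat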
1.py (corpus_segment.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import os
import jieba

reload(sys)
sys.setdefaultencoding('utf-8')

def savefile(savepath, content):
    # Write content to savepath.
    with open(savepath, "wb") as fp:
        fp.write(content)

def readfile(path):
    # Read and return the raw bytes of the file at path.
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    # Segment every file under corpus_path with jieba and mirror the
    # category directory tree under seg_path.
    catelist = os.listdir(corpus_path)  # each subdirectory is one category
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # source directory of this category
        seg_dir = seg_path + mydir + "/"        # output directory of this category
        if not os.path.exists(seg_dir):
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace("\r\n", "")  # strip line breaks
            content = content.replace(" ", "")     # strip spaces
            content_seg = jieba.cut(content)       # segment into words
            savefile(seg_dir + file_path, " ".join(content_seg))
    print "Finished segmenting the Chinese corpus!!!"

if __name__ == "__main__":
    # Segment the training corpus.
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    # Segment the test corpus.
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
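To see what 1.py writes out, here is a minimal standalone sketch of the jieba call it relies on (the sample sentence is made up for illustration):

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import jieba

sentence = u"支持向量机是一种常用的文本分类方法"
tokens = jieba.cut(sentence)  # returns a generator of segmented words
print " ".join(tokens)        # the words joined by single spaces, as saved by 1.py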
2.py (corpus2Bunch.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import os
import cPickle as pickle
from sklearn.datasets.base import Bunch

reload(sys)
sys.setdefaultencoding('utf-8')

def _readfile(path):
    # Read and return the raw bytes of the file at path.
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    # Pack the segmented corpus under seg_path into a Bunch and pickle it.
    catelist = os.listdir(seg_path)  # each subdirectory is one category
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text Bunch object!!!"

if __name__ == "__main__":
    # Pack the training set.
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
    # Pack the test set.
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
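As a quick sanity check after running 2.py, you can unpickle the Bunch and inspect its fields; a minimal sketch, assuming the train_set.dat path used above:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import cPickle as pickle

trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
with open(trainpath, "rb") as fp:
    bunch = pickle.load(fp)

print bunch.target_name                   # list of category names
print len(bunch.contents)                 # number of documents in the Bunch
print bunch.label[0], bunch.filenames[0]  # category and path of the first document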
3.py (TFIDF_space.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: TFIDF_space.py
@time: 2017/2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    # Read and return the raw bytes of the file at path.
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    # Unpickle and return a Bunch object.
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    # Pickle a Bunch object to path.
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    if train_tfidf_path is not None:
        # Test set: reuse the training vocabulary so the feature columns match.
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # Training set: learn the vocabulary from the corpus itself.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
    print "The tf-idf vector space was created successfully!!!"

if __name__ == '__main__':
    # Build the training-set vector space.
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/train_set.dat"  # input
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # output
    vector_space(stopword_path, train_bunch_path, space_path)

    # Build the test-set vector space, reusing the training vocabulary.
    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # input, produced above
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/test_set.dat"  # input
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/testspace.dat"  # output
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
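The key detail in vector_space is that the test corpus is vectorized with the vocabulary learned from the training corpus, so both tdm matrices share the same feature columns. A toy sketch of the same idea (the segmented documents are made up for illustration):

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from sklearn.feature_extraction.text import TfidfVectorizer

# Made-up, already-segmented documents (words separated by spaces).
train_docs = [u"经济 增长 放缓", u"计算机 网络 技术", u"经济 政策 调整"]
test_docs = [u"计算机 技术 发展"]

train_vec = TfidfVectorizer()
train_tdm = train_vec.fit_transform(train_docs)

# Reuse the training vocabulary so the test matrix has identical columns.
test_vec = TfidfVectorizer(vocabulary=train_vec.vocabulary_)
test_tdm = test_vec.fit_transform(test_docs)

print train_tdm.shape[1] == test_tdm.shape[1]  # True: same feature dimension

Note that because the test call is also fit_transform, the idf weights are recomputed from the test corpus; the fixed vocabulary only guarantees that the columns line up.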
4.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: NBayes_Predict.py
@time: 2017/2/8 12:21
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.svm import SVC
from sklearn import metrics

def _readbunchobj(path):
    # Unpickle and return a Bunch object.
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set.
trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# Load the test set.
testpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

######################################################
# SVM classifier
print '*************************\nSVM\n*************************'
clf = SVC(kernel='linear')  # the default kernel is 'rbf'
clf.fit(train_set.tdm, train_set.label)

# Predict the test-set categories and report the misclassified files.
predicted = clf.predict(test_set.tdm)
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual category:", flabel, " --> predicted category:", expct_cate
print "Prediction finished!!!"

# Compute the classification metrics.
def metrics_result(actual, predict):
    print 'Precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'Recall: {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
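Beyond the three weighted averages, per-class numbers are often more informative for a multi-category corpus like this one. A minimal follow-up sketch, assuming test_set and predicted as above (metrics.classification_report is standard sklearn, nothing here is specific to this project):

from sklearn import metrics

# Per-class precision, recall, and F1, one row per category.
print metrics.classification_report(test_set.label, predicted)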
This still uses the Fudan University news dataset.
Running results (a portion copied here):
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics13.txt : actual category: C16-Electronics  --> predicted category: C19-Computer
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics48.txt : actual category: C16-Electronics  --> predicted category: C34-Economy
Prediction finished!!!
Precision: 0.928
Recall: 0.927
F1-score: 0.921
Process finished with exit code 0
Training the SVM takes a fairly long time, so please be patient.
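If the wait is too long, a common speed-up is to swap SVC(kernel='linear') for LinearSVC, which uses liblinear and typically trains much faster on sparse tf-idf matrices; this is an alternative to the script above, not part of it, and the scores may differ slightly because the two solvers optimize slightly different objectives:

from sklearn.svm import LinearSVC

clf = LinearSVC()  # liblinear-based linear SVM, a drop-in replacement in 4.py
clf.fit(train_set.tdm, train_set.label)
predicted = clf.predict(test_set.tdm)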