ik 和hanlp_结巴分词和自然语言处理HanLP处理手记
手记实用系列文章:
代码封装类:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import jieba
import os
import re
import time
from jpype import *
'''
title:利用結(jié)巴分詞進(jìn)行文本語(yǔ)料的批量處理
1 首先對(duì)文本進(jìn)行遍歷查找
2 創(chuàng)建原始文本的保存結(jié)構(gòu)
3 對(duì)原文本進(jìn)行結(jié)巴分詞和停用詞處理
4 對(duì)預(yù)處理結(jié)果進(jìn)行標(biāo)準(zhǔn)化格式,并保存原文件結(jié)構(gòu)路徑
author:白寧超
myblog:http://www.cnblogs.com/baiboy/
time:2017年4月28日10:03:09
'''
'''
創(chuàng)建文件目錄
path:根目錄下創(chuàng)建子目錄
'''
def mkdir(path):
    """Create directory *path* (with any missing parents) if it does not exist.

    Parameters
    ----------
    path : str
        Directory path to create.

    Returns
    -------
    bool or None
        True when the directory was newly created; None when it already
        existed (the original implicit return, preserved for callers).
    """
    if not os.path.exists(path):
        os.makedirs(path)
        print(path+' 創(chuàng)建成功')
        return True
    # Directory already present: emit the original progress message.
    print('-->請(qǐng)稍后,文本正在預(yù)處理中...')
'''
結(jié)巴分詞工具進(jìn)行中文分詞處理:
read_folder_path:待處理的原始語(yǔ)料根路徑
write_folder_path 中文分詞經(jīng)數(shù)據(jù)清洗后的語(yǔ)料
'''
def CHSegment(read_folder_path, write_folder_path):
    """Segment every corpus file under *read_folder_path* with jieba and
    write the cleaned result to a mirrored tree under *write_folder_path*.

    Layout: read_folder_path/<category>/<file> -> write_folder_path/<category>/<file>.
    Tokens are joined with '/'; original newlines are kept so the output
    preserves the source line structure.

    Parameters
    ----------
    read_folder_path : str
        Root directory of the raw corpus, one subdirectory per category.
    write_folder_path : str
        Root directory for the segmented output (created as needed).
    """
    # Stop-word table: a dict used as a set for O(1) membership tests.
    # 'with' closes the handle promptly (the original leaked it).
    with open('../Database/stopwords/CH_stopWords.txt', 'r', encoding='utf-8') as sw:
        stopwords = {}.fromkeys(line.strip() for line in sw)
    # Outer loop: one iteration per corpus category.
    for folder in os.listdir(read_folder_path):
        new_folder_path = os.path.join(read_folder_path, folder)
        # Mirror the category directory on the output side.
        mkdir(write_folder_path + folder)
        save_folder_path = os.path.join(write_folder_path, folder)
        # Inner loop: every file of this category.
        for file in os.listdir(new_folder_path):
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()
            # Precise (non-full) jieba segmentation.
            wordslist = jieba.cut(raw, cut_all=False)
            # BUGFIX: the original only consulted the stop-word table in the
            # newline branch, so multi-character stopwords slipped through
            # the elif. Filter stopwords on the word branch as intended.
            pieces = []
            for word in wordslist:
                if word == "\n":
                    pieces.append("\n")  # keep the source's line breaks
                elif word not in stopwords and len(word) > 1:
                    pieces.append(word + "/")  # drop single chars / stopwords
            # ''.join avoids the original quadratic += string build.
            with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as out:
                out.write(''.join(pieces))
'''
HanLP(經(jīng)由JPype調(diào)用)進(jìn)行中文分詞處理:
read_folder_path:待處理的原始語(yǔ)料根路徑
write_folder_path 中文分詞經(jīng)數(shù)據(jù)清洗后的語(yǔ)料
'''
def HanLPSeg(read_folder_path, write_folder_path):
    """Segment every corpus file under *read_folder_path* with HanLP (via
    JPype) and write the cleaned result to a mirrored tree under
    *write_folder_path*.

    HanLP terms come back as 'word/POS'; the POS tag and the leading
    separator are stripped, and the full-width-indented paragraph marker
    ('\\n\\u3000\\u3000') is turned back into a newline.

    Parameters
    ----------
    read_folder_path : str
        Root directory of the raw corpus, one subdirectory per category.
    write_folder_path : str
        Root directory for the segmented output (created as needed).
    """
    # Start the JVM that hosts HanLP. On Linux replace ';' with ':' in the
    # classpath (see HanLP docs). Path is Windows-specific by design here.
    startJVM(getDefaultJVMPath(), "-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp", "-Xms1g", "-Xmx1g")
    try:
        # NOTE: the original also loaded a stop-word table here but never
        # used it in this function; dropped (it only leaked a file handle).
        # Resolve the HanLP entry class once, not once per file.
        HanLP = JClass('com.hankcs.hanlp.HanLP')
        for folder in os.listdir(read_folder_path):
            new_folder_path = os.path.join(read_folder_path, folder)
            # Mirror the category directory on the output side.
            mkdir(write_folder_path + folder)
            save_folder_path = os.path.join(write_folder_path, folder)
            for file in os.listdir(new_folder_path):
                with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                    raw = f.read()
                wordslist = HanLP.segment(raw)
                # The Java list prints as '[w1/t1, w2/t2, ...]'; split on
                # commas and drop the bracketed first/last entries.
                terms = str(wordslist).split(",")
                flagresult = ""
                for v in terms[1:len(terms) - 1]:
                    if "/" in v:
                        slope = v.index("/")
                        # v starts with the ' ' left by the comma split.
                        letter = v[1:slope]
                        if len(letter) > 0 and '\n\u3000\u3000' in letter:
                            flagresult += "\n"  # paragraph marker -> newline
                        else:
                            flagresult += letter + "/"
                with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as out:
                    out.write(flagresult.replace(' /', ''))
    finally:
        # Always release the JVM, even if segmentation raised.
        shutdownJVM()
if __name__ == '__main__':
    print('開(kāi)始進(jìn)行文本分詞操作:\n')
    t1 = time.time()
    # Root of the raw corpus categories and root of the segmented output.
    # (The original also defined dealpath/savepath, but never used them.)
    read_folder_path = '../Database/SogouC/FileNews/'
    write_folder_path = '../Database/SogouCCut/'
    # jieba segmentation: ~3.31 s for 300 txt files.
    CHSegment(read_folder_path, write_folder_path)
    # HanLP segmentation: ~1.83 s for 300 txt files.
    HanLPSeg(read_folder_path, write_folder_path)
    t2 = time.time()
    print('完成中文文本切分: '+str(t2-t1)+"秒。")
運(yùn)行效果:
總結(jié)
以上是生活随笔为你收集整理的ik 和hanlp_结巴分词和自然语言处理HanLP处理手记的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 三层架构秘籍
- 下一篇: 基于SpringBoot打造的OA、CM