當前位置：首頁 > 编程语言 > python >内容正文

python

python清洗文本数据_02.数据预处理之清洗文本信息

發布時間：2024/7/5 python 29 豆豆

生活随笔收集整理的這篇文章主要介紹了 python清洗文本数据_02.数据预处理之清洗文本信息小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

準備30萬條新聞數據

編號

新聞類別

新聞數量(條)

財經

37098

教育

41963

科技

65534

時政

63086

體育

65534

娛樂

65534

yield生成器

斐波那契數列介紹和數學描述

斐波那契數列算法實現

斐波那契數列算法生成器實現

算法時間性能對比分析

# coding=utf-8

"""

Description：yield生成器案例

"""

import time

"""

斐波那契數列：1,1,2,3,5,8,13,21,34,55,89,144

從數列的第三項開始，后面每一項是前面兩項之和

數學上的定義：F(0) = 1, F(1) = 1, F(n) = F(n-1) + F(n-2)(n>=2, n∈N)

"""

# 普通的斐波那契數列實現

def fab1(max):
    """Step through the first ``max`` Fibonacci numbers with a plain loop.

    The values are computed but neither stored nor returned; the function
    only serves as a timing baseline for the generator version.

    :param max: number of terms to step through
    :return: None
    """
    n, a, b = 0, 0, 1
    while n < max:
        # Uncomment to display each term as it is produced.
        # print('->', b)
        a, b = b, a + b
        n = n + 1

# 生成器算法實現斐波那契數列

def fab2(max):
    """Generate the first ``max`` Fibonacci numbers lazily.

    Each term is handed to the caller with ``yield``, so a value is only
    computed when the caller asks for the next one.

    :param max: number of terms to generate
    :return: generator yielding 1, 1, 2, 3, 5, ...
    """
    n, a, b = 0, 0, 1
    while n < max:
        yield b  # hand the current term to the caller
        a, b = b, a + b
        n = n + 1

def test():
    """Time ``fab1`` and the creation of the ``fab2`` generator.

    Both elapsed wall-clock times are printed in milliseconds.
    """
    # Maximum number of iterations.
    max_num = 1000000

    start_time = time.time()
    fab1(max_num)
    end_time = time.time()
    print('fab1 total time %.2f' % (1000 * (end_time - start_time)), 'ms')

    start_time = time.time()
    # b is a generator object; its values are produced only while it is being
    # iterated.  NOTE: it is never iterated here, so the interval measured
    # below covers creating the generator, not computing the series.
    b = fab2(max_num)
    end_time = time.time()
    print('fab2 total time %.2f' % (1000 * (end_time - start_time)), 'ms')


if __name__ == '__main__':
    test()

輸出：

fab1 total time 7085.11 ms

fab2 total time 0.00 ms

遞歸讀取30萬新聞

實現文件遍歷遞歸

遍歷讀取30萬新聞

每萬條讀取打印一次

完成30萬新聞遍歷讀取

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

@Description : 遞歸批量讀取30W文本

@Time : 2020/1/2 23:41

@Author : sky

@Site :

@File : FileRead.py

@Software: PyCharm

"""

import os

import time

def traversal_dir(root_dir):
    """Recursively walk ``root_dir`` and print a progress line for files.

    Entries are visited in ``os.listdir`` order.  A regular file whose
    position in that listing is a multiple of 10000 triggers a progress
    line; sub-directories are walked recursively.  Nothing is returned.

    :param root_dir: directory to walk
    :return: None
    """
    for index, file_name in enumerate(os.listdir(root_dir)):
        child_file_path = os.path.join(root_dir, file_name)
        if os.path.isfile(child_file_path):
            # Placeholder for real per-file work: report every 10000th entry.
            if index % 10000 == 0:
                print('{c} *** {t} *** {i} \t docs has been read'
                      .format(c=root_dir, i=index, t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
        elif os.path.isdir(child_file_path):
            # Descend into the sub-directory.
            traversal_dir(child_file_path)

if __name__ == '__main__':
    # Walk the news corpus and report the total wall-clock time in seconds.
    root_dir = '../dataset/CSCMNews'
    start_time = time.time()
    traversal_dir(root_dir)
    end_time = time.time()
    print('Total Cost Time %.2f' % (end_time - start_time) + 's')

高效讀取30萬新聞

構建生成器

構建迭代器

高效讀取30萬新聞

讀取30萬新聞性能對比

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 高效讀取30萬新聞

# @Time : 2020/1/4 14:47

# @Author : sky

# @Site :

# @File : EfficRead.py

# @Software: PyCharm

"""

import os

import time

# 迭代器

class LoadFolders(object):
    """Iterable over the immediate sub-directories of a parent directory."""

    def __init__(self, parent_path):
        """Remember the directory to scan.

        :param parent_path: directory whose sub-directories will be yielded
        """
        self.parent_path = parent_path

    def __iter__(self):
        """Yield ``parent_path`` joined with each entry that is a directory."""
        for name in os.listdir(self.parent_path):
            folder_path = os.path.join(self.parent_path, name)
            if os.path.isdir(folder_path):
                yield folder_path

class LoadFiles(object):
    """Iterable over ``(category, file_name)`` pairs of a two-level corpus.

    The first directory level supplies the category (the sub-directory
    name); every entry found inside it is reported by name only.
    """

    def __init__(self, parent_path):
        """Remember the corpus root.

        :param parent_path: directory that contains one folder per category
        """
        self.parent_path = parent_path

    def __iter__(self):
        """Yield ``(category, file_name)`` for every entry of every folder."""
        # First level: one folder per category.
        for folder in LoadFolders(self.parent_path):
            category = os.path.basename(folder)
            # Second level: the entries inside the category folder.
            for name in os.listdir(folder):
                yield category, name

if __name__ == '__main__':
    # Iterate over the whole corpus, printing a progress line every 10000
    # entries and the total wall-clock time in seconds at the end.
    file_path = os.path.abspath('../dataset/CSCMNews')
    start_time = time.time()
    files = LoadFiles(file_path)
    for index, msg in enumerate(files):
        if index % 10000 == 0:
            print('{t} *** {i} \t docs has bean read'.format(t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), i=index))
    end_time = time.time()
    print('Total Cost Time %.2f' % (end_time - start_time) + 's')

小結

數組、鏈表、字符串、文件等缺點就是所有的數據都在內存里，海量的數據耗內存。

生成器是可以迭代的，工作原理就是重復調用 next() 方法，直到沒有下一個元素為止。

有 yield 的函數不再是一個普通的函數，而是一個生成器（generator），可用于迭代。

yield 是一個類似 return 的關鍵字。

正則清洗數據

分析新聞語料文本

讀取新聞文本信息

正則過濾掉特殊符號、標點、英文、數字等

正則去除空格、換行符、多空格合并等

實現正則清洗文本數據

正則表達式學習

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 正則表達式學習

# @Time : 2020/1/4 15:13

# @Author : sky

# @Site :

# @File : reqular.py

# @Software: PyCharm

"""

import re

# . 任意字符

# * 任意次數

# ^ 表示開頭

# $ 表示結尾

# ? 非貪婪模式，提取第一個字符

# + 至少出現一次

# {1}出現一次

# {3,}出現3次以上

# {2,5}最少2次，最多5次

# \d 匹配數字

# [\u4E00-\u9FA5] 匹配漢字

# | 或

# [] 滿足任意一個， [2345] 2345中任意一個 [0-9]區間 [^1]非1

# \s 空格 \S 非空格

# \w 匹配[A-Za-z0-9_] \W=非\w

text = 'this is Python 數據預處理，這次學習的很好，使用的環境是Anaconda4.4，現在的時間是2020年1月4日'

# Start anchor + any character + any number of repetitions.
reg_1 = '(^t.*)'
# Greedy ".*" consumes as much as it can, so the group is the last "s".
reg_2 = '.*(s+)'
# Non-greedy ".*?" consumes as little as it can, so the group is the first "s".
reg_3 = '.*?(s+)'
# Chinese characters followed by "的"; "?" is non-greedy as above.
reg_4 = '.*?([\u4E00-\u9FA5]+的)'
# Year and month.  Raw string so that "\d" reaches the regex engine intact.
reg_5 = r'.*(\d{4}年)(\d{1,2}月)'

res = re.match(reg_5, text)
if res:
    print(res)
    # group(n) is the text captured by the n-th pair of parentheses.
    print(res.group(2))
else:
    print('沒匹配到')

# Extracting a date.
print('-'*20)

date_text = '現在的日期是2020年1月4日'
# Other sample inputs to try:
# date_text = '現在的日期是2020年01月04日'
# date_text = '現在的日期是2020-1-4'
# date_text = '現在的日期是2020-01-04'
# date_text = '現在的日期是2020/1/4'
# date_text = '現在的日期是2020/01/04'
# The pattern needs a separator after the month, so this one does not match:
# date_text = '現在的日期是2020-01'

# Raw string so that "\d" reaches the regex engine intact.
reg_date = r'.*(\d{4}[年/-]\d{1,2}[月/-]\d{,2}[日]?)'

res = re.match(reg_date, date_text)
if res:
    print(res)
    # group(n) is the text captured by the n-th pair of parentheses.
    print(res.group(1))
else:
    print('沒匹配到')

# Extracting a mobile phone number.
print('-'*20)

# Other sample inputs to try:
# phone_text = '我的手機號是13030010152，有什么問題可以聯系我'
# phone_text = '我的手機號是17091033442，有什么問題可以聯系我'
# phone_text = '我的手機號是18519299012，有什么問題可以聯系我'
phone_text = '我的手機號是13691769664 ，有什么問題可以聯系我'

# "1", then one of 3/5/7/8/9, then nine more digits (eleven in total).
# Raw string so that "\d" reaches the regex engine intact.
reg_phone = r'.*?(1[37859]\d{9})'

res = re.match(reg_phone, phone_text)
if res:
    print(res)
    # group(n) is the text captured by the n-th pair of parentheses.
    print(res.group(1))
else:
    print('沒匹配到')

清洗文本數據

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 正則清洗文本數據

# @Time : 2020/1/4 15:52

# @Author : sky

# @Site :

# @File : REdealText.py

# @Software: PyCharm

"""

import re

# 正則對字符串的清洗

# Runs of ASCII letters, digits and ASCII / full-width punctuation and symbols.
_NOISE_RE = re.compile('[a-zA-Z0-9’!"#$%&\'()*+,-./:：;；|<=>?@，—。?★、…【】《》？“”‘’！[\\]^_`{|}~]+')
# Runs of any whitespace: spaces, tabs and line breaks.
_WHITESPACE_RE = re.compile(r'\s+')


def text_parse(text):
    """Strip letters, digits and punctuation from ``text``.

    Every run of unwanted characters becomes one space, then every run of
    whitespace (line breaks included) is collapsed to one space.  Leading and
    trailing spaces are not trimmed.

    :param text: raw document text
    :return: cleaned text
    """
    text = _NOISE_RE.sub(' ', text)
    # Collapsing whitespace also removes every line break, so no separate
    # newline removal is needed afterwards.
    return _WHITESPACE_RE.sub(' ', text)

def read_file(path):
    """Return the whole content of a UTF-8 text file.

    :param path: path of the file to read
    :return: file content as a string
    """
    with open(path, encoding='utf-8') as f:
        return f.read()

if __name__ == '__main__':
    # Read one sample document and show it before and after cleaning.
    str_doc_path = '../dataset/CSCMNews/體育/0.txt'
    str_doc = read_file(str_doc_path)
    print(str_doc)

    clear_text = text_parse(str_doc)
    print(clear_text)

清洗HTML數據

分析html文本信息

導入正則：re.I、re.L、re.M、re.S……

清洗HTML標簽：DOCTYPE、CDATA、Script、style……

HTML標簽、注釋、換行等處理：re.compile

實現正則清洗HTML數據

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 正則清洗HTML

# @Time : 2020/1/4 18:46

# @Author : sky

# @Site :

# @File : DealHtml.py

# @Software: PyCharm

"""

import re

# 清洗HTML標簽文本

# NOTE(review): the patterns as published had lost their HTML-like parts, so
# they are reconstructed here from the step comments -- confirm against the
# author's original file if it is available.
_RE_DOCTYPE = re.compile(r'<!DOCTYPE .*?>', re.I | re.S)
_RE_CDATA = re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S)
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
_RE_BR = re.compile(r'<br\s*?/?>')
_RE_TAG = re.compile(r'</?\w+[^>]*>')
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
# ASCII-only URL characters, non-greedy, literal ".html": one match covers
# exactly one link and never the text that follows it.
_RE_HTML_LINK = re.compile(r'https?://[\w\-./?%&=:#~+]+?\.html', re.ASCII)


def filter_tags(html_str):
    """Reduce an HTML document to its bare text.

    Steps, in order: collapse all whitespace to single spaces; remove the
    DOCTYPE, ``//<![CDATA[ ... //]]>`` sections, ``<script>`` and ``<style>``
    elements with their content, ``<br>`` tags, all remaining tags and
    comments; remove every space; remove links that end in ``.html``.

    :param html_str: HTML source text
    :return: text with markup, whitespace and ``.html`` links removed
    """
    # After this line the only whitespace left is the single space, which is
    # why one space removal further down is enough.
    res = ' '.join(html_str.split())

    for pattern in (_RE_DOCTYPE, _RE_CDATA, _RE_SCRIPT, _RE_STYLE,
                    _RE_BR, _RE_TAG, _RE_COMMENT):
        res = pattern.sub('', res)

    res = res.replace(' ', '')

    # Links are removed last, once the text around them is contiguous.
    return _RE_HTML_LINK.sub('', res)

def read_file(path):
    """Return the whole content of a UTF-8 text file.

    :param path: path of the file to read
    :return: file content as a string
    """
    with open(path, encoding='utf-8') as f:
        return f.read()

if __name__ == '__main__':
    # Clean the sample HTML file and print the remaining text.
    str_doc = read_file('./htmldemo.txt')
    res = filter_tags(str_doc)
    print(res)

簡繁體轉換

簡繁體python包下載與使用

簡繁對照表分析與自定義添加

python實現繁體文本轉為簡體文本

python實現簡體文本轉為繁體文本

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 簡繁體轉換

# @Time : 2020/1/4 19:08

# @Author : sky

# @Site :

# @File : zhline.py

# @Software: PyCharm

"""

from ClearText.zhtools.langconv import *

# 1. Convert to Traditional Chinese.
# Adjacent literals inside the parentheses are joined into one string.
str1 = ('上港5-4恒大5分領跑劍指冠軍，下輪打平便可奪冠，'
        '武磊平紀錄—廣州恒大淘寶|上海上港|蔡慧康|武磊|胡爾克|張成林|阿蘭|保利尼奧|王燊超|呂文君|懂球帝'
        '北京時間11月3日19:35，中超第28輪迎來天王山之戰，廣州恒大淘寶坐鎮主場迎戰上海上港。'
        '上半場呂文君和蔡慧康先后進球兩度為上港取得領先，保利尼奧和阿蘭兩度為恒大將比分扳平，'
        '補時階段保利尼奧進球反超比分；下半場武磊進球追平李金羽單賽季進球紀錄，王燊超造成張成林烏龍，'
        '胡爾克點射破門，阿蘭補時打進點球。最終，上海上港客場5-4戰勝廣州恒大淘寶，'
        '賽季雙殺恒大同時也將積分榜上的領先優勢擴大到五分，上港下輪只要戰平就將奪得冠軍。非常抱歉！')

line1 = Converter('zh-hant').convert(str1)
print('繁體---\n', line1)

# 2. Convert to Simplified Chinese.
# Adjacent literals inside the parentheses are joined into one string.
str2 = ('上港5-4恆大5分領跑劍指冠軍，下輪打平便可奪冠，'
        '武磊平紀錄—廣州恆大淘寶|上海上港|蔡慧康|武磊|胡爾克|張成林|阿蘭|保利尼奧|王燊超|呂文君|懂球帝'
        '北京時間11月3日19:35，中超第28輪迎來天王山之戰，廣州恆大淘寶坐鎮主場迎戰上海上港。'
        '上半場呂文君和蔡慧康先後進球兩度為上港取得領先，保利尼奧和阿蘭兩度為恆大將比分扳平，'
        '補時階段保利尼奧進球反超比分；下半場武磊進球追平李金羽單賽季進球紀錄，王燊超造成張成林烏龍，'
        '胡爾克點射破門，阿蘭補時打進點球。最終，上海上港客場5-4戰勝廣州恆大淘寶，'
        '賽季雙殺恆大同時也將積分榜上的領先優勢擴大到五分，上港下輪只要戰平就將奪得冠軍。非常抱歉！')

line2 = Converter('zh-hans').convert(str2)
print('簡體---\n', line2)

30萬新聞數據清洗

高效讀取30W新聞文本

實現新聞文本抽樣

抽樣新聞文本數據清洗

每萬條打印一次信息

實現30W新聞文本數據清洗

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""

# @Description 30W新聞文本數據清洗

# @Time : 2020/1/4 19:35

# @Author : sky

# @Site :

# @File : 30wClear.py

# @Software: PyCharm

"""

import os

import re

import time

from ClearText.REdealText import text_parse

# 高效讀取文件

# 迭代器

class LoadFolders(object):
    """Iterable over the immediate sub-directories of a parent directory."""

    def __init__(self, parent_path):
        """Remember the directory to scan.

        :param parent_path: directory whose sub-directories will be yielded
        """
        self.parent_path = parent_path

    def __iter__(self):
        """Yield ``parent_path`` joined with each entry that is a directory."""
        for name in os.listdir(self.parent_path):
            folder_path = os.path.join(self.parent_path, name)
            if os.path.isdir(folder_path):
                yield folder_path

class LoadFiles(object):
    """Iterable over ``(category, content)`` pairs of a two-level corpus.

    The first directory level supplies the category (the sub-directory
    name); each regular file inside it is read and decoded as UTF-8.
    """

    def __init__(self, parent_path):
        """Remember the corpus root.

        :param parent_path: directory that contains one folder per category
        """
        self.parent_path = parent_path

    def __iter__(self):
        """Yield ``(category, content)`` for every file of every folder.

        :raises UnicodeDecodeError: if a file is not valid UTF-8
        """
        # First level: one folder per category.
        for folder in LoadFolders(self.parent_path):
            category = os.path.basename(folder)
            # Second level: the documents of that category.
            for name in os.listdir(folder):
                file_path = os.path.join(folder, name)
                if not os.path.isfile(file_path):
                    continue
                # Read the raw bytes and decode them in one step.  The file is
                # closed before yielding, so no handle stays open if the
                # consumer stops iterating early.
                with open(file_path, 'rb') as this_file:
                    content = this_file.read().decode('utf-8')
                yield category, content

if __name__ == '__main__':
    start_time = time.time()

    file_path = '../dataset/CSCMNews'
    files = LoadFiles(file_path)

    # Sampling rate: only every n-th document is cleaned.
    n = 2
    for index, msg in enumerate(files):
        if index % n == 0:
            category = msg[0]
            content = msg[1]
            content = text_parse(content)
            # Report once per 10000 sampled documents.
            if (index // n) % 10000 == 0:
                print(
                    '{t} *** {i} \t docs has bean dealed'.format(t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                                                                 i=index), '\n', category, '\t', content[:20])

    end_time = time.time()
    print('Total Cost Time %.2f' % (end_time - start_time) + 's')

擴展

缺失值處理方法(忽略/人工填值/均值/中位數等)

噪聲數據處理方式(向均值光滑、離群點分析等)

正則學習

內容均來源于學習資料，在學習過程中進行記錄，如有侵權聯系作者進行刪除

Change the world by program

總結

以上是生活随笔為你收集整理的python清洗文本数据_02.数据预处理之清洗文本信息的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：数据结构--链表--判断一个字符串是否为
下一篇： html校园首页设计说明范文,网页设计作