當前位置：首頁 > 编程语言 > python >内容正文

python

Python3.6 word批量转换为txt提取

發布時間：2023/12/10 python 27 豆豆

生活随笔收集整理的這篇文章主要介紹了 Python3.6 word批量转换为txt提取，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

1.流程：批量讀取文件夾下文件，批量轉換word為txt文件，讀取txt文件內容

2.word文件放入：D:\jianli，文件夾下放入一個word文件

代碼如下：

注意導入庫

import os
import re
import sys
import psutil
import win32com.client as wc
import configparser
import time

# -*- coding:utf-8 -*-
"""Batch-convert Word ``.doc`` files to plain text and read the text back.

Workflow: walk a folder for ``.doc`` files, ask Word (through COM automation)
to save each one as a ``.txt`` file, then read and lightly normalise the text.

The conversion step needs Windows with Word (or a COM-compatible replacement)
installed. ``psutil`` and ``win32com`` are imported inside the functions that
use them so the pure-text helpers stay importable on any platform.
"""
import os
import re
import sys
import configparser
import time

# Matches every character outside the CJK range U+4E00..U+9FA5.
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')

# Word's WdSaveFormat value for plain text (wdFormatText).
_WD_FORMAT_TEXT = 2

# Word's WdSaveOptions value for "close without saving" (wdDoNotSaveChanges).
_WD_DO_NOT_SAVE_CHANGES = 0


def printPids():
    """Force-kill any running ``wps.exe`` process (best effort).

    Scans the process table and, on the first process named ``wps.exe``,
    runs ``taskkill /F /IM wps.exe``. That command terminates every process
    with that image name, so one invocation is enough.
    """
    import psutil  # third-party; imported lazily, only this function needs it

    for pid in psutil.pids():
        try:
            if psutil.Process(pid).name() != 'wps.exe':
                continue
        except psutil.Error as e:
            # The process vanished or is not accessible; report and move on.
            print(e)
            continue
        print("關閉wps.exe")
        os.system('taskkill /F /IM wps.exe')
        break


def getWordPath(path1, path2):
    """Open the Word document *path1* and save it as plain text at *path2*.

    Args:
        path1: Path of the source document. It is made absolute before being
            handed to Word, because Word does not resolve relative paths
            against the Python working directory.
        path2: Path of the text file to write.

    The document is always closed and the Word instance always quit, even
    when opening or saving fails.
    """
    import win32com.client as wc  # Windows-only; imported lazily

    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(os.path.abspath(path1))
        try:
            # Positional arguments follow Document.SaveAs:
            # FileName, FileFormat, LockComments, Password, AddToRecentFiles,
            # WritePassword, ReadOnlyRecommended, EmbedTrueTypeFonts,
            # SaveNativePictureFormat, SaveFormsData.
            doc.SaveAs(os.path.abspath(path2), _WD_FORMAT_TEXT, False, "",
                       True, "", False, False, False, False)
        finally:
            # SaveAs leaves the saved file open in Word; close it explicitly
            # without prompting.
            doc.Close(_WD_DO_NOT_SAVE_CHANGES)
    finally:
        word.Quit()


def walkFile(file):
    """Return every ``.doc`` file found under the directory *file*.

    Args:
        file: Root directory to walk recursively.

    Returns:
        A list of single-element lists ``[[path], ...]``, one per file whose
        extension is ``.doc`` (case-insensitive). ``.docx`` files and Word
        lock files (names starting with ``~$``) are skipped.
    """
    file_list = []
    for root, dirs, files in os.walk(file):
        for f in files:
            if f.startswith("~$"):
                # Owner/lock file Word creates next to an open document.
                continue
            # Compare the real extension instead of searching the whole path,
            # so directory names cannot influence the match.
            if os.path.splitext(f)[1].lower() != ".doc":
                continue
            pathtem = os.path.join(root, f)
            print(pathtem)
            file_list.append([pathtem])
    return file_list


def removeFilesByPath(file):
    """Delete every file under the directory *file*, recursively.

    Directories themselves are left in place.
    """
    for root, dirs, files in os.walk(file):
        for f in files:
            os.remove(os.path.join(root, f))


def _txt_target_path(path1, path2, source):
    """Map *source* (located under *path1*) to its ``.txt`` path under *path2*."""
    relative = os.path.relpath(source, path1)
    stem = os.path.splitext(relative)[0]
    return os.path.join(path2, stem + ".txt")


def fileDocToDocx(path1, path2, file_list):
    """Convert each listed Word document to a text file.

    Despite its historical name, this function produces ``.txt`` files.

    Args:
        path1: Root directory the source documents were collected from.
        path2: Root directory to write the text files into; the directory
            layout below *path1* is mirrored.
        file_list: ``[[path], ...]`` as returned by :func:`walkFile`.

    Returns:
        ``[[txt_path], ...]`` in the same order as *file_list*.
    """
    txtFilesList = []
    for entry in file_list:
        source = entry[0]
        target = _txt_target_path(path1, path2, source)
        print(source, target)
        txtFilesList.append([target])
        target_dir = os.path.dirname(target)
        if target_dir:
            # Word will not create missing folders when saving.
            os.makedirs(target_dir, exist_ok=True)
        getWordPath(source, target)
    return txtFilesList


def translate_zh(str):
    """Extract the Chinese text from *str*.

    Every character outside U+4E00..U+9FA5 acts as a separator; the
    remaining runs of Chinese characters are joined with commas.

    Returns:
        The comma-joined Chinese runs, or ``""`` when there are none.
    """
    return ",".join(_NON_CHINESE.sub(" ", str).split())


def readTxt(filePath, encoding=None):
    """Read a text file and return its content as one normalised string.

    Spaces and newlines are removed from every line, and a line that is
    exactly four characters long after that is prefixed with ``"####"``.

    Args:
        filePath: Path of the text file.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            uses the platform default encoding.

    Returns:
        The concatenated lines.
    """
    parts = []
    with open(filePath, mode='r', encoding=encoding) as fileobj:
        for line in fileobj:
            line = line.replace(" ", "").replace("\n", "")
            if len(line) == 4:
                line = "####" + line
            parts.append(line)
    # NOTE(review): the "????" / "??" literals look like characters that were
    # lost to mis-encoding when this script was published; they are kept
    # as-is. Confirm the intended characters.
    return "".join(parts).replace("????", "??")


def main():
    """Convert every ``.doc`` under the résumé folder and print its text."""
    path1 = "D:\\jianli\\"
    path2 = "D:\\jianli\\"
    # Collect all .doc files.
    file_list = walkFile(path1)
    # Convert them to .txt so they can be read as plain text.
    txtxFilesList = fileDocToDocx(path1, path2, file_list)
    print("待解析文件集合大小:", len(txtxFilesList))
    # Give the office process time to settle, then make sure it is gone
    # before the generated files are read.
    time.sleep(1)
    printPids()
    time.sleep(2)
    for dx in txtxFilesList:
        filePath = dx[0]
        try:
            print("文件路徑-----：", filePath)
            texts = readTxt(filePath)
            print("文檔內容", texts)
        except Exception as e:
            # Keep going: one unreadable file must not stop the batch.
            print("解析異常：", filePath, e)
            continue


if __name__ == '__main__':
    main()

文檔存放路徑可以自行修改，運行代碼后：同級目錄出現 xxx.txt 文件

總結

以上是生活随笔為你收集整理的Python3.6 word批量转换为txt提取的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：网站主机和服务器选哪个,虚拟主机和服务器
下一篇： .f' '或者.F' '或者string

python

Python3.6 word批量转换为txt提取

1.流程：批量讀取文件夾下文件，批量轉換word為txt文件，讀取txt文件內容

2.word文件放入：D:\jianli，文件夾下放入一個word文件

總結

2.word文件放入：D:\jianli，文件夾下放入一個word文件