當前位置：首頁 > 编程语言 > python > 内容正文

python

使用python将pdf转成txt的实践

發布時間：2024/1/1 python 48 豆豆

生活随笔收集整理的這篇文章主要介紹了使用python将pdf转成txt的实践，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

如果本頁顯示錯誤，可以直接看知乎原文：https://zhuanlan.zhihu.com/p/468708078

一.工具準備：
1.pdfconv庫:
GitHub - xieyan0811/pdfconv: 中文PDF轉TXT的實用工具
2.chi_sim中文包:
https://github.com/tesseract-ocr/tessdata/blob/main/chi_sim.traineddata
3.安裝依賴包:
a.先安裝homebrew
由于我是新款mac，搭載M1芯片，因此有額外報錯：(i)
curl: (7) Failed to connect to raw.githubusercontent.com port 443: Operation timed out
需要這樣安裝brew：
arch -x86_64 /bin/bash -c "$(curl -fsSL https://cdn.jsdelivr.net/gh/ineo6/homebrew-install/install.sh)"
b.安裝tessdata依賴包
右鍵終端選擇使用描述文件新建窗口，選擇homebrew，執行：brew install tesseract
pip3 install pymupdf==1.18.19 # PDF格式解析
pip3 install pytesseract # OCR工具的Python支持包
pip3 install baidu-aip # 在線OCR：百度提供的字符識別工具。
將下載好的chi_sim中文包（chi_sim.traineddata）放到/usr/local/share/tessdata文件夾內（沒有這個目錄就建一下）
二.代碼執行
將https://github.com/xieyan0811/pdfconv里面的pdfconv.py拿過來，里面有少許錯誤，我修復了一下，可以正常執行，見下文：
######################################

# tesseract OCR

from PIL import Image
import pytesseract

def img_to_str_tesseract(image_path, lang='chi_sim'):
    """Run local Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to recognize.
        lang: Tesseract language code passed to pytesseract. Defaults to
            ``'chi_sim'`` (Simplified Chinese).

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file handle is
    # closed as soon as OCR has finished.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang=lang)

######################################

# 百度 OCR

from aip import AipOcr

# Credentials for the Baidu OCR service; they are blank here and have to be
# filled in before the online OCR path (OCR_ONLINE = True) can work.
# The dict is unpacked into AipOcr as keyword arguments, so each key must be
# spelled exactly like the constructor parameter it feeds.
config = {
    'appId': '',
    'apiKey': '',
    'secretKey': '',
}

# Module-level Baidu OCR client shared by img_to_str_baidu().
client = AipOcr(**config)

def img_to_str_baidu(image_path):
    """Recognize the text in an image with the Baidu online OCR client.

    Args:
        image_path: Path of the image file whose bytes are sent for OCR.

    Returns:
        The recognized ``words`` entries joined with newlines, or an empty
        string when the response has no ``words_result`` key.
    """
    with open(image_path, 'rb') as fp:
        image = fp.read()

    # The request is issued after the file has been closed, so the handle is
    # not held open for the duration of the network round trip.
    result = client.basicGeneral(image)
    if 'words_result' not in result:
        return ""
    return '\n'.join(w['words'] for w in result['words_result'])

######################################

# 解析PDF文件

import fitz
import time
import re
import os
import sys
import numpy as np

TMPDIR = ‘tmp/’
PARSEIMG = True
OCR_ONLINE = False

# 去掉文中多余的回車

def adjust(inpath, outpath):
    """Re-join hard-wrapped lines of an extracted text file.

    A line whose length (including its newline) is at least the median line
    length of the file is treated as wrapped mid-paragraph, so its trailing
    newline is dropped. Shorter lines are kept as they are. The
    ``-----------`` separator lines are always discarded.

    Args:
        inpath: Path of the raw text file to read.
        outpath: Path the cleaned text is written to (overwritten) via
            ``write_file``.
    """
    with open(inpath) as f:
        lines = f.readlines()

    separator = '-----------\n'
    pieces = []
    if lines:  # np.median of an empty list is NaN and emits a warning
        threshold = np.median([len(line) for line in lines])  # median chars per line
        for line in lines:
            # Test for the separator first: it is 12 characters long, so with
            # a small median it would otherwise count as a "full" line and be
            # glued into the text instead of being dropped.
            if line == separator:
                continue
            if len(line) >= threshold and line.endswith('\n'):
                pieces.append(line[:-1])  # drop the newline at the wrap point
            else:
                pieces.append(line)
    write_file(outpath, ''.join(pieces), 'w')

# 寫入文件

def write_file(path, text, ftype, debug=False):
    """Write ``text`` to ``path`` using the given open mode.

    Args:
        path: Destination file path.
        text: String to write.
        ftype: Mode string handed to ``open`` (for example ``'w'`` to
            overwrite or ``'a'`` to append).
        debug: When true, print the number of characters being written.
    """
    if debug:
        print("write", len(text))
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, ftype) as f:
        f.write(text)

# 刪除文件

def remove(path):
    """Delete a file, or a directory together with everything inside it.

    A path that does not exist is ignored. Symbolic links are unlinked
    themselves and never followed, so a dangling link inside a directory does
    not block its removal and a link to a directory leaves the target intact.

    Args:
        path: File or directory path to delete.
    """
    # lexists() (unlike exists()) also reports dangling symlinks.
    if not os.path.lexists(path):
        return
    if os.path.islink(path) or not os.path.isdir(path):
        os.remove(path)
        return
    for entry in os.listdir(path):
        remove(os.path.join(path, entry))
    os.rmdir(path)  # the directory is empty by now

# 解析PDF文件

def parse(inpath, outpath):
    """Extract a PDF's text, including OCR of its images, into ``outpath``.

    For each page the embedded text (if any) is appended to ``outpath``. Each
    image listed for the page is then saved as ``<counter>.png`` under
    ``TMPDIR`` and recognized with Baidu OCR when ``OCR_ONLINE`` is true, or
    with Tesseract otherwise. The recognized text is printed and appended to
    ``outpath``, followed by a ``-----------`` separator line.

    Args:
        inpath: Path of the PDF file to read.
        outpath: Path of the text file to create; an existing one is removed.
    """
    remove(TMPDIR)  # start from an empty scratch directory
    os.mkdir(TMPDIR)
    remove(outpath)  # results are appended, so stale output must go first

    doc = fitz.open(inpath)
    try:
        # NOTE(review): xrefLength / writePNG are the camelCase PyMuPDF names;
        # they presumably match the 1.18.x release this script was written
        # against -- confirm before upgrading PyMuPDF.
        xref_count = doc.xrefLength()
        print("文件名:{}, 頁數: {}, 對象: {}".format(inpath, len(doc), xref_count - 1))

        imgcount = 0
        for page in doc:
            # Embedded text of the page.
            text = page.get_text()
            if text:
                write_file(outpath, text, 'a')

            # Images referenced by the page.
            for item in page.get_images():
                xref = item[0]
                path = os.path.join(TMPDIR, "{}.png".format(imgcount))

                pix = fitz.Pixmap(doc, xref)
                if pix.n < 5:
                    # Fewer than 5 components: can be stored as PNG directly.
                    pix.writePNG(path)
                else:
                    # Otherwise (CMYK) convert to RGB before saving.
                    rgb = fitz.Pixmap(fitz.csRGB, pix)
                    rgb.writePNG(path)
                    rgb = None
                pix = None  # drop the pixmap reference before running OCR

                if OCR_ONLINE:
                    text = img_to_str_baidu(path)
                else:
                    text = img_to_str_tesseract(path)
                print("img->text", text)
                write_file(outpath, text, 'a')
                write_file(outpath, '\n' + '-----------' + '\n', 'a')
                imgcount += 1
    finally:
        doc.close()

# Script entry point: convert the PDF given as the first command-line argument
# into a .txt file next to it, using a .tmp file for the raw extraction.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("請將pdf文件路徑作為參數")
        sys.exit(-1)
    pdffile = sys.argv[1]
    # Swap only the extension. str.replace('pdf', ...) would rewrite every
    # "pdf" in the path (directory names too) and, for a path without a
    # lowercase "pdf" (e.g. "X.PDF"), would leave the name unchanged so that
    # parse() deletes the input file when clearing its output path.
    stem = os.path.splitext(pdffile)[0]
    tmpfile = stem + '.tmp'
    txtfile = stem + '.txt'
    parse(pdffile, tmpfile)
    adjust(tmpfile, txtfile)
在當前目錄執行 python pdfconv.py xxx.pdf 即可，最終會生成txt

參考文章：
Mac上安裝Tesseract-Ocr并測試
brew 無法安裝iterm2_MacBook Pro M1 通過 Rosetta 2 安裝 Homebrew 教程
GitHub - xieyan0811/pdfconv: 中文PDF轉TXT的實用工具
GitHub - tesseract-ocr/tessdata: Trained models with support for legacy and LSTM OCR engine

總結

以上是生活随笔為你收集整理的使用python将pdf转成txt的实践的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： [乐意黎转载]使用光影魔术手处理照片步骤
下一篇：下取整函数的含义_取整函数