PDF区域文本提取工具
- 📢博客主頁:https://blog.csdn.net/as604049322
- 📢歡迎點贊 👍 收藏 ⭐留言 📝 歡迎討論!
- 📢本文由 小小明-代碼實體 原創,首發于 CSDN🙉
- 📢未來很長,值得我們全力奔赴更美好的生活✨
去年年底,我分享了通過python對PDF指定區域提取文字,詳見:
對pdf指定區域截圖并提取文字
https://blog.csdn.net/as604049322/article/details/111939952
我們在用代碼不斷測試區域范圍時還挺需要感覺的,試想如果我們可以開發一個圖像化工具,直接鼠標框選提取區域就精準的太多了,不需要不斷的測試。
經過幾天的研究,完全不會wxpython的我總算是實現了一個非常精簡的圖像化的PDF區域選擇提取工具,整體效果如下:
功能簡介
打開軟件后界面如下:
點擊打開文件按鈕打開之前的PDF文件后效果如下:
框選區域后,標題欄會自動顯示當前框選的區域提取到的文字,還可以通過左右按鈕切換頁面:
實際我們需要提取文字的區域可能不止這一個,所以程序支持多區域框選:
完成區域框選后就可以點擊保存文件,將PDF每頁提取到的文本保存到一個csv文件中,當前選區的保存結果如下:
可以看到已經按框選順序依次保存了每一個區域的字符串。
如果選擇區域時發現提取結果不準確,可以撤銷后重新選擇:
保存圖片則會將PDF的每頁的整體保存為一張圖片,未選擇區域時,以頁碼為文件名保存圖片:
選擇區域時,會自動提取最后一個區域的文本作為當前頁的文件名:
開發代碼
當然這個項目由于本人是第一次使用wxpython,功能非常簡約,現在將完整代碼開源出來,期待各位大佬的改進。
源碼和已編譯工具下載地址:
https://codechina.csdn.net/as604049322/python_gui
完整代碼:
"""Graphical PDF region text extractor.

Code by 小小明 (Xiaoxiaoming).
CSDN home page: https://blog.csdn.net/as604049322

A wxPython panel renders one PDF page at a time (via PyMuPDF), lets the user
drag rectangles over it with the mouse, and extracts the text inside those
rectangles.  The selections can be exported for every page as a CSV file, and
every page can be exported as a PNG image.
"""
__author__ = '小小明'
__time__ = '2021/11/24'

import csv
import os
import re
import subprocess
import sys
import tempfile

import fitz
import wx

# A selection narrower or shorter than this many pixels is treated as an
# accidental click and discarded.
MIN_SELECTION_SIZE = 5

# The rendered page is handed to wx through a PNG file.  It lives in the
# system temp directory because the "open file" dialog changes the working
# directory to wherever the PDF is, which may be read-only.
_PREVIEW_IMAGE_PATH = os.path.join(
    tempfile.gettempdir(), "pdf_region_extractor_preview.png")

# Characters that cannot appear in a file name on common file systems
# (Windows being the strictest), plus ASCII control characters.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')
_MAX_FILENAME_LENGTH = 100


def _safe_filename(text, fallback):
    """Turn extracted PDF text into a usable file name stem.

    Illegal characters are replaced by ``_``, trailing dots/spaces are
    removed and the result is truncated.  ``fallback`` is returned when
    nothing usable is left (e.g. the selected region contains no text).
    """
    cleaned = _ILLEGAL_FILENAME_CHARS.sub("_", text)
    cleaned = cleaned[:_MAX_FILENAME_LENGTH].strip(" .")
    return cleaned or fallback


def _open_with_default_app(path):
    """Open a file or folder with the desktop's default handler.

    This is a convenience only, so a failure to launch the handler is
    ignored instead of interrupting the export that already succeeded.
    The path is never passed through a shell, so spaces and shell
    metacharacters in it are harmless.
    """
    try:
        if hasattr(os, "startfile"):  # Windows
            os.startfile(path)
        elif sys.platform == "darwin":
            subprocess.Popen(["open", path])
        else:
            subprocess.Popen(["xdg-open", path])
    except OSError:
        pass


class MyCanvas(wx.Panel):
    """Panel that shows a PDF page and collects rectangular selections.

    Attributes set once a PDF has been opened:
        pdfDoc:    the open PyMuPDF document.
        i:         zero-based index of the page currently shown.
        pageCount: number of pages in ``pdfDoc``.
        path:      base name of the opened file (used in the window title).
        bmp:       bitmap of the page currently shown.

    ``rects`` holds the selections as ``[x, y, width, height]`` lists in
    panel pixel coordinates.  Pages are rendered with an identity matrix, so
    these coordinates are also PDF page coordinates.
    """

    def __init__(self, parent):
        wx.Panel.__init__(self, parent)
        self.parent = parent
        self.rects = []
        # True between a left-button press and the matching release.
        self._selecting = False

        for mouse_event in (wx.EVT_LEFT_DOWN, wx.EVT_LEFT_UP, wx.EVT_MOTION):
            self.Bind(mouse_event, self.OnLeftButtonEvent)
        self.Bind(wx.EVT_PAINT, self.DoDrawing)

        # (label, position, size, handler) for the toolbar-like button row.
        buttons = (
            ("打開文件", (0, 0), wx.DefaultSize, self.OnButton),
            ("保存文件", (75, 0), wx.DefaultSize, self.save_file),
            ("保存圖片", (150, 0), wx.DefaultSize, self.save_img),
            ("撤銷選區", (225, 0), wx.DefaultSize, self.back_select),
            ("《", (300, 0), (25, 25), self.previous),
            ("》", (325, 0), (25, 25), self.next),
        )
        for label, pos, size, handler in buttons:
            button = wx.Button(self, -1, label, pos, size=size)
            self.Bind(wx.EVT_BUTTON, handler, button)

        # Progress indicator for the image export.
        self.g1 = wx.Gauge(self, -1, 100, (0, 30), (-1, 100), wx.GA_VERTICAL)

    def _turn_page(self, step):
        """Move ``step`` pages forward/backward, staying inside the document."""
        if not hasattr(self, "pdfDoc"):
            return
        target = self.i + step
        if not 0 <= target < self.pageCount:
            return
        self.i = target
        self.change_pdf_page(self.i, False)
        self.DoDrawing(-1)
        if self.rects:
            # Show what the latest selection yields on the new page.
            self.parent.SetTitle(self.path + "|" + self.extract_pdf_text())

    def previous(self, evt):
        """Button handler: show the previous page, if there is one."""
        self._turn_page(-1)

    def next(self, evt):
        """Button handler: show the next page, if there is one."""
        self._turn_page(1)

    def back_select(self, evt):
        """Button handler: remove the most recent selection and redraw."""
        if self.rects:
            self.rects.pop()
        self.DoDrawing(-1)

    def OnButton(self, evt):
        """Button handler: ask for a PDF file and display its first page."""
        dlg = wx.FileDialog(
            self, message="選擇一個PDF文件",
            defaultDir=os.getcwd(),
            defaultFile="",
            wildcard="PDF文件(*.pdf)|*.pdf",
            style=wx.FD_OPEN | wx.FD_CHANGE_DIR |
            wx.FD_FILE_MUST_EXIST | wx.FD_PREVIEW)
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            path = dlg.GetPath()
        finally:
            dlg.Destroy()

        # Open the new document before touching any state, so a file that
        # cannot be opened leaves the currently shown document intact.
        new_doc = fitz.open(path)
        old_doc = getattr(self, "pdfDoc", None)
        if old_doc is not None:
            old_doc.close()

        self.rects = []
        self._selecting = False
        self.pdfDoc = new_doc
        self.i = 0
        # len() is used instead of the deprecated camelCase ``pageCount``
        # property, which newer PyMuPDF releases no longer provide.
        self.pageCount = len(self.pdfDoc)
        self.change_pdf_page(self.i)
        self.path = os.path.basename(path)
        self.parent.SetTitle(self.path)
        self.DoDrawing(-1)

    def change_pdf_page(self, i, move=True):
        """Render page ``i`` at 1:1 scale and make it the displayed bitmap.

        Args:
            i: zero-based page index.
            move: passed on to ``change_img``; when true the frame is
                re-centred after resizing.
        """
        page = self.pdfDoc[i]
        rect = page.rect
        print("pdf范圍:", rect)
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False, clip=rect)
        pix.save(_PREVIEW_IMAGE_PATH)
        self.change_img(_PREVIEW_IMAGE_PATH, move)

    def save_FileDialog(self, format="csv"):
        """Ask where to save a file with extension ``format``.

        Returns:
            The chosen path, or ``None`` if the dialog was cancelled.
        """
        dlg = wx.FileDialog(
            self, message=f"保存一個{format}文件",
            defaultDir=os.getcwd(),
            defaultFile="",
            wildcard=f"{format}文件(*.{format})|*.{format}",
            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT)
        try:
            if dlg.ShowModal() == wx.ID_OK:
                return dlg.GetPath()
            return None
        finally:
            dlg.Destroy()

    def save_img(self, evt):
        """Button handler: export every page as a PNG into a chosen folder.

        Without selections the files are named after the page index
        (``p000.png``, ...).  With selections, the text found in the latest
        selection on each page is used as that page's file name; the page
        index is used instead when that text is empty.
        """
        if not hasattr(self, "pdfDoc"):
            return
        dlg = wx.DirDialog(self, "選擇圖片保存的文件夾:",
                           style=wx.DD_DEFAULT_STYLE)
        try:
            # Nothing to export (and no folder to open) on cancel.
            if dlg.ShowModal() != wx.ID_OK:
                return
            folder = dlg.GetPath()
        finally:
            dlg.Destroy()

        mat = fitz.Matrix(1, 1)
        page_total = len(self.pdfDoc)
        for i in range(page_total):
            page = self.pdfDoc[i]
            pix = page.get_pixmap(matrix=mat, alpha=False, clip=page.rect)
            name = f"p{i:0>3d}"
            if self.rects:
                text = self.extract_pdf_text(page=page, rect=self.rects[-1])
                name = _safe_filename(text, name)
            pix.save(os.path.join(folder, f"{name}.png"))
            self.g1.SetValue((i + 1) * 100 // page_total)
        _open_with_default_app(folder)

    def save_file(self, evt):
        """Button handler: export the text of every selection on every page.

        The CSV has one column per selection (in selection order) and one
        row per page, preceded by a header row.
        """
        if not hasattr(self, "pdfDoc"):
            return
        path = self.save_FileDialog()
        if path is None:
            return

        data = []
        for page_index in range(len(self.pdfDoc)):
            page = self.pdfDoc[page_index]
            data.append(
                [self.extract_pdf_text(page, rect) for rect in self.rects])

        # The header depends only on the number of selections, so it is
        # valid even for a document without pages.
        header = [f"區域{i}" for i in range(1, len(self.rects) + 1)]

        # UTF-8 with BOM: PDF text may contain characters the locale codec
        # cannot encode, and the BOM lets spreadsheet programs detect UTF-8.
        with open(path, "w", encoding="utf-8-sig") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(header)
            writer.writerows(data)
        _open_with_default_app(path)

    def extract_pdf_text(self, page=None, rect=None):
        """Return the stripped text inside ``rect`` on ``page``.

        Args:
            page: PyMuPDF page; defaults to the page currently shown.
            rect: ``(x, y, width, height)``; defaults to the latest
                selection.

        Returns:
            The extracted text, or ``""`` when no rectangle is given and no
            selection exists.
        """
        if page is None:
            page = self.pdfDoc[self.i]
        if rect is None:
            if not self.rects:
                return ""
            rect = self.rects[-1]
        left, top, width, height = rect
        # Selections are stored as origin + size; fitz.Rect wants corners.
        clip = fitz.Rect(left, top, left + width, top + height)
        return page.get_text(clip=clip).strip()

    def change_img(self, img_path, move=True):
        """Load ``img_path`` as the page bitmap and resize panel and frame."""
        self.bmp = wx.Bitmap(img_path)
        self.SetSize(self.bmp.GetSize())
        self.parent.SetSize(self.parent.GetBestSize())
        if move:
            self.parent.Center()

    def DoDrawing(self, evt):
        """Draw the page bitmap and all selection rectangles.

        Used both as the ``EVT_PAINT`` handler and called directly (with a
        dummy argument such as ``-1``) to redraw immediately.
        """
        from_paint_event = isinstance(evt, wx.PaintEvent)
        if not hasattr(self, "bmp"):
            if from_paint_event:
                # Nothing to draw yet: leave painting to the default handler.
                evt.Skip()
            return
        # Inside a paint handler a wx.PaintDC has to be created; outside of
        # one only a wx.ClientDC is valid.
        dc = wx.PaintDC(self) if from_paint_event else wx.ClientDC(self)
        dc.DrawBitmap(self.bmp, 0, 0, True)
        dc.SetPen(wx.Pen('blue'))
        dc.SetBrush(wx.Brush('white', wx.BRUSHSTYLE_TRANSPARENT))
        dc.DrawRectangleList(self.rects)

    def OnLeftButtonEvent(self, event):
        """Mouse handler implementing rubber-band selection.

        Press starts a selection, dragging resizes it, release either keeps
        it (and shows its text in the window title) or discards it when it
        is smaller than ``MIN_SELECTION_SIZE`` in either dimension.
        """
        if not hasattr(self, "pdfDoc"):
            # Nothing to select on before a PDF is shown.
            return

        if event.LeftDown():
            self.x, self.y = event.GetPosition()
            self.rects.append([self.x, self.y, 0, 0])
            self._selecting = True
            return

        # Dragging() is also true for other mouse buttons, and a release can
        # arrive without a press on this panel; both are ignored unless a
        # selection is actually in progress.
        if not (self._selecting and self.rects):
            return

        if event.Dragging() and event.LeftIsDown():
            x, y = event.GetPosition()
            current = self.rects[-1]
            current[2] = x - self.x
            current[3] = y - self.y
            self.DoDrawing(-1)
        elif event.LeftUp():
            self._selecting = False
            print(self.rects)
            left, top, width, height = self.rects[-1]
            # A drag towards the top/left yields negative sizes; convert to
            # an equivalent rectangle with a top-left origin.
            if width < 0:
                left, width = left + width, -width
            if height < 0:
                top, height = top + height, -height
            if width < MIN_SELECTION_SIZE or height < MIN_SELECTION_SIZE:
                self.rects.pop()
                # Erase the outline of the discarded selection.
                self.DoDrawing(-1)
            else:
                self.rects[-1] = [left, top, width, height]
                self.parent.SetTitle(
                    self.path + "|" + self.extract_pdf_text())


def main():
    """Create the main window and run the wx event loop."""
    app = wx.App()
    frm = wx.Frame(None)
    MyCanvas(frm)  # kept alive by its parent frame
    frm.Center()
    frm.Show()
    frm.SetTitle("PDF文本提取器")
    app.MainLoop()


if __name__ == "__main__":
    main()

# 總結
以上是生活随笔為你收集整理的PDF区域文本提取工具的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: [vue]vue渲染模板时怎么保留模板中
- 下一篇: [css] 说说浏览器解析CSS选择器