java解析pdf
最近遇到一個解析pdf的需求,具體為解析出pdf文件中的標題、內容、圖片、表格等信息,並分類存儲。查遍某度也沒有找到方法,問了https://www.e-iceblue.cn/的技術,也做不到。無奈只能按頁提取文本和圖片了,而且順序也是錯亂的。代碼如下,希望能對有類似需求的小伙伴有所幫助。不過雖然pdf沒搞定,但是搞定了word(doc、docx)的解析,後續會更新出來與大家分享。當然,如果有小伙伴解決了pdf的解析,還請多多指教。需要依賴:
<dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>2.0.24</version>
</dependency>
<repository><id>com.e-iceblue</id><name>e-iceblue</name><url>http://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
測試代碼如下:
package com.aiwrite.file.utils.pdf;import com.aiwrite.common.core.constant.FileConstants;
import com.aiwrite.file.utils.FileUploadUtils;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.exporting.PdfImageInfo;
import com.spire.pdf.widget.PdfPageCollection;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;import javax.imageio.ImageIO;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;/*** @BelongsProject: aiwrite* @BelongsPackage: com.aiwrite.file.utils* @Author: zhousc* @CreateTime: 2022-06-12 17:31* @Description: TODO* @Version: 1.0*/
public class ParsePdfUtilTest {public static void main(String[] args) {String basePath = "E:\\123\\";String pdfPath = basePath + "認證授權開發手冊.pdf";/* =================================================================================== */System.out.println(" ==================== pdf parse begin !!! ==================== ");long l = System.currentTimeMillis();try {// testPdf(pdfPath, basePath);printPdfInfo(pdfPath);// readPDF(pdfPath);} catch (Exception e) {e.printStackTrace();}long l1 = System.currentTimeMillis();System.out.println(" ==================== pdf parse success !!! 共耗時 " + (l1 -l) + " ms ==================== ");/* =================================================================================== */}public static void printPdfInfo(String filePath) throws Exception {//加載測試文檔PdfDocument pdf = new PdfDocument();pdf.loadFromFile(filePath);// 按每頁獲取文件內容PdfPageCollection pages = pdf.getPages();// pdf每頁對象PdfPageBase page;if (pages.getCount() > 0) {for (int i = 0; i < pages.getCount(); i++) {page = pages.get(i);System.out.println(" >>>>> >>>>> >>>>> >>>>> >>>>> 第 " + (i + 1) + " 頁 <<<<< <<<<< <<<<< <<<<< <<<<< ");System.out.println(" ***** ***** ***** ***** ***** 內容如下 ***** ***** ***** ***** ***** ");// 按原內容格式輸出String text = page.extractText(true);System.out.println(text.replace("Evaluation Warning : The document was created with Spire.PDF for Java.",""));// 判斷是否有圖片PdfImageInfo[] imagesInfo = page.getImagesInfo();for (PdfImageInfo pdfImageInfo : imagesInfo) {Rectangle2D bounds = pdfImageInfo.getBounds();int index = pdfImageInfo.getIndex();String base64 = FileUploadUtils.BufferedImageToBase64(pdfImageInfo.getImage());System.out.println(" ===== ===== ===== ===== ===== 圖片坐標信息如下 ===== ===== ===== ===== ===== ");System.out.println(" index: " + index + "; x: " +bounds.getX() + "; y: " + bounds.getY());
// System.out.println(" base64: " + base64);}
// BufferedImage[] bufferedImages = page.extractImages(true);
// if (ObjectUtils.isNotEmpty(bufferedImages)) {
// for (BufferedImage bufferedImage : bufferedImages) {
// System.out.println(" ===== ===== ===== ===== ===== 圖片信息如下 ===== ===== ===== ===== ===== ");
// System.out.println(bufferedImage);
// }
// }}}pdf.close();}public static void testPdf(String filePath, String outPath) throws IOException {//加載測試文檔,實例化StringBuilder類PdfDocument pdf = new PdfDocument(filePath);//定義一個int型變量StringBuilder sb = new StringBuilder();//遍歷PDF文檔中每頁int index = 0;PdfPageBase page;for (int i = 0; i < pdf.getPages().getCount(); i++) {page = pdf.getPages().get(i);//調用extractText()方法提取文本sb.append(page.extractText(true));FileWriter writer;try {//將StringBuilder對象中的文本寫入到txtwriter = new FileWriter(outPath + "ExtractText.txt");writer.write(sb.toString());writer.flush();} catch (IOException e) {e.printStackTrace();}//調用extractImages方法獲取圖片if (ObjectUtils.isNotEmpty(page.extractImages())) {//指定輸出圖片名,指定圖片格式for (BufferedImage image : page.extractImages()) {File output = new File(String.format(outPath + "Image_%d.png", index++));ImageIO.write(image, FileConstants.PICTURE_PNG, output);}}}pdf.close();}/*** 讀PDF文件,使用了pdfbox開源項目* @param fileName*/public static void readPDF(String fileName) {File file = new File(fileName);FileInputStream in = null;try {in = new FileInputStream(fileName);// 新建一個PDF解析器對象PDFParser parser = new PDFParser(new RandomAccessFile(file,"rw"));// 對PDF文件進行解析parser.parse();// 獲取解析后得到的PDF文檔對象PDDocument pdfdocument = parser.getPDDocument();// 新建一個PDF文本剝離器PDFTextStripper stripper = new PDFTextStripper();//sort設置為true 則按照行進行讀取,默認是falsestripper .setSortByPosition(true);// 從PDF文檔對象中剝離文本String result = stripper.getText(pdfdocument);// 寫入到文件
// FileWriter fileWriter = new FileWriter(new File("pdf.txt"));
// fileWriter.write(result);
// fileWriter.flush();
// fileWriter.close();System.out.println("PDF文件的文本內容如下:");System.out.println(result);} catch (Exception e) {System.out.println("讀取PDF文件" + file.getAbsolutePath() + "生失敗!" + e);e.printStackTrace();} finally {if (in != null) {try {in.close();} catch (IOException e1) {}}}}
}
總結
- 上一篇: 指纹识别技术
- 下一篇: 如何排查带宽超过限制?