图像文字识别(二):java调用tesseract 识别图片文字
生活随笔
收集整理的這篇文章主要介紹了
图像文字识别(二):java调用tesseract 识别图片文字
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
在JAVA中調用tesseract識別圖片的文字內容,主要有兩種方式:cmd方式,tess4j方式。在這篇博客中,主要記錄一下通過cmd命令行的方式。cmd方式,就是通過在java中調用命令行,來執行tesseract,它的原理就是上篇博客所寫的內容。
步驟:
(1)導入兩個jar包:jai_imageio-1.1.1.jar 和 swingx-1.6.1.jar
(2)編寫ImageIOHelper類,用于創建臨時圖片文件,防止損壞初始文件
import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.Locale;import javax.imageio.IIOImage; import javax.imageio.ImageIO; import javax.imageio.ImageReader; import javax.imageio.ImageWriteParam; import javax.imageio.ImageWriter; import javax.imageio.metadata.IIOMetadata; import javax.imageio.stream.ImageInputStream; import javax.imageio.stream.ImageOutputStream;import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam; /** * 類說明 :創(chuàng)建臨時(shí)圖片文件防止損壞初始文件 */ public class ImageIOHelper {//設(shè)置語言private Locale locale=Locale.CHINESE;//自定義語言構(gòu)造的方法public ImageIOHelper(Locale locale){this.locale=locale;}//默認(rèn)構(gòu)造器Locale.CHINESEpublic ImageIOHelper(){}/*** 創(chuàng)建臨時(shí)圖片文件防止損壞初始文件* @param imageFile* @param imageFormat like png,jps .etc* @return TempFile of Image*/public File createImage(File imageFile, String imageFormat) throws IOException {//讀取圖片文件Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat); ImageReader reader = readers.next();//獲取文件流ImageInputStream iis = ImageIO.createImageInputStream(imageFile);reader.setInput(iis);IIOMetadata streamMetadata = reader.getStreamMetadata(); //設(shè)置writeParamTIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE); tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); //設(shè)置可否壓縮 //獲得tiffWriter和設(shè)置outputIterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff"); ImageWriter writer = writers.next(); BufferedImage bi = reader.read(0); IIOImage image = new IIOImage(bi,null,reader.getImageMetadata(0)); File tempFile = tempImageFile(imageFile); ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile); writer.setOutput(ios); writer.write(streamMetadata, image, tiffWriteParam); ios.close();iis.close();writer.dispose(); reader.dispose(); return tempFile; } /*** 給tempfile添加后綴* @param imageFile* @throws IOException */private File 
tempImageFile(File imageFile) throws IOException { String path = imageFile.getPath(); StringBuffer strB = new StringBuffer(path); strB.insert(path.lastIndexOf('.'),"_text_recognize_temp");String s=strB.toString().replaceFirst("(?<=//.)(//w+)$", "tif");Runtime.getRuntime().exec("attrib "+"\""+s+"\""+" +H"); //設(shè)置文件隱藏return new File(strB.toString()); } }(3)創(chuàng)建OCRUtil工具類,用于進(jìn)行圖片文字識(shí)別:
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Locale;import org.jdesktop.swingx.util.OS;/** * 類說明:OCR工具類 */ public class OCRUtil {private final String LANG_OPTION = "-l"; //英文字母小寫l,并非阿拉伯?dāng)?shù)字1 private final String EOL = System.getProperty("line.separator"); private String tessPath = "D://Tesseract//Tsseract-OCR//Tesseract-OCR";//ocr的安裝路徑public OCRUtil(String tessPath,String transFileName){this.tessPath=tessPath;}//OCRUtil的構(gòu)造方法,默認(rèn)路徑是"C://Program Files (x86)//Tesseract-OCR"public OCRUtil(){ }public String getTessPath() {return tessPath;}public void setTessPath(String tessPath) {this.tessPath = tessPath;}public String getLANG_OPTION() {return LANG_OPTION;}public String getEOL() {return EOL;}/*** @param 需要識(shí)別的文件* @param 文件的格式* @return 識(shí)別后的文字*/public String recognizeText(File imageFile,String imageFormat)throws Exception{ File tempImage = new ImageIOHelper().createImage(imageFile,imageFormat); return ocrImages(tempImage, imageFile); } //可以自定義語言public String recognizeText(File imageFile,String imageFormat,Locale locale)throws Exception{ File tempImage = new ImageIOHelper(locale).createImage(imageFile,imageFormat);return ocrImages(tempImage, imageFile);}/*** @param 臨時(shí)文件* @param 需要識(shí)別的文件* @return 識(shí)別后的內(nèi)容* @throws IOException* @throws InterruptedException*/private String ocrImages(File tempImage,File imageFile) throws IOException, InterruptedException{//設(shè)置輸出文件的保存的文件目錄,以及文件名File outputFile = new File(imageFile.getParentFile(),"test");StringBuffer strB = new StringBuffer(); //設(shè)置命令行內(nèi)容List<String> cmd = new ArrayList<String>(); if(OS.isWindowsXP()){ cmd.add(tessPath+"//tesseract"); }else if(OS.isLinux()){ cmd.add("tesseract"); }else{ cmd.add(tessPath+"//tesseract"); } cmd.add(""); cmd.add(outputFile.getName()); cmd.add(LANG_OPTION); 
cmd.add("chi_sim");//中文包c(diǎn)md.add("equ");//常用數(shù)學(xué)公式包c(diǎn)md.add("eng");//英語包//創(chuàng)建操作系統(tǒng)進(jìn)程ProcessBuilder pb = new ProcessBuilder(); pb.directory(imageFile.getParentFile());//設(shè)置此進(jìn)程生成器的工作目錄 cmd.set(1, tempImage.getName()); pb.command(cmd);//設(shè)置要執(zhí)行的cmd命令 pb.redirectErrorStream(true);//設(shè)置后續(xù)子進(jìn)程生成的錯(cuò)誤輸出都將與標(biāo)準(zhǔn)輸出合并 long startTime = System.currentTimeMillis();System.out.println("開始時(shí)間:" + startTime);Process process = pb.start();//開始執(zhí)行,并返回進(jìn)程實(shí)例 //最終執(zhí)行命令為:tesseract 1.png test -l chi_sim+equ+engint w = process.waitFor(); tempImage.delete();//刪除臨時(shí)正在工作文件 if(w==0){ // 0代表正常退出BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8")); String str; while((str = in.readLine())!=null){ strB.append(str).append(EOL); } in.close(); long endTime = System.currentTimeMillis();System.out.println("結(jié)束時(shí)間:" + endTime);System.out.println("耗時(shí):" + (endTime - startTime) + "毫秒");}else{ String msg; switch(w){ case 1: msg = "Errors accessing files.There may be spaces in your image's filename."; break; case 29: msg = "Cannot recongnize the image or its selected region."; break; case 31: msg = "Unsupported image format."; break; default: msg = "Errors occurred."; } tempImage.delete(); throw new RuntimeException(msg); } new File(outputFile.getAbsolutePath()+".txt");//.delete(); return strB.toString().replaceAll("\\s*", ""); } }(4)創(chuàng)建測(cè)試類Test:
import java.io.File; import java.io.IOException;/** * @version 創(chuàng)建時(shí)間:2018年4月25日 下午5:09:19 * 類說明:測(cè)試類 */ public class Test {public static void main(String[] args) {try {//圖片文件:此圖片是需要被識(shí)別的圖片路徑 File file = new File("C://Users//1_20180208150251_x4hzz//1.png");//String recognizeText = new OCRHelper().recognizeText(file);String recognizeText = new OCRUtil().recognizeText(file, "png");System.out.print(recognizeText + "\t");} catch (IOException e) {e.printStackTrace();} catch (Exception e) {e.printStackTrace();}} }至此,只要傳入需要識(shí)別的圖片,就可以識(shí)別出圖片中的文字的內(nèi)容了。
與50位技術專家面對面,20年技術見證,附贈技術全景圖總結
以上是生活随笔為你收集整理的图像文字识别(二):java调用tesseract 识别图片文字的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 图片文字识别(一):tesseract-
- 下一篇: 图像文字识别(三):Tesseract4