Skip to content

Commit

Permalink
Format code
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenq committed Dec 3, 2023
1 parent 62d7238 commit 0ec859f
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 35 deletions.
3 changes: 0 additions & 3 deletions src/main/java/net/sourceforge/tess4j/util/Hocr2PdfParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
import javax.xml.parsers.SAXParserFactory;
import java.awt.*;
import java.io.*;
import java.util.HashMap;
import java.util.Map;

import static java.lang.Float.valueOf;

Expand Down Expand Up @@ -255,7 +253,6 @@ public Hocr2PdfParser(String hocrFilepath, PDDocument pdDocument, boolean visibl
}

public void parse() throws SAXException, IOException, ParserConfigurationException {

SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
XMLReader reader = parser.getXMLReader();
Expand Down
15 changes: 2 additions & 13 deletions src/main/java/net/sourceforge/tess4j/util/PdfBoxUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,6 @@
*/
package net.sourceforge.tess4j.util;

import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.slf4j.LoggerFactory;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FilenameFilter;
Expand Down Expand Up @@ -231,20 +223,17 @@ public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) {

/**
* Merge text from hocr file into a pdf
*
* @param inputHocr input hocr file
* @param inputPdfStr input pdf file
* @param outputPdfStr ouput pdf file result of merging
* @param visible does the text are visible or not
* @throws Exception
*/
public static void mergeHocrIntoAPdf(String inputHocr, String inputPdfStr, String outputPdfStr, boolean visible) throws Exception {
try (
PDDocument pdDocument = Loader.loadPDF(new File(inputPdfStr))
) {
try (PDDocument pdDocument = Loader.loadPDF(new File(inputPdfStr))) {
final Hocr2PdfParser hocr2PdfParser = new Hocr2PdfParser(inputHocr, pdDocument, visible, false, null);

hocr2PdfParser.parse();

pdDocument.save(outputPdfStr);
}
}
Expand Down
23 changes: 4 additions & 19 deletions src/test/java/net/sourceforge/tess4j/util/PdfBoxUtilitiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,61 +29,47 @@ public void setUp() {
}

@Test
void mergeHocrIntoAPdf_multiplePages() throws Exception {
public void mergeHocrIntoAPdf_multiplePages() throws Exception {
String hOcrFilename = String.format("%s/%s", this.testResourcesDataPath, "multipage-img.hocr");
String pdfFilename = String.format("%s/%s", this.testResourcesDataPath, "multipage-img.pdf");
String outputPdf = "target/test-classes/test-results/multipage-img-with-hocr.pdf";

PdfBoxUtilities.mergeHocrIntoAPdf(hOcrFilename, pdfFilename, outputPdf, false);

assertPdfContainsText("Auf der Registerkarte 'Einflgen' enthalten", outputPdf);
}

@Test
void mergeHocrIntoAPdf_singlePage() throws Exception {
public void mergeHocrIntoAPdf_singlePage() throws Exception {
String hOcrFilename = String.format("%s/%s", this.testResourcesDataPath, "eurotext.hocr");
String pdfFilename = String.format("%s/%s", this.testResourcesDataPath, "eurotext.pdf");
String outputPdf = "target/test-classes/test-results/eurotext-withHocr.pdf";

PdfBoxUtilities.mergeHocrIntoAPdf(hOcrFilename, pdfFilename, outputPdf, false);

assertPdfContainsText("The (quick) [brown]", outputPdf);
}

@Test
void mergeHocrIntoAPdf_createHocrThenMergeToPDF() throws Exception {
public void mergeHocrIntoAPdf_createHocrThenMergeToPDF() throws Exception {
String pdfFilename = String.format("%s/%s", this.testResourcesDataPath, "eurotext.pdf");
File imageFile1 = new File(pdfFilename);
String outputbase1 = "target/test-classes/test-results/docrenderer2-1";
String outputbase2 = "target/test-classes/test-results/docrenderer2-1-merge.pdf";

List<ITesseract.RenderedFormat> formats = new ArrayList<>(Arrays.asList(ITesseract.RenderedFormat.HOCR));

instance.createDocuments(new String[]{imageFile1.getPath()}, new String[]{outputbase1}, formats);

assertTrue(new File(outputbase1 + ".hocr").exists());

PdfBoxUtilities.mergeHocrIntoAPdf(outputbase1 + ".hocr", pdfFilename, outputbase2, false);

assertTrue(new File(outputbase2).exists());
assertPdfContainsText("The (quick) [brown]", outputbase2);
}

@Test
void mergeHocrIntoAPdf_createHocrOnMultipageThenMergeToPDF() throws Exception {
public void mergeHocrIntoAPdf_createHocrOnMultipageThenMergeToPDF() throws Exception {
String pdfFilename = String.format("%s/%s", this.testResourcesDataPath, "multipage-img.pdf");
File imageFile1 = new File(pdfFilename);
String outputbase1 = "target/test-classes/test-results/docrenderer2-2";
String outputbase2 = "target/test-classes/test-results/docrenderer2-2-merge.pdf";

List<ITesseract.RenderedFormat> formats = new ArrayList<>(Arrays.asList(ITesseract.RenderedFormat.HOCR));

instance.createDocuments(new String[]{imageFile1.getPath()}, new String[]{outputbase1}, formats);

assertTrue(new File(outputbase1 + ".hocr").exists());

PdfBoxUtilities.mergeHocrIntoAPdf(outputbase1 + ".hocr", pdfFilename, outputbase2, false);

assertPdfContainsText("Auf der Registerkarte 'Einflgen' enthalten", outputbase2);
}

Expand All @@ -92,5 +78,4 @@ private void assertPdfContainsText(String expectedString, String pdfFilepath) th
String extractText = new PDFTextStripper().getText(doc);
assertTrue(extractText.contains(expectedString));
}

}

0 comments on commit 0ec859f

Please sign in to comment.