Skip to content

Commit

Permalink
add doc/docx support via Apache POI
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Oct 21, 2019
1 parent 4a7e4f3 commit 4aa5983
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 49 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ project("grobid-core") {
compile 'javax.xml.bind:jaxb-api:2.3.0'
compile 'black.ninia:jep:3.8.2'
compile 'org.slf4j:slf4j-log4j12:1.7.25'
compile "fr.opensagres.xdocreport:org.apache.poi.xwpf.converter.pdf:1.0.6"
//compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"

Expand Down
139 changes: 132 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,19 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.poi.xwpf.converter.pdf.PdfConverter;
import org.apache.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
* Input document to be processed, which could come from a PDF or directly be an XML file.
* If from a PDF document, this is the place where pdftoxml is called.
* Input document to be processed, which could come from a PDF, a doc/docx or directly be an XML file.
* If from a PDF document, this is the place where pdfalto is called.
* If from a doc/docx document, this is the place where the file is first converted to PDF with Apache POI.
*/
public class DocumentSource {
private static final Logger LOGGER = LoggerFactory.getLogger(DocumentSource.class);
Expand All @@ -30,9 +35,10 @@ public class DocumentSource {
public static final int PDFTOXML_FILES_AMOUNT_LIMIT = 5000;

private File pdfFile;
private File docxFile;
private File xmlFile;
boolean cleanupXml = false;

boolean cleanupPdf = false;

private DocumentSource() {
}
Expand All @@ -41,6 +47,10 @@ public static DocumentSource fromPdf(File pdfFile) {
return fromPdf(pdfFile, -1, -1);
}

/**
 * Builds a document source from a doc/docx file without an explicit page range.
 * Delegates to {@link #fromDocx(File, int, int)} with -1 for both the start and the end page.
 *
 * @param docxFile the doc/docx input file
 * @return the document source built by the delegated overload
 */
public static DocumentSource fromDocx(File docxFile) {
    final int unsetPage = -1;
    return fromDocx(docxFile, unsetPage, unsetPage);
}

/**
* By default the XML extracted from the PDF is without images, to avoid flooding the grobid-home/tmp directory,
* but with the extra annotation file and with outline
Expand All @@ -49,6 +59,10 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage) {
return fromPdf(pdfFile, startPage, endPage, false, true, false);
}

/**
 * Builds a document source from a doc/docx file for the given page range, using the
 * default flags: no images, with annotations, no outline.
 *
 * @param docxFile  the doc/docx input file
 * @param startPage start page, forwarded unchanged to the full overload
 * @param endPage   end page, forwarded unchanged to the full overload
 * @return the document source built by the full overload
 */
public static DocumentSource fromDocx(File docxFile, int startPage, int endPage) {
    final boolean withImages = false;
    final boolean withAnnotations = true;
    final boolean withOutline = false;
    return fromDocx(docxFile, startPage, endPage, withImages, withAnnotations, withOutline);
}

public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage,
boolean withImages, boolean withAnnotations, boolean withOutline) {
if (!pdfFile.exists() || pdfFile.isDirectory()) {
Expand All @@ -71,6 +85,35 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage,
return source;
}

/**
 * Builds a document source from a doc/docx file: the file is first converted to a temporary
 * PDF, the PDF is then converted to XML with {@code pdf2xml}, and the temporary PDF is removed.
 *
 * @param docxFile        the doc/docx input file
 * @param startPage       start page, forwarded to {@code pdf2xml}
 * @param endPage         end page, forwarded to {@code pdf2xml}
 * @param withImages      forwarded to {@code pdf2xml} (and to {@code close} on failure)
 * @param withAnnotations forwarded to {@code pdf2xml} (and to {@code close} on failure)
 * @param withOutline     forwarded to {@code pdf2xml} (and to {@code close} on failure)
 * @return the document source; its XML file stays unset when the doc/docx to PDF conversion failed
 * @throws GrobidException if the input file does not exist or is a directory
 */
public static DocumentSource fromDocx(File docxFile, int startPage, int endPage,
                                      boolean withImages, boolean withAnnotations, boolean withOutline) {
    if (!docxFile.exists() || docxFile.isDirectory()) {
        throw new GrobidException("Input doc/docx file " + docxFile + " does not exist or a directory",
            GrobidExceptionStatus.BAD_INPUT_DATA);
    }

    DocumentSource source = new DocumentSource();
    source.cleanupXml = true;
    // The temporary PDF is never stored in source.pdfFile, so the cleanupPdf branch of close()
    // receives null and does nothing; the temporary PDF is removed explicitly below.
    source.cleanupPdf = true;

    // preliminary convert doc/docx file into PDF
    File temporaryPdf = source.docxToPdf(docxFile, GrobidProperties.getTempPath());

    // NOTE(review): when the conversion failed (null), the source is still returned with no XML file;
    // callers presumably fail later on the missing XML - confirm whether an exception is preferable.
    if (temporaryPdf != null) {
        boolean xmlCreated = false;
        try {
            // create an ALTO representation
            source.xmlFile = source.pdf2xml(null, false, startPage, endPage, temporaryPdf,
                GrobidProperties.getTempPath(), withImages, withAnnotations, withOutline);
            xmlCreated = true;
        } catch (Exception e) {
            source.close(withImages, withAnnotations, withOutline);
            throw e;
        } finally {
            if (xmlCreated) {
                // success path: a failed deletion is reported to the caller, as before
                source.cleanPdfFile(temporaryPdf);
            } else {
                // failure path: cleanup must not replace the exception already in flight
                try {
                    source.cleanPdfFile(temporaryPdf);
                } catch (RuntimeException cleanupFailure) {
                    LOGGER.warn("Cannot delete temporary PDF file '{}' after a failed conversion",
                        temporaryPdf, cleanupFailure);
                }
            }
        }
    }
    source.docxFile = docxFile;
    return source;
}

private String getPdfToXmlCommand(boolean withImage, boolean withAnnotations, boolean withOutline) {
StringBuilder pdfToXml = new StringBuilder();
pdfToXml.append(GrobidProperties.getPdfToXMLPath().getAbsolutePath());
Expand Down Expand Up @@ -351,11 +394,85 @@ private boolean cleanXmlFile(File pathToXml, boolean cleanImages, boolean cleanA
return success;
}

/**
 * Deletes the given temporary PDF file when it exists.
 *
 * @param pathToPdf the file to delete, may be null
 * @return true when the file existed and was deleted, false when it is null or does not exist
 * @throws GrobidResourceException if the deletion fails or any other exception occurs meanwhile
 */
private boolean cleanPdfFile(File pathToPdf) {
    try {
        // nothing to delete
        if (pathToPdf == null || !pathToPdf.exists()) {
            return false;
        }
        if (pathToPdf.delete()) {
            return true;
        }
        throw new GrobidResourceException("Deletion of a temporary PDF file failed for file '" + pathToPdf.getAbsolutePath() + "'");
    } catch (GrobidResourceException deletionFailure) {
        // already the expected exception type, pass it through untouched
        throw deletionFailure;
    } catch (Exception unexpected) {
        throw new GrobidResourceException("An exception occurred while deleting an PDF file '" + pathToPdf + "'.", unexpected);
    }
}

/**
 * Convert a doc/docx file to PDF format using Apache POI (via the opensagres converter).
 * The current thread is used for the execution.
 *
 * @param docxFile docx/doc file to convert
 * @param tmpPath  temp directory in which the converted file is written
 * @return the converted file, or null if the input is missing or the conversion failed
 */
private File docxToPdf(File docxFile, File tmpPath) {
    if (docxFile == null || !docxFile.exists()) {
        LOGGER.error("Invalid doc/docx file for PDF conversion");
        return null;
    }

    // target PDF file, named with a generated key
    final File targetPdf = new File(tmpPath, KeyGen.getKey() + ".pdf");
    try (
        InputStream is = new FileInputStream(docxFile);
        OutputStream out = new FileOutputStream(targetPdf)
    ) {
        long start = System.currentTimeMillis();

        // load the docx file into XWPFDocument
        // NOTE(review): XWPFDocument reads the OOXML (.docx) format; a legacy binary .doc input
        // presumably fails here and ends up in the catch block below - confirm.
        XWPFDocument document = new XWPFDocument(is);

        // PDF options
        // note: the default font encoding will be unicode, but it does not always work given the docx fonts,
        // it is possible to set explicitly a font encoding like this:
        // options = PdfOptions.create().fontEncoding("windows-1250");
        // A PDF/A conformance level could also be enforced through options.setConfiguration(...)
        // with an IPdfWriterConfiguration, for safer PDF processing by pdfalto (not enabled).
        PdfOptions options = PdfOptions.create();

        // converting XWPFDocument to PDF
        PdfConverter.getInstance().convert(document, out, options);
        LOGGER.info("docx file converted to PDF in : {} milli seconds", System.currentTimeMillis() - start);

        // TBD: for using the more recent version 2.0.2 of fr.opensagres.poi.xwpf.converter.core, see
        // https://stackoverflow.com/questions/51330192/trying-to-make-simple-pdf-document-with-apache-poi
        return targetPdf;
    } catch (Throwable e) {
        // Deliberately broad: any failure of the converter is reported as "no PDF" (null).
        LOGGER.error("converting doc/docx into PDF failed", e);

        // Both streams are already closed when this block runs; remove the empty or partial
        // output so that failed conversions do not accumulate files in the temp directory.
        if (targetPdf.exists() && !targetPdf.delete()) {
            LOGGER.warn("Cannot delete partially written PDF file '{}'", targetPdf.getAbsolutePath());
        }
        return null;
    }
}

public void close(boolean cleanImages, boolean cleanAnnotations, boolean cleanOutline) {
try {
if (cleanupXml) {
cleanXmlFile(xmlFile, cleanImages, cleanAnnotations, cleanOutline);
}
if (cleanupPdf) {
cleanPdfFile(pdfFile);
}
} catch (Exception e) {
LOGGER.error("Cannot cleanup resources (just printing exception):", e);
Expand All @@ -369,21 +486,29 @@ public static void close(DocumentSource source, boolean cleanImages, boolean cle
}

public File getPdfFile() {
return pdfFile;
return this.pdfFile;
}

/**
 * Sets the PDF file of this document source.
 *
 * @param pdfFile the PDF file to store
 */
public void setPdfFile(File pdfFile) {
this.pdfFile = pdfFile;
}

public File getXmlFile() {
return xmlFile;
return this.xmlFile;
}

public void setXmlFile(File xmlFile) {
public void setXmlFile(File docxFile) {
this.xmlFile = xmlFile;
}

/**
 * Returns the doc/docx file of this document source.
 *
 * @return the doc/docx file, or null if none has been set
 */
public File getDocxFile() {
    return this.docxFile;
}

/**
 * Sets the doc/docx file of this document source.
 *
 * @param docxFile the doc/docx file to store
 */
public void setDocxFile(File docxFile) {
this.docxFile = docxFile;
}

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;

import java.nio.charset.StandardCharsets;

Expand Down Expand Up @@ -105,11 +106,18 @@ public FullTextParser(EngineParsers parsers) {
tmpPath = GrobidProperties.getTempPath();
}

public Document processing(File inputPdf,
public Document processing(File input,
GrobidAnalysisConfig config) throws Exception {
DocumentSource documentSource =
DocumentSource.fromPdf(inputPdf, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
DocumentSource documentSource = null;
String extension = FilenameUtils.getExtension(input.getName());
if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) {
documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
} else {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
}

return processing(documentSource, config);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package org.grobid.core.engines;

import com.google.common.base.Splitter;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.io.FilenameUtils;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Date;
Expand Down Expand Up @@ -75,7 +78,13 @@ public HeaderParser(EngineParsers parsers) {
public Pair<String, Document> processing(File input, BiblioItem resHeader, GrobidAnalysisConfig config) {
DocumentSource documentSource = null;
try {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
String extension = FilenameUtils.getExtension(input.getName());
if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) {
documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage());
} else {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
}

Document doc = parsers.getSegmentationParser().processing(documentSource, config);

String tei = processingHeaderSection(config.getConsolidateHeader(), doc, resHeader);
Expand All @@ -91,10 +100,17 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
* Processing without application of the segmentation model, regex are used to identify the header
* zone.
*/
public Pair<String, Document> processing2(String pdfInput, BiblioItem resHeader, GrobidAnalysisConfig config) {
public Pair<String, Document> processing2(String input, BiblioItem resHeader, GrobidAnalysisConfig config) {
DocumentSource documentSource = null;
try {
documentSource = DocumentSource.fromPdf(new File(pdfInput), config.getStartPage(), config.getEndPage());
String extension = FilenameUtils.getExtension(input);
if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) {
documentSource = DocumentSource.fromDocx(new File(input), config.getStartPage(), config.getEndPage());
}
else {
documentSource = DocumentSource.fromPdf(new File(input), config.getStartPage(), config.getEndPage());
}

Document doc = new Document(documentSource);
doc.addTokenizedDocument(config);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,19 @@ public static String readFile(String pPathToFile) throws IOException {
}

/**
* Write an input stream in temp directory.
* Write an input stream in temp directory, default is PDF file
*/
public static File writeInputFile(InputStream inputStream) {
    // no extension given: fall back to the PDF extension
    final String defaultExtension = "pdf";
    return writeInputFile(inputStream, defaultExtension);
}

public static File writeInputFile(InputStream inputStream, String extension) {
LOGGER.debug(">> set origin document for stateless service'...");

File originFile = null;
OutputStream out = null;
try {
originFile = newTempFile("origin", ".pdf");
originFile = newTempFile("origin", extension);

out = new FileOutputStream(originFile);

Expand Down Expand Up @@ -99,6 +103,8 @@ public static File writeInputFile(InputStream inputStream) {
*/
public static File newTempFile(String fileName, String extension) {
try {
if (!extension.startsWith("."))
extension = "." + extension;
return File.createTempFile(fileName, extension, GrobidProperties.getTempPath());
} catch (IOException e) {
throw new GrobidResourceException(
Expand Down
Loading

0 comments on commit 4aa5983

Please sign in to comment.