From 4aa5983aa802e4d06594d6834294f2d07a42dd5b Mon Sep 17 00:00:00 2001 From: lopez Date: Mon, 21 Oct 2019 23:23:47 +0200 Subject: [PATCH] add doc/docx support via Apache POI --- build.gradle | 2 + .../grobid/core/document/DocumentSource.java | 139 +++++++++++++++++- .../grobid/core/engines/FullTextParser.java | 16 +- .../org/grobid/core/engines/HeaderParser.java | 22 ++- .../grobid/core/utilities/IOUtilities.java | 10 +- .../org/grobid/service/GrobidRestService.java | 90 +++++++++--- .../process/GrobidRestProcessFiles.java | 63 +++++++- .../src/main/resources/web/grobid/grobid.js | 57 ++++++- .../src/main/resources/web/index.html | 4 +- 9 files changed, 354 insertions(+), 49 deletions(-) diff --git a/build.gradle b/build.gradle index c332b7857d..5f1c2ff1fc 100644 --- a/build.gradle +++ b/build.gradle @@ -226,6 +226,8 @@ project("grobid-core") { compile 'javax.xml.bind:jaxb-api:2.3.0' compile 'black.ninia:jep:3.8.2' compile 'org.slf4j:slf4j-log4j12:1.7.25' + compile "fr.opensagres.xdocreport:org.apache.poi.xwpf.converter.pdf:1.0.6" + //compile group: 'org.apache.tika', name: 'tika-core', version: '1.22' shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1" diff --git a/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java b/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java index eab6dd4fe0..9e9c96918b 100644 --- a/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java +++ b/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java @@ -12,14 +12,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; +import java.io.*; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import org.apache.poi.xwpf.converter.pdf.PdfConverter; +import org.apache.poi.xwpf.converter.pdf.PdfOptions; +import org.apache.poi.xwpf.usermodel.XWPFDocument; + /** - * Input document to be processed, which could come from a PDF or directly be an XML file. 
- * If from a PDF document, this is the place where pdftoxml is called. + * Input document to be processed, which could come from a PDF, a doc/docx or directly be an XML file. + * If from a PDF document, this is the place where pdfalto is called. + * If from a doc/docx document, this is the place where a conversion with Apache POI is realized. */ public class DocumentSource { private static final Logger LOGGER = LoggerFactory.getLogger(DocumentSource.class); @@ -30,9 +35,10 @@ public class DocumentSource { public static final int PDFTOXML_FILES_AMOUNT_LIMIT = 5000; private File pdfFile; + private File docxFile; private File xmlFile; boolean cleanupXml = false; - + boolean cleanupPdf = false; private DocumentSource() { } @@ -41,6 +47,10 @@ public static DocumentSource fromPdf(File pdfFile) { return fromPdf(pdfFile, -1, -1); } + public static DocumentSource fromDocx(File docxFile) { + return fromDocx(docxFile, -1, -1); + } + /** * By default the XML extracted from the PDF is without images, to avoid flooding the grobid-home/tmp directory, * but with the extra annotation file and with outline @@ -49,6 +59,10 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage) { return fromPdf(pdfFile, startPage, endPage, false, true, false); } + public static DocumentSource fromDocx(File docxFile, int startPage, int endPage) { + return fromDocx(docxFile, startPage, endPage, false, true, false); + } + public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage, boolean withImages, boolean withAnnotations, boolean withOutline) { if (!pdfFile.exists() || pdfFile.isDirectory()) { @@ -71,6 +85,35 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage, return source; } + public static DocumentSource fromDocx(File docxFile, int startPage, int endPage, + boolean withImages, boolean withAnnotations, boolean withOutline) { + if (!docxFile.exists() || docxFile.isDirectory()) { + throw new GrobidException("Input doc/docx 
file " + docxFile + " does not exist or a directory", + GrobidExceptionStatus.BAD_INPUT_DATA); + } + + DocumentSource source = new DocumentSource(); + source.cleanupXml = true; + source.cleanupPdf = true; + + // preliminary convert doc/docx file into PDF + File pdfFile = source.docxToPdf(docxFile, GrobidProperties.getTempPath()); + // create an ALTO representation + if (pdfFile != null) { + try { + source.xmlFile = source.pdf2xml(null, false, startPage, endPage, pdfFile, + GrobidProperties.getTempPath(), withImages, withAnnotations, withOutline); + } catch (Exception e) { + source.close(withImages, withAnnotations, withOutline); + throw e; + } finally { + source.cleanPdfFile(pdfFile); + } + } + source.docxFile = docxFile; + return source; + } + private String getPdfToXmlCommand(boolean withImage, boolean withAnnotations, boolean withOutline) { StringBuilder pdfToXml = new StringBuilder(); pdfToXml.append(GrobidProperties.getPdfToXMLPath().getAbsolutePath()); @@ -351,11 +394,85 @@ private boolean cleanXmlFile(File pathToXml, boolean cleanImages, boolean cleanA return success; } + private boolean cleanPdfFile(File pathToPdf) { + boolean success = false; + try { + if (pathToPdf != null) { + if (pathToPdf.exists()) { + success = pathToPdf.delete(); + if (!success) { + throw new GrobidResourceException("Deletion of a temporary PDF file failed for file '" + pathToPdf.getAbsolutePath() + "'"); + } + } + } + } catch (Exception e) { + if (e instanceof GrobidResourceException) { + throw (GrobidResourceException) e; + } else { + throw new GrobidResourceException("An exception occurred while deleting an PDF file '" + pathToPdf + "'.", e); + } + } + + return success; + } + + /** + * Convert doc/docx file to pdf format using Apache POI (via opensagres converter). + * The current thread is used for the execution. 
+ * + * @param docxFile docx/doc file + * @param tmpPath temp path to save the converted file + * @return the converted file or null if conversion was impossible/failed + */ + private File docxToPdf(File docxFile, File tmpPath) { + // target PDF file + if (docxFile == null || !docxFile.exists()) { + LOGGER.error("Invalid doc/docx file for PDF conversion"); + return null; + } + + File pdfFile = new File(tmpPath, KeyGen.getKey() + ".pdf"); + try ( + InputStream is = new FileInputStream(docxFile); + OutputStream out = new FileOutputStream(pdfFile); + ) { + long start = System.currentTimeMillis(); + // load the docx file into XWPFDocument + XWPFDocument document = new XWPFDocument(is); + // PDF options + PdfOptions options = PdfOptions.create(); + + // note: the default font encoding will be unicode, but it does not always work given the docx fonts, + // it is possible to set explicitly a font encoding like this: + // options = PdfOptions.create().fontEncoding("windows-1250"); + + // ensure PDF/A conformance level, for safer PDF processing by pdfalto + /*options.setConfiguration( new IPdfWriterConfiguration() { + public void configure( PdfWriter writer ) { + writer.setPDFXConformance( PdfWriter.PDFA1A ); + } + });*/ + + // converting XWPFDocument to PDF + PdfConverter.getInstance().convert(document, out, options); + LOGGER.info("docx file converted to PDF in : " + (System.currentTimeMillis() - start) + " milli seconds"); + + // TBD: for using the more recent version 2.0.2 of fr.opensagres.poi.xwpf.converter.core, see + // https://stackoverflow.com/questions/51330192/trying-to-make-simple-pdf-document-with-apache-poi + } catch (Throwable e) { + LOGGER.error("converting doc/docx into PDF failed", e); + pdfFile = null; + } + return pdfFile; + } public void close(boolean cleanImages, boolean cleanAnnotations, boolean cleanOutline) { try { if (cleanupXml) { cleanXmlFile(xmlFile, cleanImages, cleanAnnotations, cleanOutline); + } + if (cleanupPdf) { + cleanPdfFile(pdfFile); 
} } catch (Exception e) { LOGGER.error("Cannot cleanup resources (just printing exception):", e); @@ -369,7 +486,7 @@ public static void close(DocumentSource source, boolean cleanImages, boolean cle } public File getPdfFile() { - return pdfFile; + return this.pdfFile; } public void setPdfFile(File pdfFile) { @@ -377,13 +494,21 @@ public void setPdfFile(File pdfFile) { } public File getXmlFile() { - return xmlFile; + return this.xmlFile; } - public void setXmlFile(File xmlFile) { + public void setXmlFile(File xmlFile) { this.xmlFile = xmlFile; } + public File getDocxFile() { + return this.docxFile; + } + + public void setDocxFile(File docxFile) { + this.docxFile = docxFile; + } + } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 9bfaa61f03..8d0e21aba4 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -5,6 +5,7 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; import java.nio.charset.StandardCharsets; @@ -105,11 +106,18 @@ public FullTextParser(EngineParsers parsers) { tmpPath = GrobidProperties.getTempPath(); } - public Document processing(File inputPdf, + public Document processing(File input, GrobidAnalysisConfig config) throws Exception { - DocumentSource documentSource = - DocumentSource.fromPdf(inputPdf, config.getStartPage(), config.getEndPage(), - config.getPdfAssetPath() != null, true, false); + DocumentSource documentSource = null; + String extension = FilenameUtils.getExtension(input.getName()); + if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) { + documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage(), 
+ config.getPdfAssetPath() != null, true, false); + } else { + documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage(), + config.getPdfAssetPath() != null, true, false); + } + return processing(documentSource, config); } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 09ac4554b5..51501a30da 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -1,8 +1,11 @@ package org.grobid.core.engines; import com.google.common.base.Splitter; + import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.io.FilenameUtils; + import org.grobid.core.GrobidModels; import org.grobid.core.data.BiblioItem; import org.grobid.core.data.Date; @@ -75,7 +78,13 @@ public HeaderParser(EngineParsers parsers) { public Pair processing(File input, BiblioItem resHeader, GrobidAnalysisConfig config) { DocumentSource documentSource = null; try { - documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage()); + String extension = FilenameUtils.getExtension(input.getName()); + if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) { + documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage()); + } else { + documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage()); + } + Document doc = parsers.getSegmentationParser().processing(documentSource, config); String tei = processingHeaderSection(config.getConsolidateHeader(), doc, resHeader); @@ -91,10 +100,17 @@ public Pair processing(File input, BiblioItem resHeader, Grobi * Processing without application of the segmentation model, regex are used to identify the header * zone. 
*/ - public Pair processing2(String pdfInput, BiblioItem resHeader, GrobidAnalysisConfig config) { + public Pair processing2(String input, BiblioItem resHeader, GrobidAnalysisConfig config) { DocumentSource documentSource = null; try { - documentSource = DocumentSource.fromPdf(new File(pdfInput), config.getStartPage(), config.getEndPage()); + String extension = FilenameUtils.getExtension(input); + if ( extension != null && (extension.toLowerCase().equals("docx") || extension.toLowerCase().equals("doc")) ) { + documentSource = DocumentSource.fromDocx(new File(input), config.getStartPage(), config.getEndPage()); + } + else { + documentSource = DocumentSource.fromPdf(new File(input), config.getStartPage(), config.getEndPage()); + } + Document doc = new Document(documentSource); doc.addTokenizedDocument(config); diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java index 4e8c7fe195..518b983904 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java @@ -57,15 +57,19 @@ public static String readFile(String pPathToFile) throws IOException { } /** - * Write an input stream in temp directory. 
+ * Write an input stream in temp directory, default is PDF file */ public static File writeInputFile(InputStream inputStream) { + return writeInputFile(inputStream, "pdf"); + } + + public static File writeInputFile(InputStream inputStream, String extension) { LOGGER.debug(">> set origin document for stateless service'..."); File originFile = null; OutputStream out = null; try { - originFile = newTempFile("origin", ".pdf"); + originFile = newTempFile("origin", extension); out = new FileOutputStream(originFile); @@ -99,6 +103,8 @@ public static File writeInputFile(InputStream inputStream) { */ public static File newTempFile(String fileName, String extension) { try { + if (!extension.startsWith(".")) + extension = "." + extension; return File.createTempFile(fileName, extension, GrobidProperties.getTempPath()); } catch (IOException e) { throw new GrobidResourceException( diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 6694c6972a..7132a83518 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import org.glassfish.jersey.media.multipart.FormDataBodyPart; import org.glassfish.jersey.media.multipart.FormDataParam; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; import org.grobid.core.factory.AbstractEngineFactory; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.engines.Engine; @@ -144,19 +145,41 @@ public Response getAdmin_htmlGet(@QueryParam(SHA1) String sha1) { @Produces(MediaType.APPLICATION_XML) @POST public Response processHeaderDocument_post(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidate) { int consol = 
validateConsolidationParam(consolidate); - return restProcessFiles.processStatelessHeaderDocument(inputStream, consol); + String fileName = (fileDetail != null) ? fileDetail.getFileName() : null; + + return restProcessFiles.processStatelessHeaderDocument(inputStream, fileName, consol); } + /*@Path(PATH_HEADER) + @Consumes(MediaType.MULTIPART_FORM_DATA) + @Produces(MediaType.APPLICATION_XML) + @POST + public Response processHeaderDocument_post(@FormDataParam(INPUT) FormDataBodyPart body, + @FormDataParam("consolidateHeader") String consolidate) { + System.out.println(body.getMediaType()); + int consol = validateConsolidationParam(consolidate); + + InputStream inputStream = body.getEntityAs(InputStream.class); + + FormDataContentDisposition fileDetail = body.getFormDataContentDisposition(); + System.out.println(fileDetail.toString()); + System.out.println(fileDetail.getFileName()); + + return restProcessFiles.processStatelessHeaderDocument(inputStream, consol); + }*/ + @Path(PATH_HEADER) @Consumes(MediaType.MULTIPART_FORM_DATA) @Produces(MediaType.APPLICATION_XML) @PUT public Response processStatelessHeaderDocument(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidate) { - return processHeaderDocument_post(inputStream, consolidate); + return processHeaderDocument_post(inputStream, fileDetail, consolidate); } @Path(PATH_FULL_TEXT) @@ -164,6 +187,7 @@ public Response processStatelessHeaderDocument(@FormDataParam(INPUT) InputStream @Produces(MediaType.APPLICATION_XML) @POST public Response processFulltextDocument_post(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidateHeader, @FormDataParam("consolidateCitations") String consolidateCitations, @FormDataParam("includeRawCitations") String includeRawCitations, @@ -171,7 +195,8 @@ public Response 
processFulltextDocument_post(@FormDataParam(INPUT) InputStream i @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("teiCoordinates") List coordinates) throws Exception { - return processFulltext(inputStream, consolidateHeader, consolidateCitations, includeRawCitations, startPage, endPage, generateIDs, coordinates); + return processFulltext(inputStream, fileDetail, consolidateHeader, consolidateCitations, includeRawCitations, + startPage, endPage, generateIDs, coordinates); } @Path(PATH_FULL_TEXT) @@ -179,6 +204,7 @@ public Response processFulltextDocument_post(@FormDataParam(INPUT) InputStream i @Produces(MediaType.APPLICATION_XML) @PUT public Response processFulltextDocument(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidateHeader, @FormDataParam("consolidateCitations") String consolidateCitations, @FormDataParam("includeRawCitations") String includeRawCitations, @@ -186,13 +212,15 @@ public Response processFulltextDocument(@FormDataParam(INPUT) InputStream inputS @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("teiCoordinates") List coordinates) throws Exception { - return processFulltext(inputStream, consolidateHeader, consolidateCitations, includeRawCitations, startPage, endPage, generateIDs, coordinates); + return processFulltext(inputStream, fileDetail, consolidateHeader, consolidateCitations, includeRawCitations, + startPage, endPage, generateIDs, coordinates); } private Response processFulltext(InputStream inputStream, - @FormDataParam("consolidateHeader") String consolidateHeader, - @FormDataParam("consolidateCitations") String consolidateCitations, - @FormDataParam("includeRawCitations") String includeRawCitations, + FormDataContentDisposition fileDetail, + String consolidateHeader, + String 
consolidateCitations, + String includeRawCitations, int startPage, int endPage, String generateIDs, @@ -203,9 +231,12 @@ private Response processFulltext(InputStream inputStream, boolean includeRaw = validateIncludeRawParam(includeRawCitations); boolean generate = validateGenerateIdParam(generateIDs); + String fileName = (fileDetail != null) ? fileDetail.getFileName() : null; + List teiCoordinates = collectCoordinates(coordinates); - return restProcessFiles.processFulltextDocument(inputStream, consolHeader, consolCitations, includeRaw, startPage, endPage, generate, teiCoordinates); + return restProcessFiles.processFulltextDocument(inputStream, fileName, consolHeader, consolCitations, includeRaw, + startPage, endPage, generate, teiCoordinates); } private List collectCoordinates(List coordinates) { @@ -252,13 +283,16 @@ private int validateConsolidationParam(String consolidate) { @Produces("application/zip") @POST public Response processFulltextAssetDocument_post(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidateHeader, @FormDataParam("consolidateCitations") String consolidateCitations, @FormDataParam("includeRawCitations") String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, - @FormDataParam("generateIDs") String generateIDs) throws Exception { - return processStatelessFulltextAssetHelper(inputStream, consolidateHeader, consolidateCitations, includeRawCitations, startPage, endPage, generateIDs); + @FormDataParam("generateIDs") String generateIDs, + @FormDataParam("teiCoordinates") List coordinates) throws Exception { + return processStatelessFulltextAssetHelper(inputStream, fileDetail, consolidateHeader, consolidateCitations, includeRawCitations, + startPage, endPage, generateIDs, coordinates); } @Path(PATH_FULL_TEXT_ASSET) @@ -266,29 +300,39 @@ public Response 
processFulltextAssetDocument_post(@FormDataParam(INPUT) InputStr @Produces("application/zip") @PUT public Response processStatelessFulltextAssetDocument(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateHeader") String consolidateHeader, @FormDataParam("consolidateCitations") String consolidateCitations, @FormDataParam("includeRawCitations") String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, - @FormDataParam("generateIDs") String generateIDs) throws Exception { - return processStatelessFulltextAssetHelper(inputStream, consolidateHeader, consolidateCitations, includeRawCitations, startPage, endPage, generateIDs); + @FormDataParam("generateIDs") String generateIDs, + @FormDataParam("teiCoordinates") List coordinates) throws Exception { + return processStatelessFulltextAssetHelper(inputStream, fileDetail, consolidateHeader, consolidateCitations, includeRawCitations, + startPage, endPage, generateIDs, coordinates); } private Response processStatelessFulltextAssetHelper(InputStream inputStream, - String consolidateHeader, - String consolidateCitations, - String includeRawCitations, - int startPage, - int endPage, - String generateIDs) throws Exception { + FormDataContentDisposition fileDetail, + String consolidateHeader, + String consolidateCitations, + String includeRawCitations, + int startPage, + int endPage, + String generateIDs, + List coordinates) throws Exception { int consolHeader = validateConsolidationParam(consolidateHeader); int consolCitations = validateConsolidationParam(consolidateCitations); boolean includeRaw = validateIncludeRawParam(includeRawCitations); boolean generate = validateGenerateIdParam(generateIDs); - return restProcessFiles.processStatelessFulltextAssetDocument(inputStream, consolHeader, consolCitations, includeRaw, startPage, endPage, generate); + String fileName 
= (fileDetail != null) ? fileDetail.getFileName() : null; + + List teiCoordinates = collectCoordinates(coordinates); + + return restProcessFiles.processStatelessFulltextAssetDocument(inputStream, fileName, consolHeader, consolCitations, includeRaw, + startPage, endPage, generate, teiCoordinates); } /*@Path(PATH_CITATION_PATENT_TEI) @@ -519,11 +563,13 @@ public Response changePropertyValueGet(@QueryParam(XML) String xml) { @Produces(MediaType.APPLICATION_XML) @POST public Response processReferencesDocument_post(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateCitations") String consolidate, @FormDataParam("includeRawCitations") String includeRawCitations) { int consol = validateConsolidationParam(consolidate); boolean includeRaw = validateIncludeRawParam(includeRawCitations); - return restProcessFiles.processStatelessReferencesDocument(inputStream, consol, includeRaw); + String fileName = (fileDetail != null) ? fileDetail.getFileName() : null; + return restProcessFiles.processStatelessReferencesDocument(inputStream, fileName, consol, includeRaw); } @Path(PATH_REFERENCES) @@ -531,11 +577,13 @@ public Response processReferencesDocument_post(@FormDataParam(INPUT) InputStream @Produces(MediaType.APPLICATION_XML) @PUT public Response processStatelessReferencesDocument(@FormDataParam(INPUT) InputStream inputStream, + @FormDataParam(INPUT) FormDataContentDisposition fileDetail, @FormDataParam("consolidateCitations") String consolidate, @FormDataParam("includeRawCitations") String includeRawCitations) { int consol = validateConsolidationParam(consolidate); boolean includeRaw = validateIncludeRawParam(includeRawCitations); - return restProcessFiles.processStatelessReferencesDocument(inputStream, consol, includeRaw); + String fileName = (fileDetail != null) ? fileDetail.getFileName() : null; + return restProcessFiles.processStatelessReferencesDocument(inputStream, fileName, consol, includeRaw); } @Path(PATH_PDF_ANNOTATION) diff --git 
a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index 8a3ef759ee..b1c157b77f 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -63,10 +63,11 @@ public GrobidRestProcessFiles() { * extracts only the header data. * * @param inputStream the data of origin document + * @param fileName filename of the origin document * @param consolidate consolidation parameter for the header extraction * @return a response object which contains a TEI representation of the header part */ - public Response processStatelessHeaderDocument(final InputStream inputStream, final int consolidate) { + public Response processStatelessHeaderDocument(final InputStream inputStream, final String fileName, final int consolidate) { LOGGER.debug(methodLogIn()); String retVal = null; Response response = null; @@ -80,12 +81,22 @@ public Response processStatelessHeaderDocument(final InputStream inputStream, fi "No GROBID engine available", Status.SERVICE_UNAVAILABLE); } - originFile = IOUtilities.writeInputFile(inputStream); + String extension = "pdf"; + if (fileName != null) { + String fileNameLow = fileName.toLowerCase(); + if (fileNameLow.endsWith("docx")) + extension = "docx"; + else if (fileNameLow.endsWith("doc")) + extension = "doc"; + } + + // the tmp file will have an unambiguous file extension (e.g. .pdf. .docx, .doc, ...) + originFile = IOUtilities.writeInputFile(inputStream, extension); if (originFile == null) { LOGGER.error("The input file cannot be written."); throw new GrobidServiceException( "The input file cannot be written. 
", Status.INTERNAL_SERVER_ERROR); - } + } // starts conversion process retVal = engine.processHeader(originFile.getAbsolutePath(), consolidate, null); @@ -123,20 +134,25 @@ public Response processStatelessHeaderDocument(final InputStream inputStream, fi * Uploads the origin document which shall be extracted into TEI. * * @param inputStream the data of origin document + * @param fileName filename of the origin document * @param consolidateHeader the consolidation option allows GROBID to exploit Crossref * for improving header information * @param consolidateCitations the consolidation option allows GROBID to exploit Crossref * for improving citations information + * @param includeRawCitations if true, add the orginal full raw citation to every bibliographical + * reference structures * @param startPage give the starting page to consider in case of segmentation of the * PDF, -1 for the first page (default) * @param endPage give the end page to consider in case of segmentation of the * PDF, -1 for the last page (default) * @param generateIDs if true, generate random attribute id on the textual elements of * the resulting TEI + * @param teiCoordinates list of TEI XML elements to be enriched with the original PDF coordinates * @return a response object mainly contain the TEI representation of the * full text */ public Response processFulltextDocument(final InputStream inputStream, + final String fileName, final int consolidateHeader, final int consolidateCitations, final boolean includeRawCitations, @@ -158,7 +174,17 @@ public Response processFulltextDocument(final InputStream inputStream, "No GROBID engine available", Status.SERVICE_UNAVAILABLE); } - originFile = IOUtilities.writeInputFile(inputStream); + String extension = "pdf"; + if (fileName != null) { + String fileNameLow = fileName.toLowerCase(); + if (fileNameLow.endsWith("docx")) + extension = "docx"; + else if (fileNameLow.endsWith("doc")) + extension = "doc"; + } + + // the tmp file will have an unambiguous 
file extension (e.g. .pdf. .docx, .doc, ...) + originFile = IOUtilities.writeInputFile(inputStream, extension); if (originFile == null) { LOGGER.error("The input file cannot be written."); throw new GrobidServiceException( @@ -227,12 +253,14 @@ public Response processFulltextDocument(final InputStream inputStream, * full text */ public Response processStatelessFulltextAssetDocument(final InputStream inputStream, + final String fileName, final int consolidateHeader, final int consolidateCitations, final boolean includeRawCitations, final int startPage, final int endPage, - final boolean generateIDs) throws Exception { + final boolean generateIDs, + final List teiCoordinates) throws Exception { LOGGER.debug(methodLogIn()); Response response = null; String retVal = null; @@ -247,7 +275,17 @@ public Response processStatelessFulltextAssetDocument(final InputStream inputStr "No GROBID engine available", Status.SERVICE_UNAVAILABLE); } - originFile = IOUtilities.writeInputFile(inputStream); + String extension = "pdf"; + if (fileName != null) { + String fileNameLow = fileName.toLowerCase(); + if (fileNameLow.endsWith("docx")) + extension = "docx"; + else if (fileNameLow.endsWith("doc")) + extension = "doc"; + } + + // the tmp file will have an unambiguous file extension (e.g. .pdf. .docx, .doc, ...) 
+ originFile = IOUtilities.writeInputFile(inputStream, extension); if (originFile == null) { LOGGER.error("The input file cannot be written."); throw new GrobidServiceException( @@ -267,6 +305,7 @@ public Response processStatelessFulltextAssetDocument(final InputStream inputStr .endPage(endPage) .generateTeiIds(generateIDs) .pdfAssetPath(new File(assetPath)) + .generateTeiCoordinates(teiCoordinates) .build(); retVal = engine.fullTextToTEI(originFile, config); @@ -480,6 +519,7 @@ public Response processCitationPatentST36(final InputStream inputStream, * full text */ public Response processStatelessReferencesDocument(final InputStream inputStream, + final String fileName, final int consolidate, final boolean includeRawCitations) { LOGGER.debug(methodLogIn()); @@ -495,7 +535,16 @@ public Response processStatelessReferencesDocument(final InputStream inputStream "No GROBID engine available", Status.SERVICE_UNAVAILABLE); } - originFile = IOUtilities.writeInputFile(inputStream); + String extension = "pdf"; + if (fileName != null) { + String fileNameLow = fileName.toLowerCase(); + if (fileNameLow.endsWith("docx")) + extension = "docx"; + else if (fileNameLow.endsWith("doc")) + extension = "doc"; + } + + originFile = IOUtilities.writeInputFile(inputStream, extension); if (originFile == null) { LOGGER.error("The input file cannot be written."); throw new GrobidServiceException( diff --git a/grobid-service/src/main/resources/web/grobid/grobid.js b/grobid-service/src/main/resources/web/grobid/grobid.js index 1d61ca6dc2..1c2c9e7771 100644 --- a/grobid-service/src/main/resources/web/grobid/grobid.js +++ b/grobid-service/src/main/resources/web/grobid/grobid.js @@ -20,7 +20,7 @@ var grobid = (function($) { } else { baseUrl = $(location).attr('href') + "api/" + ext; } - console.log("BaseURL: " + baseUrl); + //console.log("BaseURL: " + baseUrl); return baseUrl; } @@ -74,13 +74,14 @@ var grobid = (function($) { return true; }); - $('#gbdForm').ajaxForm({ + /*$('#gbdForm').ajaxForm({ 
beforeSubmit: ShowRequest1, success: SubmitSuccesful, error: AjaxError1, dataType: "text" - }); + });*/ + $('#submitRequest1').bind('click', submitQuery1); $('#submitRequest2').bind('click', submitQuery2); $('#submitRequest3').bind('click', submitQuery3); @@ -282,6 +283,56 @@ var grobid = (function($) { $("#btn_download").show(); } + function submitQuery1() { + var selected = $('#selectedService option:selected').attr('value'); + var url = $('#gbdForm').attr('action'); + var form = document.getElementById('gbdForm'); + var formData = new FormData(form); + var xhr = new XMLHttpRequest(); + xhr.responseType = 'text'; + var payload = null; + if (selected === 'processHeaderDocument' || selected === 'processFulltextDocument' || selected === 'processReferences') { + + // use file input in the form data + //var form = document.getElementById('gbdForm'); + formData.delete('inputText'); + xhr.open('POST', url, true); + payload = formData; // multipart/form-data + } else { + // use text input in the form data + formData.delete('input'); + + var urlEncodedData = ""; + var urlEncodedDataPairs = []; + var name; + + for (var pair of formData.entries()) { + urlEncodedDataPairs.push(encodeURIComponent(pair[0]) + '=' + encodeURIComponent(pair[1])); + } + var value = formData.get('inputText'); + //var value = $('#textInputArea').val() + if (value) + urlEncodedDataPairs.push(encodeURIComponent('input') + '=' + encodeURIComponent(value)); + urlEncodedData = urlEncodedDataPairs.join('&').replace(/%20/g, '+'); + xhr.open('POST', url, true); + xhr.setRequestHeader('Content-type', 'application/x-www-form-urlencoded'); + payload = urlEncodedData; // application/x-www-form-urlencoded + } + + ShowRequest1(); + xhr.onreadystatechange = function (e) { + if (xhr.readyState == 4) { + if (xhr.status == 200) { + var response = e.target.response; + SubmitSuccesful(xhr.responseText, xhr.status, xhr); + } else { + AjaxError1("Response " + xhr.status + ": " ); + } + } + } + xhr.send(payload); + } + 
function submitQuery2() { var selected = $('#selectedService2 option:selected').attr('value'); if (selected == 'annotatePDF') { diff --git a/grobid-service/src/main/resources/web/index.html b/grobid-service/src/main/resources/web/index.html index 6801485c68..c91d22108c 100644 --- a/grobid-service/src/main/resources/web/index.html +++ b/grobid-service/src/main/resources/web/index.html @@ -135,7 +135,7 @@

@@ -145,7 +145,7 @@

-
- +  
  +