diff --git a/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh b/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh
index 516ebac..29d1a69 100755
--- a/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh
+++ b/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh
@@ -15,3 +15,5 @@ docker stack deploy --compose-file ./temp/${DOCKER_COMPOSE_FILE} ${TEXT_EXTRACTI
rm -rf ./temp
echo "Deploy routines have been completed"
+echo "Running OpenOffice server.."
+soffice -headless -accept="socket,host=localhost,port=8100;urp;" -nofirststartwizard
diff --git a/text_extraction_system/prepare_dev_env_ubuntu.sh b/text_extraction_system/prepare_dev_env_ubuntu.sh
index 76a88d8..8c34299 100755
--- a/text_extraction_system/prepare_dev_env_ubuntu.sh
+++ b/text_extraction_system/prepare_dev_env_ubuntu.sh
@@ -5,15 +5,31 @@ OS_VERSION=$(awk -F= '/^VERSION_ID/{print $2}' /etc/os-release)
PYTHON3_VERSION="$(python3 -V 2>&1)"
# Install python, office, maven and tesseract
-sudo apt-get install virtualenv python3-dev libreoffice maven tesseract-ocr tesseract-ocr-eng \
- tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu \
- tesseract-ocr-rus
+sudo apt-get install virtualenv python3-dev maven tesseract-ocr tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra \
+ tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus
+
+# Remove LibreOffice
+sudo apt remove libreoffice-base-core libreoffice-impress libreoffice-calc libreoffice-math libreoffice-common \
+ libreoffice-ogltrans libreoffice-core libreoffice-pdfimport libreoffice-draw libreoffice-style-breeze \
+ libreoffice-gnome libreoffice-style-colibre libreoffice-gtk3 libreoffice-style-elementary \
+ libreoffice-help-common libreoffice-style-tango libreoffice-help-en-us libreoffice-writer
+sudo apt autoremove
+
+# Install OpenOffice
+wget https://sourceforge.net/projects/openofficeorg.mirror/files/4.1.12/binaries/en-US/Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+tar -zxvf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+sudo rm -rf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+cd en-US/DEBS/
+sudo dpkg -i *.deb
+cd desktop-integration
+sudo dpkg -i *.deb
+cd ../../..
+sudo rm -rf en-US
# Prepare python virtual env
virtualenv -p /usr/bin/python3 .venv
source .venv/bin/activate
-pip install -U wheel
-pip install -U setuptools
+pip install -U pip pipenv wheel setuptools
pip install -U -r requirements.txt
# Install additional python packages
@@ -21,9 +37,7 @@ pip install -U --no-deps -e ../../lexpredict-contraxsuite-core/
pip install -U -e ../text_extraction_system_api
# NLTK should be installed within lexpredict-contraxsuite-core. The following downloads its models
-python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker \
- wordnet omw-1.4
-
+python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker wordnet omw-1.4
deactivate
# Downloading model for language detection
diff --git a/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py b/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py
index a1e83a0..406a60d 100644
--- a/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py
+++ b/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py
@@ -1,30 +1,23 @@
import logging
import os
import shutil
+import subprocess
import tempfile
from contextlib import contextmanager
-from subprocess import CompletedProcess
+from subprocess import CompletedProcess, PIPE
from typing import Generator
from PIL import Image
from text_extraction_system.config import get_settings
-from text_extraction_system.locking.socket_lock import get_lock
-from text_extraction_system.pdf.errors import InputFileDoesNotExist, \
- OutputPDFDoesNotExistAfterConversion
-from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, \
- run_process, prepare_large_data_file
+from text_extraction_system.pdf.errors import InputFileDoesNotExist, OutputPDFDoesNotExistAfterConversion
+from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, run_process
from text_extraction_system.processes import raise_from_process, render_process_msg
log = logging.getLogger(__name__)
-SOFFICE_CALL_BASE_ARGUMENTS = ['--headless', '--invisible', '--nodefault', '--view',
- '--nolockcheck', '--nologo', '--norestore', '--nofirststartwizard', ]
-
-def convert_image_to_pdf(src_fn: str,
- out_fn: str,
- timeout_sec: int = 1800) -> CompletedProcess:
+def convert_image_to_pdf(src_fn: str, out_fn: str, timeout_sec: int = 1800) -> CompletedProcess:
"""
Converts image to pdf file using custom Java solution
"""
@@ -33,33 +26,8 @@ def convert_image_to_pdf(src_fn: str,
return run_process(args, timeout_sec)
-def soffice_convert_to_pdf(src_fn: str,
- directory: str,
- soffice_single_process_locking: bool = True,
- timeout_sec: int = 1800) -> CompletedProcess:
- """
- Converts image to pdf file using custom Java solution
- """
- with prepare_large_data_file(src_fn, directory) as prepared_fn:
- args = ['soffice', *SOFFICE_CALL_BASE_ARGUMENTS,
- '--convert-to', 'pdf',
- prepared_fn,
- '--outdir', directory]
-
- # Soffice does not allow running multiple copies of the process in environment.
- # The following is a workaround mostly for in-container usage.
- if soffice_single_process_locking:
- with get_lock('soffice_single_process',
- wait_required_listener=lambda: log.info(
- 'Waiting for another conversion task to finish first...')):
- return run_process(args, timeout_sec)
- return run_process(args, timeout_sec)
-
-
@contextmanager
-def convert_to_pdf(src_fn: str,
- soffice_single_process_locking: bool = True,
- timeout_sec: int = 1800) -> Generator[str, None, None]:
+def convert_to_pdf(src_fn: str, timeout_sec: int = 1800) -> Generator[str, None, None]:
"""
Converts the specified file to pdf file.
Soffice converter allows specifying the output directory. The output file name is generated
@@ -69,7 +37,6 @@ def convert_to_pdf(src_fn: str,
"""
if not os.path.isfile(src_fn) and not os.path.isfile(src_fn):
raise InputFileDoesNotExist(src_fn)
-
temp_dir = tempfile.mkdtemp()
source_fn = src_fn
src_fn, src_fn_base, src_ext = separate_filename_basename_and_extension(src_fn, temp_dir)
@@ -78,7 +45,6 @@ def convert_to_pdf(src_fn: str,
# Bypass pdf file
if src_ext == 'pdf':
return src_fn
-
try:
if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}:
if src_ext.lower() == '.png':
@@ -87,14 +53,16 @@ def convert_to_pdf(src_fn: str,
rgb_im.save(source_fn)
completed_process = convert_image_to_pdf(src_fn, out_fn, timeout_sec)
else:
- completed_process = soffice_convert_to_pdf(src_fn, temp_dir,
- soffice_single_process_locking, timeout_sec)
+ java_modules_path = get_settings().java_modules_path
+ args = ['java', '-cp', f'{java_modules_path}/*', 'com.lexpredict.textextraction.ConvertToPDF',
+ '--original-doc', src_fn,
+ '--dst-pdf', out_fn]
+ completed_process: CompletedProcess = subprocess.run(args, check=False, timeout=timeout_sec,
+ universal_newlines=True, stderr=PIPE, stdout=PIPE)
raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.')
-
if not os.path.isfile(out_fn):
- raise OutputPDFDoesNotExistAfterConversion(
- f'Unable to convert {src_fn} to pdf. Output file does not exist after conversion.\n'
- + render_process_msg(completed_process))
+ raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. Output file does not exist '
+ f'after conversion.\n' + render_process_msg(completed_process))
yield out_fn
finally:
if os.path.isdir(temp_dir):
diff --git a/text_extraction_system/text_extraction_system_java/pom.xml b/text_extraction_system/text_extraction_system_java/pom.xml
index 652222e..af319f5 100644
--- a/text_extraction_system/text_extraction_system_java/pom.xml
+++ b/text_extraction_system/text_extraction_system_java/pom.xml
@@ -3,7 +3,6 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
-
org.example
text_extraction_system_java
1.3
@@ -38,7 +37,6 @@
-
org.apache.pdfbox
@@ -176,7 +174,20 @@
commons-cli
1.4
+
+ org.slf4j
+ slf4j-api
+ 1.6.1
+
+
+ org.slf4j
+ slf4j-simple
+ 1.6.1
+
+
+ com.artofsolving
+ jodconverter
+ 2.2.1
+
-
-
\ No newline at end of file
diff --git a/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java b/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java
new file mode 100644
index 0000000..90e95f1
--- /dev/null
+++ b/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java
@@ -0,0 +1,61 @@
+package com.lexpredict.textextraction;
+
+import org.apache.commons.cli.*;
+
+import com.artofsolving.jodconverter.DocumentConverter;
+import com.artofsolving.jodconverter.DocumentFamily;
+import com.artofsolving.jodconverter.DocumentFormat;
+import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
+import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
+import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.ConnectException;
+
+
+public class ConvertToPDF {
+ public static void main(String[] args) throws IOException, ConnectException {
+ CommandLine cmd = parseCliArgs(args);
+ String src = cmd.getOptionValue("original-doc");
+ String dstPdf = cmd.getOptionValue("dst-pdf");
+
+ File inputFile = new File(src);
+ File outputFile = new File(dstPdf);
+
+ // connect to an OpenOffice.org instance running on port 8100
+ OpenOfficeConnection connection = new SocketOpenOfficeConnection(8100);
+ connection.connect();
+
+ // convert
+ DocumentConverter converter = new OpenOfficeDocumentConverter(connection);
+ final DocumentFormat docx = new DocumentFormat("Microsoft Word 2007 XML", DocumentFamily.TEXT, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx");
+ converter.convert(inputFile, docx, outputFile, null);
+
+ // close the connection
+ connection.disconnect();
+ }
+
+ protected static CommandLine parseCliArgs(String[] args) {
+ Options options = new Options();
+
+ Option originalPDF = new Option("orig", "original-doc", true, "Original document file to convert to PDF.");
+ originalPDF.setRequired(true);
+ options.addOption(originalPDF);
+
+ Option dstPDF = new Option("dst", "dst-pdf", true, "File name to save the resulting PDF.");
+ dstPDF.setRequired(true);
+ options.addOption(dstPDF);
+
+ CommandLineParser parser = new DefaultParser();
+ HelpFormatter formatter = new HelpFormatter();
+ try {
+ return parser.parse(options, args);
+ } catch (ParseException e) {
+ System.out.println(e.getMessage());
+ formatter.printHelp(ConvertToPDF.class.getName(), options);
+ System.exit(1);
+ }
+ return null;
+ }
+}
\ No newline at end of file