LexPredict · DevIhor · Dec 23, 2022
diff --git a/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh b/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh
@@ -15,3 +15,5 @@ docker stack deploy --compose-file ./temp/${DOCKER_COMPOSE_FILE} ${TEXT_EXTRACTI
 rm -rf ./temp
 echo "Deploy routines have been completed"
 
+echo "Running OpenOffice server.."
+soffice -headless -accept="socket,host=localhost,port=8100;urp;" -nofirststartwizard
diff --git a/text_extraction_system/prepare_dev_env_ubuntu.sh b/text_extraction_system/prepare_dev_env_ubuntu.sh
@@ -5,25 +5,39 @@ OS_VERSION=$(awk -F= '/^VERSION_ID/{print $2}' /etc/os-release)
 PYTHON3_VERSION="$(python3 -V 2>&1)"
 
 # Install python, office, maven and tesseract
-sudo apt-get install virtualenv python3-dev libreoffice maven tesseract-ocr tesseract-ocr-eng \
-                     tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu \
-                     tesseract-ocr-rus
+sudo apt-get install virtualenv python3-dev maven tesseract-ocr tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra \
+                     tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus
+
+# Remove LibreOffice
+sudo apt remove libreoffice-base-core libreoffice-impress libreoffice-calc libreoffice-math libreoffice-common \
+                libreoffice-ogltrans libreoffice-core libreoffice-pdfimport libreoffice-draw libreoffice-style-breeze \
+                libreoffice-gnome libreoffice-style-colibre libreoffice-gtk3 libreoffice-style-elementary \
+                libreoffice-help-common libreoffice-style-tango libreoffice-help-en-us libreoffice-writer
+sudo apt autoremove
+
+# Install OpenOffice
+wget https://sourceforge.net/projects/openofficeorg.mirror/files/4.1.12/binaries/en-US/Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+tar -zxvf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+sudo rm -rf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
+cd en-US/DEBS/
+sudo dpkg -i *.deb
+cd desktop-integration
+sudo dpkg -i *.deb
+cd ../../..
+sudo rm -rf en-US
 
 # Prepare python virtual env
 virtualenv -p /usr/bin/python3 .venv
 source .venv/bin/activate
-pip install -U wheel
-pip install -U setuptools
+pip install -U pip pipenv wheel setuptools
 pip install -U -r requirements.txt
 
 # Install additional python packages
 pip install -U --no-deps -e ../../lexpredict-contraxsuite-core/
 pip install -U -e ../text_extraction_system_api
 
 # NLTK should be installed within lexpredict-contraxsuite-core. The following downloads its models
-python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker \
-                           wordnet omw-1.4
-
+python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker wordnet omw-1.4
 deactivate
 
 # Downloading model for language detection

diff --git a/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py b/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py
@@ -1,30 +1,23 @@
 import logging
 import os
 import shutil
+import subprocess
 import tempfile
 from contextlib import contextmanager
-from subprocess import CompletedProcess
+from subprocess import CompletedProcess, PIPE
 from typing import Generator
 
 from PIL import Image
 
 from text_extraction_system.config import get_settings
-from text_extraction_system.locking.socket_lock import get_lock
-from text_extraction_system.pdf.errors import InputFileDoesNotExist, \
-    OutputPDFDoesNotExistAfterConversion
-from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, \
-    run_process, prepare_large_data_file
+from text_extraction_system.pdf.errors import InputFileDoesNotExist, OutputPDFDoesNotExistAfterConversion
+from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, run_process
 from text_extraction_system.processes import raise_from_process, render_process_msg
 
 log = logging.getLogger(__name__)
 
-SOFFICE_CALL_BASE_ARGUMENTS = ['--headless', '--invisible', '--nodefault', '--view',
-                               '--nolockcheck', '--nologo', '--norestore', '--nofirststartwizard', ]
 
-
-def convert_image_to_pdf(src_fn: str,
-                         out_fn: str,
-                         timeout_sec: int = 1800) -> CompletedProcess:
+def convert_image_to_pdf(src_fn: str, out_fn: str, timeout_sec: int = 1800) -> CompletedProcess:
     """
     Converts image to pdf file using custom Java solution
     """
@@ -33,33 +26,8 @@ def convert_image_to_pdf(src_fn: str,
     return run_process(args, timeout_sec)
 
 
-def soffice_convert_to_pdf(src_fn: str,
-                           directory: str,
-                           soffice_single_process_locking: bool = True,
-                           timeout_sec: int = 1800) -> CompletedProcess:
-    """
-    Converts image to pdf file using custom Java solution
-    """
-    with prepare_large_data_file(src_fn, directory) as prepared_fn:
-        args = ['soffice', *SOFFICE_CALL_BASE_ARGUMENTS,
-                '--convert-to', 'pdf',
-                prepared_fn,
-                '--outdir', directory]
-
-        # Soffice does not allow running multiple copies of the process in environment.
-        # The following is a workaround mostly for in-container usage.
-        if soffice_single_process_locking:
-            with get_lock('soffice_single_process',
-                          wait_required_listener=lambda: log.info(
-                              'Waiting for another conversion task to finish first...')):
-                return run_process(args, timeout_sec)
-        return run_process(args, timeout_sec)
-
-
 @contextmanager
-def convert_to_pdf(src_fn: str,
-                   soffice_single_process_locking: bool = True,
-                   timeout_sec: int = 1800) -> Generator[str, None, None]:
+def convert_to_pdf(src_fn: str, timeout_sec: int = 1800) -> Generator[str, None, None]:
     """
     Converts the specified file to pdf file.
     Soffice converter allows specifying the output directory. The output file name is generated
@@ -69,7 +37,6 @@ def convert_to_pdf(src_fn: str,
     """
     if not os.path.isfile(src_fn) and not os.path.isfile(src_fn):
         raise InputFileDoesNotExist(src_fn)
-
     temp_dir = tempfile.mkdtemp()
     source_fn = src_fn
     src_fn, src_fn_base, src_ext = separate_filename_basename_and_extension(src_fn, temp_dir)
@@ -78,7 +45,6 @@ def convert_to_pdf(src_fn: str,
     # Bypass pdf file
     if src_ext == 'pdf':
         return src_fn
-
     try:
         if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}:
             if src_ext.lower() == '.png':
@@ -87,14 +53,16 @@ def convert_to_pdf(src_fn: str,
                 rgb_im.save(source_fn)
             completed_process = convert_image_to_pdf(src_fn, out_fn, timeout_sec)
         else:
-            completed_process = soffice_convert_to_pdf(src_fn, temp_dir,
-                                                       soffice_single_process_locking, timeout_sec)
+            java_modules_path = get_settings().java_modules_path
+            args = ['java', '-cp', f'{java_modules_path}/*', 'com.lexpredict.textextraction.ConvertToPDF',
+                    '--original-doc', src_fn,
+                    '--dst-pdf', out_fn]
+            completed_process: CompletedProcess = subprocess.run(args, check=False, timeout=timeout_sec,
+                                                                 universal_newlines=True, stderr=PIPE, stdout=PIPE)
         raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.')
-
         if not os.path.isfile(out_fn):
-            raise OutputPDFDoesNotExistAfterConversion(
-                f'Unable to convert {src_fn} to pdf. Output file does not exist after conversion.\n'
-                + render_process_msg(completed_process))
+            raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. Output file does not exist '
+                                                       f'after conversion.\n' + render_process_msg(completed_process))
         yield out_fn
     finally:
         if os.path.isdir(temp_dir):

diff --git a/text_extraction_system/text_extraction_system_java/pom.xml b/text_extraction_system/text_extraction_system_java/pom.xml
@@ -3,7 +3,6 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
-
     <groupId>org.example</groupId>
     <artifactId>text_extraction_system_java</artifactId>
     <version>1.3</version>
@@ -38,7 +37,6 @@
             </plugin-->
         </plugins>
     </build>
-
     <dependencies>
         <dependency>
             <groupId>org.apache.pdfbox</groupId>
@@ -176,7 +174,20 @@
             <artifactId>commons-cli</artifactId>
             <version>1.4</version>
         </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.6.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-simple</artifactId>
+            <version>1.6.1</version>
+        </dependency>
+        <dependency>
+            <groupId>com.artofsolving</groupId>
+            <artifactId>jodconverter</artifactId>
+            <version>2.2.1</version>
+        </dependency>
     </dependencies>
-
-
 </project>
diff --git a/...text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java b/...text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java
@@ -0,0 +1,114 @@
+package com.lexpredict.textextraction;
+
+import org.apache.commons.cli.*;
+
+import com.artofsolving.jodconverter.DefaultDocumentFormatRegistry;
+import com.artofsolving.jodconverter.DocumentConverter;
+import com.artofsolving.jodconverter.DocumentFamily;
+import com.artofsolving.jodconverter.DocumentFormat;
+import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
+import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
+import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.ConnectException;
+import java.util.Locale;
+
+
+/**
+ * Command-line tool that converts an office document to PDF through an OpenOffice
+ * service reached over a local socket, using JODConverter.
+ *
+ * Usage: {@code ConvertToPDF --original-doc SRC --dst-pdf DST}
+ */
+public class ConvertToPDF {
+    /** Port of the OpenOffice instance this tool connects to. */
+    private static final int OPEN_OFFICE_PORT = 8100;
+
+    // OOXML formats are declared by hand, each with the document family it belongs to.
+    // NOTE(review): presumably the bundled JODConverter registry does not know them - confirm.
+    private static final DocumentFormat DOCX = new DocumentFormat("Microsoft Word 2007 XML", DocumentFamily.TEXT,
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx");
+    private static final DocumentFormat XLSX = new DocumentFormat("Microsoft Excel 2007 XML", DocumentFamily.SPREADSHEET,
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx");
+    private static final DocumentFormat PPTX = new DocumentFormat("Microsoft PowerPoint 2007 XML", DocumentFamily.PRESENTATION,
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx");
+
+    /**
+     * Converts the file given by {@code --original-doc} and writes the result to {@code --dst-pdf}.
+     *
+     * @param args command-line arguments, see {@link #parseCliArgs(String[])}
+     * @throws ConnectException if the OpenOffice service cannot be reached
+     * @throws IOException declared for callers; kept from the original signature
+     */
+    public static void main(String[] args) throws IOException, ConnectException {
+        CommandLine cmd = parseCliArgs(args);
+        File inputFile = new File(cmd.getOptionValue("original-doc"));
+        File outputFile = new File(cmd.getOptionValue("dst-pdf"));
+
+        OpenOfficeConnection connection = new SocketOpenOfficeConnection(OPEN_OFFICE_PORT);
+        connection.connect();
+        try {
+            DocumentConverter converter = new OpenOfficeDocumentConverter(connection);
+            // Output format stays null, as before, leaving its detection to the converter.
+            converter.convert(inputFile, resolveInputFormat(inputFile), outputFile, null);
+        } finally {
+            // Always release the socket, even when the conversion throws.
+            connection.disconnect();
+        }
+    }
+
+    /**
+     * Chooses the input format to declare for a file, based on its extension.
+     * Unrecognised extensions fall back to DOCX, which used to be declared for every input.
+     *
+     * @param inputFile document about to be converted
+     * @return format describing {@code inputFile}; never null
+     */
+    private static DocumentFormat resolveInputFormat(File inputFile) {
+        String name = inputFile.getName();
+        int dot = name.lastIndexOf('.');
+        String extension = dot < 0 ? "" : name.substring(dot + 1).toLowerCase(Locale.ROOT);
+        if ("docx".equals(extension)) {
+            return DOCX;
+        }
+        if ("xlsx".equals(extension)) {
+            return XLSX;
+        }
+        if ("pptx".equals(extension)) {
+            return PPTX;
+        }
+        DocumentFormat known = new DefaultDocumentFormatRegistry().getFormatByFileExtension(extension);
+        return known != null ? known : DOCX;
+    }
+
+    /**
+     * Parses the command line. On invalid arguments prints the error and usage, then exits with status 1.
+     *
+     * @param args raw command-line arguments
+     * @return parsed command line holding the required "original-doc" and "dst-pdf" options
+     */
+    protected static CommandLine parseCliArgs(String[] args) {
+        Options options = new Options();
+
+        Option originalDoc = new Option("orig", "original-doc", true, "Original document file to convert to PDF.");
+        originalDoc.setRequired(true);
+        options.addOption(originalDoc);
+
+        Option dstPDF = new Option("dst", "dst-pdf", true, "File name to save the resulting PDF.");
+        dstPDF.setRequired(true);
+        options.addOption(dstPDF);
+
+        CommandLineParser parser = new DefaultParser();
+        try {
+            return parser.parse(options, args);
+        } catch (ParseException e) {
+            System.out.println(e.getMessage());
+            new HelpFormatter().printHelp(ConvertToPDF.class.getName(), options);
+            System.exit(1);
+        }
+        // Not reached after System.exit; the compiler still requires a return.
+        return null;
+    }
+}