diff --git a/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh b/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh index 516ebac..29d1a69 100755 --- a/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh +++ b/text_extraction_system/docker/deploy/deploy-to-swarm-cluster.sh @@ -15,3 +15,5 @@ docker stack deploy --compose-file ./temp/${DOCKER_COMPOSE_FILE} ${TEXT_EXTRACTI rm -rf ./temp echo "Deploy routines have been completed" +echo "Running OpenOffice server.." +soffice -headless -accept="socket,host=localhost,port=8100;urp;" -nofirststartwizard diff --git a/text_extraction_system/prepare_dev_env_ubuntu.sh b/text_extraction_system/prepare_dev_env_ubuntu.sh index 76a88d8..8c34299 100755 --- a/text_extraction_system/prepare_dev_env_ubuntu.sh +++ b/text_extraction_system/prepare_dev_env_ubuntu.sh @@ -5,15 +5,31 @@ OS_VERSION=$(awk -F= '/^VERSION_ID/{print $2}' /etc/os-release) PYTHON3_VERSION="$(python3 -V 2>&1)" # Install python, office, maven and tesseract -sudo apt-get install virtualenv python3-dev libreoffice maven tesseract-ocr tesseract-ocr-eng \ - tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu \ - tesseract-ocr-rus +sudo apt-get install virtualenv python3-dev maven tesseract-ocr tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra \ + tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus + +# Remove LibreOffice +sudo apt remove libreoffice-base-core libreoffice-impress libreoffice-calc libreoffice-math libreoffice-common \ + libreoffice-ogltrans libreoffice-core libreoffice-pdfimport libreoffice-draw libreoffice-style-breeze \ + libreoffice-gnome libreoffice-style-colibre libreoffice-gtk3 libreoffice-style-elementary \ + libreoffice-help-common libreoffice-style-tango libreoffice-help-en-us libreoffice-writer +sudo apt autoremove + +# Install OpenOffice +wget https://sourceforge.net/projects/openofficeorg.mirror/files/4.1.12/binaries/en-US/Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz +tar -zxvf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz +sudo rm -rf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz +cd en-US/DEBS/ +sudo dpkg -i *.deb +cd desktop-integration +sudo dpkg -i *.deb +cd ../../.. +sudo rm -rf en-US # Prepare python virtual env virtualenv -p /usr/bin/python3 .venv source .venv/bin/activate -pip install -U wheel -pip install -U setuptools +pip install -U pip pipenv wheel setuptools pip install -U -r requirements.txt # Install additional python packages @@ -21,9 +37,7 @@ pip install -U --no-deps -e ../../lexpredict-contraxsuite-core/ pip install -U -e ../text_extraction_system_api # NLTK should be installed within lexpredict-contraxsuite-core. The following downloads its models -python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker \ - wordnet omw-1.4 - +python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker wordnet omw-1.4 deactivate # Downloading model for language detection diff --git a/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py b/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py index a1e83a0..406a60d 100644 --- a/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py +++ b/text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py @@ -1,30 +1,23 @@ import logging import os import shutil +import subprocess import tempfile from contextlib import contextmanager -from subprocess import CompletedProcess +from subprocess import CompletedProcess, PIPE from typing import Generator from PIL import Image from text_extraction_system.config import get_settings -from text_extraction_system.locking.socket_lock import get_lock -from text_extraction_system.pdf.errors import InputFileDoesNotExist, \ - OutputPDFDoesNotExistAfterConversion -from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, \ - run_process, prepare_large_data_file +from text_extraction_system.pdf.errors import InputFileDoesNotExist, OutputPDFDoesNotExistAfterConversion +from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, run_process from text_extraction_system.processes import raise_from_process, render_process_msg log = logging.getLogger(__name__) -SOFFICE_CALL_BASE_ARGUMENTS = ['--headless', '--invisible', '--nodefault', '--view', - '--nolockcheck', '--nologo', '--norestore', '--nofirststartwizard', ] - -def convert_image_to_pdf(src_fn: str, - out_fn: str, - timeout_sec: int = 1800) -> CompletedProcess: +def convert_image_to_pdf(src_fn: str, out_fn: str, timeout_sec: int = 1800) -> CompletedProcess: """ Converts image to pdf file using custom Java solution """ @@ -33,33 +26,8 @@ def convert_image_to_pdf(src_fn: str, return run_process(args, timeout_sec) -def soffice_convert_to_pdf(src_fn: str, - directory: str, - soffice_single_process_locking: bool = True, - timeout_sec: int = 1800) -> CompletedProcess: - """ - Converts image to pdf file using custom Java solution - """ - with prepare_large_data_file(src_fn, directory) as prepared_fn: - args = ['soffice', *SOFFICE_CALL_BASE_ARGUMENTS, - '--convert-to', 'pdf', - prepared_fn, - '--outdir', directory] - - # Soffice does not allow running multiple copies of the process in environment. - # The following is a workaround mostly for in-container usage. - if soffice_single_process_locking: - with get_lock('soffice_single_process', - wait_required_listener=lambda: log.info( - 'Waiting for another conversion task to finish first...')): - return run_process(args, timeout_sec) - return run_process(args, timeout_sec) - - @contextmanager -def convert_to_pdf(src_fn: str, - soffice_single_process_locking: bool = True, - timeout_sec: int = 1800) -> Generator[str, None, None]: +def convert_to_pdf(src_fn: str, timeout_sec: int = 1800) -> Generator[str, None, None]: """ Converts the specified file to pdf file. Soffice converter allows specifying the output directory. The output file name is generated @@ -69,7 +37,6 @@ def convert_to_pdf(src_fn: str, """ if not os.path.isfile(src_fn) and not os.path.isfile(src_fn): raise InputFileDoesNotExist(src_fn) - temp_dir = tempfile.mkdtemp() source_fn = src_fn src_fn, src_fn_base, src_ext = separate_filename_basename_and_extension(src_fn, temp_dir) @@ -78,7 +45,6 @@ def convert_to_pdf(src_fn: str, # Bypass pdf file if src_ext == 'pdf': return src_fn - try: if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}: if src_ext.lower() == '.png': @@ -87,14 +53,16 @@ def convert_to_pdf(src_fn: str, rgb_im.save(source_fn) completed_process = convert_image_to_pdf(src_fn, out_fn, timeout_sec) else: - completed_process = soffice_convert_to_pdf(src_fn, temp_dir, - soffice_single_process_locking, timeout_sec) + java_modules_path = get_settings().java_modules_path + args = ['java', '-cp', f'{java_modules_path}/*', 'com.lexpredict.textextraction.ConvertToPDF', + '--original-doc', src_fn, + '--dst-pdf', out_fn] + completed_process: CompletedProcess = subprocess.run(args, check=False, timeout=timeout_sec, + universal_newlines=True, stderr=PIPE, stdout=PIPE) raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.') - if not os.path.isfile(out_fn): - raise OutputPDFDoesNotExistAfterConversion( - f'Unable to convert {src_fn} to pdf. Output file does not exist after conversion.\n' - + render_process_msg(completed_process)) + raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. Output file does not exist ' + f'after conversion.\n' + render_process_msg(completed_process)) yield out_fn finally: if os.path.isdir(temp_dir): diff --git a/text_extraction_system/text_extraction_system_java/pom.xml b/text_extraction_system/text_extraction_system_java/pom.xml index 652222e..af319f5 100644 --- a/text_extraction_system/text_extraction_system_java/pom.xml +++ b/text_extraction_system/text_extraction_system_java/pom.xml @@ -3,7 +3,6 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - org.example text_extraction_system_java 1.3 @@ -38,7 +37,6 @@ - org.apache.pdfbox @@ -176,7 +174,20 @@ commons-cli 1.4 + + org.slf4j + slf4j-api + 1.6.1 + + + org.slf4j + slf4j-simple + 1.6.1 + + + com.artofsolving + jodconverter + 2.2.1 + - - \ No newline at end of file diff --git a/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java b/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java new file mode 100644 index 0000000..90e95f1 --- /dev/null +++ b/text_extraction_system/text_extraction_system_java/src/main/java/com/lexpredict/textextraction/ConvertToPDF.java @@ -0,0 +1,61 @@ +package com.lexpredict.textextraction; + +import org.apache.commons.cli.*; + +import com.artofsolving.jodconverter.DocumentConverter; +import com.artofsolving.jodconverter.DocumentFamily; +import com.artofsolving.jodconverter.DocumentFormat; +import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection; +import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection; +import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter; + +import java.io.File; +import java.io.IOException; +import java.net.ConnectException; + + +public class ConvertToPDF { + public static void main(String[] args) throws IOException, ConnectException { + CommandLine cmd = parseCliArgs(args); + String src = cmd.getOptionValue("original-doc"); + String dstPdf = cmd.getOptionValue("dst-pdf"); + + File inputFile = new File(src); + File outputFile = new File(dstPdf); + + // connect to an OpenOffice.org instance running on port 8100 + OpenOfficeConnection connection = new SocketOpenOfficeConnection(8100); + connection.connect(); + + // convert + DocumentConverter converter = new OpenOfficeDocumentConverter(connection); + final DocumentFormat docx = new DocumentFormat("Microsoft Word 2007 XML", DocumentFamily.TEXT, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"); + converter.convert(inputFile, docx, outputFile, null); + + // close the connection + connection.disconnect(); + } + + protected static CommandLine parseCliArgs(String[] args) { + Options options = new Options(); + + Option originalPDF = new Option("orig", "original-doc", true, "Original document file to convert to PDF."); + originalPDF.setRequired(true); + options.addOption(originalPDF); + + Option dstPDF = new Option("dst", "dst-pdf", true, "File name to save the resulting PDF."); + dstPDF.setRequired(true); + options.addOption(dstPDF); + + CommandLineParser parser = new DefaultParser(); + HelpFormatter formatter = new HelpFormatter(); + try { + return parser.parse(options, args); + } catch (ParseException e) { + System.out.println(e.getMessage()); + formatter.printHelp(ConvertToPDF.class.getName(), options); + System.exit(1); + } + return null; + } +} \ No newline at end of file