Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ docker stack deploy --compose-file ./temp/${DOCKER_COMPOSE_FILE} ${TEXT_EXTRACTI
rm -rf ./temp
echo "Deploy routines have been completed"

echo "Running OpenOffice server.."
soffice -headless -accept="socket,host=localhost,port=8100;urp;" -nofirststartwizard
30 changes: 22 additions & 8 deletions text_extraction_system/prepare_dev_env_ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,39 @@ OS_VERSION=$(awk -F= '/^VERSION_ID/{print $2}' /etc/os-release)
PYTHON3_VERSION="$(python3 -V 2>&1)"

# Install python, office, maven and tesseract
sudo apt-get install virtualenv python3-dev libreoffice maven tesseract-ocr tesseract-ocr-eng \
tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu \
tesseract-ocr-rus
sudo apt-get install virtualenv python3-dev maven tesseract-ocr tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra \
tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus

# Remove LibreOffice
sudo apt remove libreoffice-base-core libreoffice-impress libreoffice-calc libreoffice-math libreoffice-common \
libreoffice-ogltrans libreoffice-core libreoffice-pdfimport libreoffice-draw libreoffice-style-breeze \
libreoffice-gnome libreoffice-style-colibre libreoffice-gtk3 libreoffice-style-elementary \
libreoffice-help-common libreoffice-style-tango libreoffice-help-en-us libreoffice-writer
sudo apt autoremove

# Install OpenOffice
wget https://sourceforge.net/projects/openofficeorg.mirror/files/4.1.12/binaries/en-US/Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
tar -zxvf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
sudo rm -rf Apache_OpenOffice_4.1.12_Linux_x86-64_install-deb_en-US.tar.gz
cd en-US/DEBS/
sudo dpkg -i *.deb
cd desktop-integration
sudo dpkg -i *.deb
cd ../../..
sudo rm -rf en-US

# Prepare python virtual env
virtualenv -p /usr/bin/python3 .venv
source .venv/bin/activate
pip install -U wheel
pip install -U setuptools
pip install -U pip pipenv wheel setuptools
pip install -U -r requirements.txt

# Install additional python packages
pip install -U --no-deps -e ../../lexpredict-contraxsuite-core/
pip install -U -e ../text_extraction_system_api

# NLTK should be installed within lexpredict-contraxsuite-core. The following downloads its models
python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker \
wordnet omw-1.4

python3 -m nltk.downloader averaged_perceptron_tagger punkt stopwords words maxent_ne_chunker wordnet omw-1.4
deactivate

# Downloading model for language detection
Expand Down
60 changes: 14 additions & 46 deletions text_extraction_system/text_extraction_system/pdf/convert_to_pdf.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,23 @@
import logging
import os
import shutil
import subprocess
import tempfile
from contextlib import contextmanager
from subprocess import CompletedProcess
from subprocess import CompletedProcess, PIPE
from typing import Generator

from PIL import Image

from text_extraction_system.config import get_settings
from text_extraction_system.locking.socket_lock import get_lock
from text_extraction_system.pdf.errors import InputFileDoesNotExist, \
OutputPDFDoesNotExistAfterConversion
from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, \
run_process, prepare_large_data_file
from text_extraction_system.pdf.errors import InputFileDoesNotExist, OutputPDFDoesNotExistAfterConversion
from text_extraction_system.pdf.utils import separate_filename_basename_and_extension, run_process
from text_extraction_system.processes import raise_from_process, render_process_msg

log = logging.getLogger(__name__)

SOFFICE_CALL_BASE_ARGUMENTS = ['--headless', '--invisible', '--nodefault', '--view',
'--nolockcheck', '--nologo', '--norestore', '--nofirststartwizard', ]


def convert_image_to_pdf(src_fn: str,
out_fn: str,
timeout_sec: int = 1800) -> CompletedProcess:
def convert_image_to_pdf(src_fn: str, out_fn: str, timeout_sec: int = 1800) -> CompletedProcess:
"""
Converts image to pdf file using custom Java solution
"""
Expand All @@ -33,33 +26,8 @@ def convert_image_to_pdf(src_fn: str,
return run_process(args, timeout_sec)


def soffice_convert_to_pdf(src_fn: str,
directory: str,
soffice_single_process_locking: bool = True,
timeout_sec: int = 1800) -> CompletedProcess:
"""
Converts image to pdf file using custom Java solution
"""
with prepare_large_data_file(src_fn, directory) as prepared_fn:
args = ['soffice', *SOFFICE_CALL_BASE_ARGUMENTS,
'--convert-to', 'pdf',
prepared_fn,
'--outdir', directory]

# Soffice does not allow running multiple copies of the process in environment.
# The following is a workaround mostly for in-container usage.
if soffice_single_process_locking:
with get_lock('soffice_single_process',
wait_required_listener=lambda: log.info(
'Waiting for another conversion task to finish first...')):
return run_process(args, timeout_sec)
return run_process(args, timeout_sec)


@contextmanager
def convert_to_pdf(src_fn: str,
soffice_single_process_locking: bool = True,
timeout_sec: int = 1800) -> Generator[str, None, None]:
def convert_to_pdf(src_fn: str, timeout_sec: int = 1800) -> Generator[str, None, None]:
"""
Converts the specified file to pdf file.
Soffice converter allows specifying the output directory. The output file name is generated
Expand All @@ -69,7 +37,6 @@ def convert_to_pdf(src_fn: str,
"""
if not os.path.isfile(src_fn) and not os.path.isfile(src_fn):
raise InputFileDoesNotExist(src_fn)

temp_dir = tempfile.mkdtemp()
source_fn = src_fn
src_fn, src_fn_base, src_ext = separate_filename_basename_and_extension(src_fn, temp_dir)
Expand All @@ -78,7 +45,6 @@ def convert_to_pdf(src_fn: str,
# Bypass pdf file
if src_ext == 'pdf':
return src_fn

try:
if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}:
if src_ext.lower() == '.png':
Expand All @@ -87,14 +53,16 @@ def convert_to_pdf(src_fn: str,
rgb_im.save(source_fn)
completed_process = convert_image_to_pdf(src_fn, out_fn, timeout_sec)
else:
completed_process = soffice_convert_to_pdf(src_fn, temp_dir,
soffice_single_process_locking, timeout_sec)
java_modules_path = get_settings().java_modules_path
args = ['java', '-cp', f'{java_modules_path}/*', 'com.lexpredict.textextraction.ConvertToPDF',
'--original-doc', src_fn,
'--dst-pdf', out_fn]
completed_process: CompletedProcess = subprocess.run(args, check=False, timeout=timeout_sec,
universal_newlines=True, stderr=PIPE, stdout=PIPE)
raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.')

if not os.path.isfile(out_fn):
raise OutputPDFDoesNotExistAfterConversion(
f'Unable to convert {src_fn} to pdf. Output file does not exist after conversion.\n'
+ render_process_msg(completed_process))
raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. Output file does not exist '
f'after conversion.\n' + render_process_msg(completed_process))
yield out_fn
finally:
if os.path.isdir(temp_dir):
Expand Down
19 changes: 15 additions & 4 deletions text_extraction_system/text_extraction_system_java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.example</groupId>
<artifactId>text_extraction_system_java</artifactId>
<version>1.3</version>
Expand Down Expand Up @@ -38,7 +37,6 @@
</plugin-->
</plugins>
</build>

<dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
Expand Down Expand Up @@ -176,7 +174,20 @@
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>com.artofsolving</groupId>
<artifactId>jodconverter</artifactId>
<version>2.2.1</version>
</dependency>
</dependencies>


</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package com.lexpredict.textextraction;

import org.apache.commons.cli.*;

import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.DocumentFamily;
import com.artofsolving.jodconverter.DocumentFormat;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;

import java.io.File;
import java.io.IOException;
import java.net.ConnectException;


/**
 * Command-line entry point that converts a document to PDF through an
 * OpenOffice instance reachable over a local socket, using JODConverter.
 *
 * <p>Usage: {@code ConvertToPDF --original-doc <input file> --dst-pdf <output file>}
 */
public class ConvertToPDF {

    /** Port of the OpenOffice instance this tool connects to. */
    private static final int OPEN_OFFICE_PORT = 8100;

    /**
     * Parses the command line, connects to OpenOffice and converts the input
     * document into the requested output file.
     *
     * @param args command-line arguments; {@code --original-doc} and
     *             {@code --dst-pdf} are both required
     * @throws IOException      declared for I/O failures during conversion
     * @throws ConnectException if the OpenOffice instance cannot be reached
     */
    public static void main(String[] args) throws IOException, ConnectException {
        CommandLine cmd = parseCliArgs(args);
        String src = cmd.getOptionValue("original-doc");
        String dstPdf = cmd.getOptionValue("dst-pdf");

        File inputFile = new File(src);
        File outputFile = new File(dstPdf);

        // connect to an OpenOffice.org instance running on port 8100
        OpenOfficeConnection connection = new SocketOpenOfficeConnection(OPEN_OFFICE_PORT);
        connection.connect();
        try {
            DocumentConverter converter = new OpenOfficeDocumentConverter(connection);

            // The input format is declared explicitly as docx; the output format is
            // passed as null.
            // NOTE(review): presumably null lets JODConverter infer the output format
            // from the destination file extension - confirm against the library docs.
            // NOTE(review): every input is declared as a docx TEXT document regardless
            // of its real type - confirm this is intended for non-Word inputs.
            final DocumentFormat docx = new DocumentFormat(
                    "Microsoft Word 2007 XML",
                    DocumentFamily.TEXT,
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    "docx");
            converter.convert(inputFile, docx, outputFile, null);
        } finally {
            // Always release the connection, even when the conversion fails, so a
            // failed run does not leave the socket to the OpenOffice instance open.
            connection.disconnect();
        }
    }

    /**
     * Parses the command-line arguments of this tool.
     *
     * <p>On a parse error the message and the usage help are printed and the
     * JVM exits with status 1.
     *
     * @param args raw command-line arguments
     * @return the parsed command line
     */
    protected static CommandLine parseCliArgs(String[] args) {
        Options options = new Options();

        Option originalDoc = new Option("orig", "original-doc", true, "Original document file to convert to PDF.");
        originalDoc.setRequired(true);
        options.addOption(originalDoc);

        Option dstPDF = new Option("dst", "dst-pdf", true, "File name to save the resulting PDF.");
        dstPDF.setRequired(true);
        options.addOption(dstPDF);

        CommandLineParser parser = new DefaultParser();
        HelpFormatter formatter = new HelpFormatter();
        try {
            return parser.parse(options, args);
        } catch (ParseException e) {
            System.out.println(e.getMessage());
            formatter.printHelp(ConvertToPDF.class.getName(), options);
            System.exit(1);
        }
        // Unreachable after System.exit, but required by the compiler.
        return null;
    }
}