Skip to content

Commit 3658396

Browse files
authored
Unidecode optional (#38)
1 parent d6e956e commit 3658396

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

emm/preprocessing/functions.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,21 @@
1919

2020
from __future__ import annotations
2121

22+
import warnings
2223
from functools import partial
2324
from typing import Any, Callable
2425

2526
import cleanco
26-
from unidecode import unidecode
27+
28+
try:
29+
from unidecode import unidecode
30+
except ImportError:
31+
unidecode = None
32+
warnings.warn(
33+
"The 'unidecode' module is not installed. 'strip_accents_unicode' will default to an identity function. "
34+
"Install 'unidecode' to enable accent stripping functionality.",
35+
ImportWarning,
36+
)
2737

2838
from emm.preprocessing.abbreviation_util import abbreviations_to_words, legal_abbreviations_to_words
2939

@@ -50,7 +60,7 @@ def map_shorthands(name):
5060

5161
return {
5262
# Replace accented characters by their closest ASCII representation, e.g. replace 'ä' with 'a'
53-
"strip_accents_unicode": F.run_custom_function(unidecode),
63+
"strip_accents_unicode": F.run_custom_function(unidecode if unidecode is not None else (lambda x: x)),
5464
# Replace all dash and underscore characters with a space character
5565
"strip_hyphens": F.regex_replace(r"""[-_]""", " ", simple=True),
5666
# Replace all punctuation characters (e.g. '.', '-', '_', ''', ';') with spaces

pyproject.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ dependencies = [
3131
"sparse-dot-topn>=1.1.1",
3232
"joblib",
3333
"pyarrow>=6.0.1", # seems to work with spark 3.1.2 - 3.3.1
34-
"requests",
35-
"unidecode"
34+
"requests"
3635
]
3736
dynamic = ["version"]
3837

@@ -53,10 +52,14 @@ dev = [
5352
"pandoc",
5453
"pympler"
5554
]
55+
preprocessing = [
56+
"unidecode"
57+
]
5658
test = [
5759
"pytest",
5860
"pytest-ordering",
59-
"virtualenv"
61+
"virtualenv",
62+
"unidecode"
6063
]
6164
test-cov = [
6265
"coverage",

0 commit comments

Comments
 (0)