Skip to content

Commit 3f75b7c

Browse files
add discard_punctuation option
1 parent be5c646 commit 3f75b7c

File tree

1 file changed

+51
-4
lines changed

1 file changed

+51
-4
lines changed

docs/ssyn2es.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import argparse
44
import fileinput
5+
import sys
6+
import unicodedata
57

68

79
def parse_args():
@@ -10,28 +12,69 @@ def parse_args():
1012
parser.add_argument('files', metavar='FILE', nargs='*',
1113
help='files to read, if empty, stdin is used')
1214

15+
parser.add_argument("--discard-punctuation", action='store_true',
16+
help='if set, skip words that consist of puctuation chars')
1317
parser.add_argument('-p', '--output-predicate', action='store_true',
1418
help='if set, output predicates')
1519
args = parser.parse_args()
1620
return args
1721

1822

19-
def load_synonyms(files, output_predicate, discard_punctuation=False):
    """Read comma-separated synonym entries and group their words by group id.

    Blank lines are ignored.  Only the first nine comma-separated fields of
    each line are looked at:

    * field 0 -- group id, used as the key of the returned dict
    * field 1 -- ``"2"`` marks a predicate entry
    * field 2 -- ``"2"`` marks a deleted entry (always skipped); ``"1"``
      routes the word to the second list of its group
    * field 8 -- the word itself

    Args:
        files: file names handed to ``fileinput.input``.
        output_predicate: when false, predicate entries are skipped.
        discard_punctuation: when true, entries whose word satisfies
            ``is_punctuation_word`` are skipped and reported on stderr.
            Defaults to ``False`` so two-argument calls keep working.

    Returns:
        A dict mapping each group id to a two-element list of word lists:
        index 1 collects the words whose field 2 is ``"1"``, index 0
        collects every other kept word.

    Raises:
        IndexError: if a non-blank line is missing a field that gets read
            (up to nine fields are required).
    """
    synonyms = {}
    # Named ``stream`` so the builtin ``input`` is not shadowed.
    with fileinput.input(files=files) as stream:
        # Count from 1 so the reported position is a real line number.
        # The counter keeps running across files when several are given.
        for line_no, line in enumerate(stream, start=1):
            line = line.strip()
            if not line:
                continue
            entry = line.split(",")[0:9]

            is_deleted = entry[2] == "2"
            is_predicate = entry[1] == "2"
            if is_deleted or (is_predicate and not output_predicate):
                continue
            # Check the flag first so the per-character scan is only done
            # when punctuation filtering was actually requested.
            if discard_punctuation and is_punctuation_word(entry[8]):
                print(f"skip punctuation entry {entry[8]} at line {line_no}",
                      file=sys.stderr)
                continue

            group = synonyms.setdefault(entry[0], [[], []])
            group[1 if entry[2] == "1" else 0].append(entry[8])

    return synonyms
3345

3446

47+
# Unicode General Category codes that count as punctuation.  The list is
# meant to match what elasticsearch_sudachi uses for punctuation.
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
#
# Trailing comments give the `Character.*` constant name for each code.
# is_punctuation_word() compares the result of unicodedata.category()
# against these entries.
punctuation_categories = [
    "Zs",  # Character.SPACE_SEPARATOR
    "Zl",  # Character.LINE_SEPARATOR
    "Zp",  # Character.PARAGRAPH_SEPARATOR
    "Cc",  # Character.CONTROL
    "Cf",  # Character.FORMAT
    "Pd",  # Character.DASH_PUNCTUATION
    "Ps",  # Character.START_PUNCTUATION
    "Pe",  # Character.END_PUNCTUATION
    "Pc",  # Character.CONNECTOR_PUNCTUATION
    "Po",  # Character.OTHER_PUNCTUATION
    "Sm",  # Character.MATH_SYMBOL
    "Sc",  # Character.CURRENCY_SYMBOL
    "Sk",  # Character.MODIFIER_SYMBOL
    "So",  # Character.OTHER_SYMBOL
    "Pi",  # Character.INITIAL_QUOTE_PUNCTUATION
    "Pf",  # Character.FINAL_QUOTE_PUNCTUATION
]
67+
68+
69+
def is_punctuation_word(word: str):
    """Tell whether *word* consists solely of punctuation-category characters.

    Every character's Unicode general category is obtained with
    ``unicodedata.category`` and looked up in ``punctuation_categories``.
    The result is ``False`` as soon as one character falls outside that
    list; an empty string yields ``True`` because nothing fails the check.
    """
    return all(
        unicodedata.category(ch) in punctuation_categories for ch in word
    )
76+
77+
3578
def dump_synonyms(synonyms, file=None):
3679
for groupid in sorted(synonyms):
3780
group = synonyms[groupid]
@@ -48,7 +91,11 @@ def dump_synonyms(synonyms, file=None):
4891
def main():
    """Parse the command line, load the synonym groups and dump them."""
    options = parse_args()
    dump_synonyms(
        load_synonyms(
            options.files,
            options.output_predicate,
            options.discard_punctuation,
        )
    )
53100

54101

0 commit comments

Comments
 (0)