22
33import argparse
44import fileinput
5+ import sys
6+ import unicodedata
57
68
79def parse_args ():
@@ -10,28 +12,69 @@ def parse_args():
1012 parser .add_argument ('files' , metavar = 'FILE' , nargs = '*' ,
1113 help = 'files to read, if empty, stdin is used' )
1214
15+ parser .add_argument ("--discard-punctuation" , action = 'store_true' ,
16+ help = 'if set, skip words that consist of puctuation chars' )
1317 parser .add_argument ('-p' , '--output-predicate' , action = 'store_true' ,
1418 help = 'if set, output predicates' )
1519 args = parser .parse_args ()
1620 return args
1721
1822
def load_synonyms(files, output_predicate, discard_punctuation=False):
    """Read comma-separated synonym entries and group the words by group id.

    Each non-blank line is split on "," and only the first nine fields are
    used: field 0 is the group id, field 1 marks predicates (value "2"),
    field 2 is a flag ("2" marks a deleted entry, "1" selects the second
    list of the group) and field 8 is the word itself.

    Args:
        files: File names handed to ``fileinput.input``; when empty,
            ``fileinput`` falls back to stdin.
        output_predicate: If false, entries whose field 1 is "2" are skipped.
        discard_punctuation: If true, entries whose word consists only of
            punctuation characters (per ``is_punctuation_word``) are skipped
            and reported on stderr.

    Returns:
        A dict mapping group id to a pair of lists ``[flag_other, flag_1]``:
        words whose field 2 is "1" go into the second list, all remaining
        kept words go into the first.

    Raises:
        IndexError: If a non-blank line has fewer than nine fields.
    """
    synonyms = {}
    with fileinput.input(files=files) as stream:
        # Count from 1 so the diagnostic below matches editor line numbers.
        # The count is cumulative across all input files.
        for line_no, raw_line in enumerate(stream, start=1):
            line = raw_line.strip()
            if line == "":
                continue
            entry = line.split(",")[0:9]

            is_deleted = entry[2] == "2"
            is_predicate = entry[1] == "2"
            if is_deleted or (is_predicate and not output_predicate):
                continue
            # Test the cheap flag first so the character scan only runs
            # when punctuation filtering was actually requested.
            if discard_punctuation and is_punctuation_word(entry[8]):
                print(f"skip punctuation entry {entry[8]} at line {line_no}",
                      file=sys.stderr)
                continue

            group = synonyms.setdefault(entry[0], [[], []])
            group[1 if entry[2] == "1" else 0].append(entry[8])

    return synonyms
3345
3446
# Unicode general categories that are treated as punctuation in
# elasticsearch_sudachi.
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
punctuation_categories = [
    "Zs",  # Character.SPACE_SEPARATOR
    "Zl",  # Character.LINE_SEPARATOR
    "Zp",  # Character.PARAGRAPH_SEPARATOR
    "Cc",  # Character.CONTROL
    "Cf",  # Character.FORMAT
    "Pd",  # Character.DASH_PUNCTUATION
    "Ps",  # Character.START_PUNCTUATION
    "Pe",  # Character.END_PUNCTUATION
    "Pc",  # Character.CONNECTOR_PUNCTUATION
    "Po",  # Character.OTHER_PUNCTUATION
    "Sm",  # Character.MATH_SYMBOL
    "Sc",  # Character.CURRENCY_SYMBOL
    "Sk",  # Character.MODIFIER_SYMBOL
    "So",  # Character.OTHER_SYMBOL
    "Pi",  # Character.INITIAL_QUOTE_PUNCTUATION
    "Pf",  # Character.FINAL_QUOTE_PUNCTUATION
]


def is_punctuation_word(word: str) -> bool:
    """Return True if every character of *word* is a punctuation character.

    A character counts as punctuation when its Unicode general category
    (``unicodedata.category``) is listed in ``punctuation_categories``.
    An empty string has no offending character and therefore yields True.
    """
    return all(
        unicodedata.category(char) in punctuation_categories
        for char in word
    )
76+
77+
3578def dump_synonyms (synonyms , file = None ):
3679 for groupid in sorted (synonyms ):
3780 group = synonyms [groupid ]
@@ -48,7 +91,11 @@ def dump_synonyms(synonyms, file=None):
def main():
    """Parse the command line, load the synonym entries and dump them.

    The parsed ``files``, ``output_predicate`` and ``discard_punctuation``
    options are forwarded to ``load_synonyms``; the resulting groups are
    passed to ``dump_synonyms`` with its default output target.
    """
    args = parse_args()

    synonyms = load_synonyms(
        args.files,
        args.output_predicate,
        args.discard_punctuation,
    )
    dump_synonyms(synonyms)
53100
54101
0 commit comments