@@ -50,22 +50,34 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar
5050 Args:
5151 check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT.
5252 The default is false, which means that fixed punctuation is detected only
53- based on the form with the exception of single quote / apostrophe character,
54- which is frequently ambiguous, so UPOS=PUNCT is checked always.
55- copy_to_enhanced: for all PUNCT nodes, let the enhanced depencies be the same
56- as the basic dependencies.
53+ based on the form, with the exception of the single and double quote characters,
54+ which are frequently ambiguous*, so UPOS=PUNCT is always checked for them.
55+ *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol.
56+ copy_to_enhanced: for all upos=PUNCT, let the enhanced dependencies
57+ be the same as the basic dependencies.
5758 """
5859 super ().__init__ (** kwargs )
5960 self ._punct_type = None
6061 self .check_paired_punct_upos = check_paired_punct_upos
6162 self .copy_to_enhanced = copy_to_enhanced
6263
def _is_punct(self, node):
    """Decide whether ``node`` should be treated as punctuation.

    A node whose UPOS is PUNCT always counts. Otherwise, unless
    ``self.check_paired_punct_upos`` is set, a node also counts when its
    form is one of the keys or values of PAIRED_PUNCT. The single and
    double quote characters are excluded from this form-based detection,
    so they count only when tagged PUNCT.

    Args:
        node: object exposing ``upos`` and ``form`` attributes.

    Returns:
        bool: True if the node is considered punctuation, False otherwise.
    """
    if node.upos == 'PUNCT':
        return True
    if self.check_paired_punct_upos:
        # Strict mode: only the UPOS tag decides.
        return False
    # Exact match against the two quote characters. A substring test on a
    # str literal would also accept an empty form or the two-character
    # form consisting of both quotes.
    if node.form in ("'", '"'):
        return False
    return node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values()
74+
6375 def process_tree (self , root ):
6476 # First, make sure no PUNCT has children.
6577 # This may introduce multiple subroots, which will be fixed later on
6678 # (preventing to temporarily create multiple subroots here would prevent fixing some errors).
6779 for node in root .descendants :
68- while node .parent . upos == 'PUNCT' :
80+ while self . _is_punct ( node .parent ) :
6981 node .parent = node .parent .parent
7082
7183 # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type.
@@ -77,7 +89,7 @@ def process_tree(self, root):
7789 self ._punct_type = [None ] * (1 + len (root .descendants ))
7890 for node in root .descendants :
7991 if self ._punct_type [node .ord ] != 'closing' :
80- closing_punct = PAIRED_PUNCT .get (node .form , None )
92+ closing_punct = PAIRED_PUNCT .get (node .form )
8193 if closing_punct is not None :
8294 self ._fix_paired_punct (root , node , closing_punct )
8395
@@ -99,6 +111,8 @@ def process_tree(self, root):
99111 # This may not hold if the original subroot was a paired punctuation, which was rehanged.
100112 if root .children [0 ].udeprel != 'root' :
101113 root .children [0 ].udeprel = 'root'
114+ if self .copy_to_enhanced :
115+ root .children [0 ].deps = [{'parent' : root , 'deprel' : 'root' }]
102116 for another_node in root .children [0 ].descendants :
103117 if another_node .udeprel == 'root' :
104118 another_node .udeprel = 'punct'
@@ -107,7 +121,7 @@ def process_tree(self, root):
107121 if self .copy_to_enhanced :
108122 for node in root .descendants :
109123 if node .upos == 'PUNCT' :
110- node .deps = [{'parent' : node .parent , 'deprel' : 'punct' }]
124+ node .deps = [{'parent' : node .parent , 'deprel' : node . deprel }]
111125
112126 def _fix_subord_punct (self , node ):
113127 # Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
@@ -148,13 +162,13 @@ def _fix_subord_punct(self, node):
148162 if l_cand is None or l_cand .is_root ():
149163 l_cand , l_path = None , []
150164 else :
151- while (not l_cand .parent .is_root () and l_cand .parent . precedes ( node )
152- and not node . precedes ( l_cand .descendants (add_self = 1 )[- 1 ]) ):
165+ while (not l_cand .parent .is_root () and l_cand .parent < node
166+ and not node < l_cand .descendants (add_self = 1 )[- 1 ]):
153167 l_cand = l_cand .parent
154168 l_path .append (l_cand )
155169 if r_cand is not None :
156- while (not r_cand .parent .is_root () and node . precedes ( r_cand .parent )
157- and not r_cand .descendants (add_self = 1 )[0 ]. precedes ( node ) ):
170+ while (not r_cand .parent .is_root () and node < r_cand .parent
171+ and not r_cand .descendants (add_self = 1 )[0 ] < node ):
158172 r_cand = r_cand .parent
159173 r_path .append (r_cand )
160174
@@ -203,7 +217,7 @@ def _causes_gap(self, node):
203217
204218 def _fix_paired_punct (self , root , opening_node , closing_punct ):
205219 if (self .check_paired_punct_upos
206- or opening_node .form == "'" ) and opening_node .upos != 'PUNCT' :
220+ or opening_node .form in "'\" " ) and opening_node .upos != 'PUNCT' :
207221 return
208222 nested_level = 0
209223 for node in root .descendants [opening_node .ord :]:
@@ -226,8 +240,8 @@ def _fix_pair(self, root, opening_node, closing_node):
226240 if node == opening_node or node == closing_node :
227241 continue
228242 # If this is a node inside of the pair, is its parent outside?
229- if opening_node . precedes ( node ) and node . precedes ( closing_node ) :
230- if node .parent . precedes ( opening_node ) or closing_node . precedes ( node .parent ) :
243+ if node > opening_node and node < closing_node :
244+ if node .parent < opening_node or node .parent > closing_node :
231245 if node .upos == 'PUNCT' :
232246 punct_heads .append (node )
233247 else :
@@ -236,12 +250,11 @@ def _fix_pair(self, root, opening_node, closing_node):
236250 # they also must not cause non-projectivity of other relations. This could
237251 # happen if an outside node is attached to an inside node. To account for
238252 # this, mark the inside parent as a head, too.
239- else :
240- if opening_node .precedes (node .parent ) and node .parent .precedes (closing_node ):
241- if node .parent .upos == 'PUNCT' :
242- punct_heads .append (node .parent )
243- else :
244- heads .append (node .parent )
253+ elif node .parent > opening_node and node .parent < closing_node :
254+ if node .parent .upos == 'PUNCT' :
255+ punct_heads .append (node .parent )
256+ else :
257+ heads .append (node .parent )
245258
246259 # Punctuation should not have children, but if there is no other head candidate,
247260 # let's break this rule.
0 commit comments