@@ -9,7 +9,8 @@ class MarkDiff(Block):
99 """Mark differences between parallel trees."""
1010
1111 def __init__ (self , gold_zone , attributes = 'form,lemma,upos,xpos,deprel,feats,misc' ,
12- mark = 1 , mark_attr = "Mark" , add = False , print_stats = 0 , ignore_parent = False , ** kwargs ):
12+ mark = 1 , mark_attr = 'Mark' , add = False , print_stats = 0 , ignore_parent = False ,
13+ align = False , align_attr = 'Align' , ** kwargs ):
1314 """Create the Mark block object.
1415 Params:
1516 gold_zone: Which of the zones should be treated as gold?
@@ -18,10 +19,19 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
1819 The tree topology, i.e. node parent is always considered.
1920 mark: What value should be used in `node.misc['Mark']` of the differing nodes?
2021 mark_attr: use this MISC attribute name instead of "Mark".
22+ Use mark_attr=0 to prevent marking diffs in MISC.
2123 add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block,
2224 so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block.
2325 print_stats: How many lines of statistics should be printed? -1 means all.
2426 ignore_parent: ignore differences in dependency parents
27+ align: store word alignment, possible values are False (no alignment stored, the default),
28+ "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord,
29+ "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and
30+ "both", i.e. both from-pred and from-gold.
31+ If only forms should be considered for inducing the word alignment,
32+ you should use "util.MarkDiff attributes='form' ignore_parent=1 align=both".
33+ Only one-to-one alignment is supported.
34+ align_attr: use this MISC attribute name instead of "Align".
2535 """
2636 super ().__init__ (** kwargs )
2737 self .gold_zone = gold_zone
@@ -31,7 +41,11 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
3141 self .add = add
3242 self .print_stats = print_stats
3343 self .ignore_parent = ignore_parent
44+ self .align = align
45+ self .align_attr = align_attr
3446 self .stats = collections .Counter ()
47+ if not mark_attr and not align and not print_stats :
48+ raise ValueError ('mark_attr=0 does not make sense without align or print_stats' )
3549
3650 def process_tree (self , tree ):
3751 gold_tree = tree .bundle .get_tree (self .gold_zone )
@@ -45,7 +59,7 @@ def process_tree(self, tree):
4559
4660 pred_nodes , gold_nodes = tree .descendants , gold_tree .descendants
4761 # Make sure both pred and gold trees are marked, even if one has just deleted nodes.
48- if len (pred_nodes ) != len (gold_nodes ):
62+ if len (pred_nodes ) != len (gold_nodes ) and self . mark_attr :
4963 tree .add_comment (f'{ self .mark_attr } = { self .mark } ' )
5064 gold_tree .add_comment (f'{ self .mark_attr } = { self .mark } ' )
5165 pred_tokens = ['_' .join (n .get_attrs (self .attrs )) for n in pred_nodes ]
@@ -59,18 +73,24 @@ def process_tree(self, tree):
5973 if edit in {'equal' , 'replace' }:
6074 for i in range (pred_lo , pred_hi ):
6175 alignment [i ] = i - pred_lo + gold_lo
76+ if self .align in ("both" , "from-pred" ):
77+ pred_nodes [i ].misc [self .align_attr ] = i - pred_lo + gold_lo + 1
78+ if self .align in ("both" , "from-gold" ):
79+ gold_nodes [i - pred_lo + gold_lo ].misc [self .align_attr ] = i + 1
6280
6381 for diff in diffs :
6482 edit , pred_lo , pred_hi , gold_lo , gold_hi = diff
6583 if edit == 'equal' :
6684 for p_node , g_node in zip (pred_nodes [pred_lo :pred_hi ], gold_nodes [gold_lo :gold_hi ]):
6785 if not self .ignore_parent and alignment .get (p_node .parent .ord - 1 ) != g_node .parent .ord - 1 :
68- p_node .misc [self .mark_attr ] = self .mark
69- g_node .misc [self .mark_attr ] = self .mark
7086 self .stats ['ONLY-PARENT-CHANGED' ] += 1
87+ if self .mark_attr :
88+ p_node .misc [self .mark_attr ] = self .mark
89+ g_node .misc [self .mark_attr ] = self .mark
7190 else :
72- for node in pred_nodes [pred_lo :pred_hi ] + gold_nodes [gold_lo :gold_hi ]:
73- node .misc [self .mark_attr ] = self .mark
91+ if self .mark_attr :
92+ for node in pred_nodes [pred_lo :pred_hi ] + gold_nodes [gold_lo :gold_hi ]:
93+ node .misc [self .mark_attr ] = self .mark
7494 if self .print_stats :
7595 if edit == 'replace' :
7696 # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED
0 commit comments