1+ import logging
12from udapi .core .block import Block
23
34class Link2Cluster (Block ):
45 """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
56
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs):
    """Create the block and remember which MISC attributes carry the link annotation.

    Args:
        id_attr: name of the node.misc attribute holding the node's own identifier.
        ante_attr: name of the node.misc attribute holding the identifier
            of the node's antecedent.
        delete_orig_attrs: if true, the original link attributes are deleted
            from node.misc once they have been read.
        **kwargs: forwarded unchanged to the parent block constructor.
    """
    super().__init__(**kwargs)
    self.id_attr = id_attr
    # Must be stored as `ante_attr` (not `ante_id`): process_document reads self.ante_attr.
    self.ante_attr = ante_attr
    self.delete_orig_attrs = delete_orig_attrs
1012
1113 def process_document (self , doc ):
1214 id2node = {}
@@ -18,20 +20,38 @@ def process_document(self, doc):
1820 ante_id = node .misc [self .ante_attr ]
1921 if ante_id != '' :
2022 links .append ([ante_id , this_id ])
23+ if self .delete_orig_attrs :
24+ for attr in (self .id_attr , self .ante_attr ):
25+ del node .misc [attr ]
26+
27+ for link in links :
28+ if link [0 ] not in id2node :
29+ logging .warning (f"{ link [0 ]} is referenced in { self .ante_attr } , but not in { self .id_attr } " )
30+ links = [link for link in links if link [0 ] in id2node ]
31+
32+ # nodeA < nodeB is a shortcut for nodeA.ord < nodeB.ord
33+ # but here we need to sort nodes from different sentences,
34+ # so we need to compare first the bundle number and then node.ord.
35+ sort_key = lambda node : (node .root .bundle .number , node .ord )
2136
2237 # sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this.
23- node_links = [sorted ([id2node [link [0 ]], id2node [link [1 ]]], reverse = True ) for link in links ]
38+ node_links = [sorted ([id2node [link [0 ]], id2node [link [1 ]]], reverse = True , key = sort_key ) for link in links ]
2439
25- # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecendent).
26- node_links .sort ()
40+ # Makes sure the links are sorted by this_node (i.e. the anaphor, not the antecedent).
41+ node_links .sort (key = lambda link : sort_key ( link [ 0 ]) )
2742
2843 # Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating
2944 # and we can prevent the need for merging two entities.
3045 for this_node , ante_node in node_links :
31- assert not this_node .mentions
32- if ante_node .mentions :
33- ante_node .entities [0 ].create_mention (head = this_node , words = [this_node ])
46+ assert not this_node .coref_mentions
47+ if ante_node .coref_mentions :
48+ ante_node .coref_entities [0 ].create_mention (head = this_node , words = [this_node ])
3449 else :
3550 entity = this_node .root .document .create_coref_entity ()
36- entity .create_mention (head = ante_node , words = [ante_node ])
37- entity .create_mention (head = this_node , words = [this_node ])
51+ m_ante = entity .create_mention (head = ante_node , words = [ante_node ])
52+ m_this = entity .create_mention (head = this_node , words = [this_node ])
53+ for node , mention in ((ante_node , m_ante ), (this_node , m_this )):
54+ if node .misc ['information-status' ]:
55+ mention .other ['infstat' ] = node .misc ['information-status' ]
56+ if self .delete_orig_attrs :
57+ del node .misc ['information-status' ]
0 commit comments