Skip to content

Commit cb50801

Browse files
committed
Store the CoNLLU features in a dedicated class. This class has a toString() which outputs in the format expected by the CoNLLU files. The big advantage of doing this is that it makes Semgrex recognize the features in the format of UD files
1 parent ec5d719 commit cb50801

File tree

9 files changed

+113
-84
lines changed

9 files changed

+113
-84
lines changed

src/edu/stanford/nlp/ling/CoreAnnotations.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package edu.stanford.nlp.ling;
22

33
import edu.stanford.nlp.ie.util.RelationTriple;
4+
import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
45
import edu.stanford.nlp.util.*;
56

67
import java.util.Calendar;
@@ -581,10 +582,10 @@ public Class<HashMap<String,String>> getType() {
581582
/**
582583
* CoNLL-U dep parsing - List of morphological features
583584
*/
584-
public static class CoNLLUFeats implements CoreAnnotation<TreeMap<String,String>> {
585+
public static class CoNLLUFeats implements CoreAnnotation<CoNLLUFeatures> {
585586
@Override
586-
public Class<TreeMap<String,String>> getType() {
587-
return ErasureUtils.uncheckedCast(TreeMap.class);
587+
public Class<CoNLLUFeatures> getType() {
588+
return CoNLLUFeatures.class;
588589
}
589590
}
590591

src/edu/stanford/nlp/ling/CoreLabel.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import java.util.TreeMap;
77
import java.util.function.Consumer;
88

9-
import edu.stanford.nlp.trees.ud.CoNLLUUtils;
9+
import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
1010
import edu.stanford.nlp.util.ArrayCoreMap;
1111
import edu.stanford.nlp.util.CoreMap;
1212
import edu.stanford.nlp.util.Generics;
@@ -201,7 +201,7 @@ private void initFromStrings(String[] keys, String[] values) {
201201
} else if (valueClass == Boolean.class) {
202202
this.set(coreKeyClass, Boolean.parseBoolean(values[i]));
203203
} else if (coreKeyClass == CoreAnnotations.CoNLLUFeats.class) {
204-
this.set(coreKeyClass, CoNLLUUtils.parseFeatures(values[i]));
204+
this.set(coreKeyClass, new CoNLLUFeatures(values[i]));
205205
} else {
206206
throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: " +
207207
"Can't handle " + valueClass + " (key " + key + ")");
@@ -254,7 +254,7 @@ private void initFromStrings(Class[] keys, String[] values) {
254254
} else if (valueClass == Boolean.class) {
255255
this.set(coreKeyClass, Boolean.parseBoolean(values[i]));
256256
} else if (coreKeyClass == CoreAnnotations.CoNLLUFeats.class) {
257-
this.set(coreKeyClass, CoNLLUUtils.parseFeatures(values[i]));
257+
this.set(coreKeyClass, new CoNLLUFeatures(values[i]));
258258
} else {
259259
throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: " +
260260
"Can't handle " + valueClass + " (key " + coreKeyClass + ")");

src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import edu.stanford.nlp.trees.LabeledScoredTreeNode;
2626
import edu.stanford.nlp.trees.Tree;
2727
import edu.stanford.nlp.trees.Trees;
28+
import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
2829
import edu.stanford.nlp.util.*;
2930
import edu.stanford.nlp.ling.CoreAnnotations;
3031
import edu.stanford.nlp.ling.CoreAnnotations.*;
@@ -1453,7 +1454,7 @@ public CoreLabel fromProto(CoreNLPProtos.Token proto) {
14531454
if (proto.hasSpan()) { word.set(SpanAnnotation.class, new IntPair(proto.getSpan().getBegin(), proto.getSpan().getEnd())); }
14541455
if (proto.hasSentiment()) { word.set(SentimentCoreAnnotations.SentimentClass.class, proto.getSentiment()); }
14551456
if (proto.hasQuotationIndex()) { word.set(QuotationIndexAnnotation.class, proto.getQuotationIndex()); }
1456-
if (proto.hasConllUFeatures()) { word.set(CoNLLUFeats.class, new TreeMap<>(fromProto(proto.getConllUFeatures()))); }
1457+
if (proto.hasConllUFeatures()) { word.set(CoNLLUFeats.class, new CoNLLUFeatures(fromProto(proto.getConllUFeatures()))); }
14571458
if (proto.hasConllUMisc()) { word.set(CoNLLUMisc.class, proto.getConllUMisc()); }
14581459
if (proto.hasCoarseTag()) { word.set(CoarseTagAnnotation.class, proto.getCoarseTag()); }
14591460
if (proto.hasConllUTokenSpan()) { word.set(CoNLLUTokenSpanAnnotation.class, new IntPair(proto.getConllUTokenSpan().getBegin(), proto.getSpan().getEnd())); }

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/EditNode.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import edu.stanford.nlp.ling.IndexedWord;
1010
import edu.stanford.nlp.semgraph.SemanticGraph;
1111
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
12-
import edu.stanford.nlp.trees.ud.CoNLLUUtils;
12+
import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
1313

1414
/**
1515
* Edit an existing node to have new attributes.
@@ -34,7 +34,7 @@ public EditNode(String nodeName, Map<String, String> attributes, String updateMo
3434
this.nodeName = nodeName;
3535
this.attributes = new TreeMap<>(attributes);
3636
if (updateMorphoFeatures != null) {
37-
this.updateMorphoFeatures = CoNLLUUtils.parseFeatures(updateMorphoFeatures);
37+
this.updateMorphoFeatures = new CoNLLUFeatures(updateMorphoFeatures);
3838
} else {
3939
this.updateMorphoFeatures = Collections.emptyMap();
4040
}
@@ -63,7 +63,7 @@ public String toEditString() {
6363
if (this.updateMorphoFeatures.size() > 0) {
6464
buf.append(Ssurgeon.UPDATE_MORPHO_FEATURES);
6565
buf.append(" ");
66-
buf.append(CoNLLUUtils.toFeatureString(this.updateMorphoFeatures));
66+
buf.append(CoNLLUFeatures.toFeatureString(this.updateMorphoFeatures));
6767
}
6868

6969
return buf.toString();
@@ -93,10 +93,10 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
9393
}
9494

9595
for (String key : updateMorphoFeatures.keySet()) {
96-
TreeMap<String, String> features = word.get(CoreAnnotations.CoNLLUFeats.class);
96+
CoNLLUFeatures features = word.get(CoreAnnotations.CoNLLUFeats.class);
9797
if (features == null) {
9898
changed = true;
99-
features = new TreeMap<>();
99+
features = new CoNLLUFeatures();
100100
word.set(CoreAnnotations.CoNLLUFeats.class, features);
101101
}
102102

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentReader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ public IndexedWord apply(String line) {
245245

246246

247247
/* Parse features. */
248-
TreeMap<String, String> features = CoNLLUUtils.parseFeatures(bits[5]);
248+
CoNLLUFeatures features = new CoNLLUFeatures(bits[5]);
249249
word.set(CoreAnnotations.CoNLLUFeats.class, features);
250250

251251

@@ -268,7 +268,7 @@ public IndexedWord apply(String line) {
268268
word.setValue(bits[1]);
269269

270270
/* Parse features. */
271-
TreeMap<String, String> features = CoNLLUUtils.parseFeatures(bits[5]);
271+
CoNLLUFeatures features = new CoNLLUFeatures(bits[5]);
272272
word.set(CoreAnnotations.CoNLLUFeats.class, features);
273273

274274
/* Parse extra dependencies. */

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
9090

9191
String additionalDepsString = CoNLLUUtils.toExtraDepsString(enhancedDependencies);
9292
String word = token.word();
93-
String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
93+
String featuresString = CoNLLUFeatures.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
9494
String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
9595
String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
9696
String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
@@ -170,7 +170,7 @@ public String printPOSAnnotations(CoreMap sentence, boolean fakeDeps) {
170170
String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
171171
String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
172172
String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
173-
String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
173+
String featuresString = CoNLLUFeatures.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
174174
String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
175175
final String head;
176176
final String rel;
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package edu.stanford.nlp.trees.ud;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.Comparator;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.TreeMap;
9+
10+
/**
 * A {@link TreeMap} from morphological feature names to feature values
 * whose {@link #toString()} renders the CoNLL-U FEATS column format
 * ({@code Name1=Value1|Name2=Value2}, or {@code _} when empty).
 * <br>
 * This is a TreeMap so that the features are kept sorted by their key.
 * Note that the map itself uses the natural (case-sensitive) String
 * ordering, whereas the rendered feature string is sorted
 * case-insensitively by {@link FeatureNameComparator}.
 */
public class CoNLLUFeatures extends TreeMap<String, String> {
  /**
   * Parses the value of the feature column in a CoNLL-U file and
   * stores each {@code Name=Value} pair in this map, with the feature
   * name as key and the feature value as value.
   * <br>
   * A {@code null} or empty string, or the placeholder {@code _},
   * produces an empty feature map.  Each pair is split at its first
   * {@code =} only, so a value which itself contains {@code =} is
   * kept intact.
   *
   * @param featureString the content of the FEATS column, pairs separated by {@code |}
   * @throws IllegalArgumentException if a pair contains no {@code =}
   */
  public CoNLLUFeatures(String featureString) {
    super();

    if (featureString == null || featureString.isEmpty() || featureString.equals("_")) {
      return;
    }

    for (String pair : featureString.split("\\|")) {
      // Split on the first '=' only: splitting on every '=' would drop
      // everything after a second '=' and would fail on "Name=".
      int separator = pair.indexOf('=');
      if (separator < 0) {
        throw new IllegalArgumentException("Malformed CoNLL-U feature '" + pair
            + "' in feature string '" + featureString + "': expected Name=Value");
      }
      this.put(pair.substring(0, separator), pair.substring(separator + 1));
    }
  }

  /**
   * Creates a feature map containing a copy of the given features.
   *
   * @param features the feature name to feature value mappings to copy
   */
  public CoNLLUFeatures(Map<String, String> features) {
    super(features);
  }

  /**
   * Creates an empty feature map.
   */
  public CoNLLUFeatures() {
    super();
  }


  /**
   * Orders feature names alphabetically, ignoring case.
   */
  public static class FeatureNameComparator implements Comparator<String> {
    @Override
    public int compare(String featureName1, String featureName2) {
      // Locale.ROOT keeps the ordering independent of the JVM's default
      // locale (e.g. the Turkish dotless i), so output files are stable.
      return featureName1.toLowerCase(java.util.Locale.ROOT)
          .compareTo(featureName2.toLowerCase(java.util.Locale.ROOT));
    }
  }

  /**
   * Converts the features to a feature string to be used
   * in a CoNLL-U file.  The pairs are sorted by feature name,
   * ignoring case, and joined with {@code |}.
   *
   * @param features the features to render; may be {@code null}
   * @return The feature string, or {@code _} if {@code features} is
   *         {@code null} or empty.
   */
  public static String toFeatureString(Map<String,String> features) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    if (features != null) {
      List<String> sortedKeys = new ArrayList<>(features.keySet());
      Collections.sort(sortedKeys, new FeatureNameComparator());
      for (String key : sortedKeys) {
        if (!first) {
          sb.append("|");
        } else {
          first = false;
        }
        sb.append(key)
          .append("=")
          .append(features.get(key));
      }
    }

    /* Empty feature list. */
    if (first) {
      sb.append("_");
    }

    return sb.toString();
  }

  /**
   * Renders this feature map in the CoNLL-U FEATS column format.
   *
   * @return the same string as {@link #toFeatureString(Map)} applied to this map
   */
  @Override
  public String toString() {
    return toFeatureString(this);
  }
}

src/edu/stanford/nlp/trees/ud/CoNLLUUtils.java

Lines changed: 0 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -9,60 +9,6 @@
99
*/
1010
public class CoNLLUUtils {
1111

12-
/**
13-
* Parses the value of the feature column in a CoNLL-U file
14-
* and returns them in a HashMap with the feature names as keys
15-
* and the feature values as values.
16-
*
17-
* @param featureString
18-
* @return A {@code HashMap<String,String>} with the feature values.
19-
*/
20-
public static TreeMap<String,String> parseFeatures(String featureString) {
21-
TreeMap<String, String> features = new TreeMap<>();
22-
if (! featureString.equals("_")) {
23-
String[] featValPairs = featureString.split("\\|");
24-
for (String p : featValPairs) {
25-
String[] featValPair = p.split("=");
26-
features.put(featValPair[0], featValPair[1]);
27-
}
28-
}
29-
return features;
30-
}
31-
32-
/**
33-
* Converts a feature HashMap to a feature string to be used
34-
* in a CoNLL-U file.
35-
*
36-
* @return The feature string.
37-
*/
38-
public static String toFeatureString(Map<String,String> features) {
39-
StringBuilder sb = new StringBuilder();
40-
boolean first = true;
41-
if (features != null) {
42-
List<String> sortedKeys = new ArrayList<>(features.keySet());
43-
Collections.sort(sortedKeys, new FeatureNameComparator());
44-
for (String key : sortedKeys) {
45-
if (!first) {
46-
sb.append("|");
47-
} else {
48-
first = false;
49-
}
50-
51-
sb.append(key)
52-
.append("=")
53-
.append(features.get(key));
54-
55-
}
56-
}
57-
58-
/* Empty feature list. */
59-
if (first) {
60-
sb.append("_");
61-
}
62-
63-
return sb.toString();
64-
}
65-
6612
/**
6713
* Parses the value of the extra dependencies column in a CoNLL-U file
6814
* and returns them in a HashMap with the governor indices as keys
@@ -118,14 +64,6 @@ public static String toExtraDepsString(HashMap<String,String> extraDeps) {
11864
}
11965

12066

121-
public static class FeatureNameComparator implements Comparator<String> {
122-
123-
@Override
124-
public int compare(String featureName1, String featureName2) {
125-
return featureName1.toLowerCase().compareTo(featureName2.toLowerCase());
126-
}
127-
}
128-
12967
public static class DepIndexComparator implements Comparator<String> {
13068

13169
@Override

src/edu/stanford/nlp/trees/ud/UniversalDependenciesFeatureAnnotator.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ public class UniversalDependenciesFeatureAnnotator {
5353

5454

5555
private static final String FEATURE_MAP_FILE = "edu/stanford/nlp/models/ud/feature_map.txt";
56-
private HashMap<String,TreeMap<String,String>> posFeatureMap;
57-
private Map<String,TreeMap<String,String>> wordPosFeatureMap;
56+
private HashMap<String,CoNLLUFeatures> posFeatureMap;
57+
private Map<String,CoNLLUFeatures> wordPosFeatureMap;
5858

5959
private final Morphology morphology = new Morphology();
6060

@@ -78,9 +78,9 @@ private void loadFeatureMap() {
7878
if (parts.length < 3) continue;
7979

8080
if (parts[0].equals("*")) {
81-
posFeatureMap.put(parts[1], CoNLLUUtils.parseFeatures(parts[2]));
81+
posFeatureMap.put(parts[1], new CoNLLUFeatures(parts[2]));
8282
} else {
83-
wordPosFeatureMap.put(parts[0] + '_' + parts[1], CoNLLUUtils.parseFeatures(parts[2]));
83+
wordPosFeatureMap.put(parts[0] + '_' + parts[1], new CoNLLUFeatures(parts[2]));
8484
}
8585
}
8686
} catch (IOException e) {
@@ -392,10 +392,10 @@ public void addFeatures(SemanticGraph sg, Tree tree, boolean addLemma, boolean a
392392
String posTag = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
393393
String token = word.get(CoreAnnotations.TextAnnotation.class);
394394
Integer index = word.get(CoreAnnotations.IndexAnnotation.class);
395-
TreeMap<String, String> wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
395+
CoNLLUFeatures wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
396396

397397
if (wordFeatures == null) {
398-
wordFeatures = new TreeMap<>();
398+
wordFeatures = new CoNLLUFeatures();
399399
word.set(CoreAnnotations.CoNLLUFeats.class, wordFeatures);
400400
}
401401

0 commit comments

Comments
 (0)