Store the CoNLL-U features in a dedicated class. This class has a toString() which outputs them in the format expected by CoNLL-U files. The big advantage of doing this is that it makes Semgrex recognize the features in the format used by UD files

AngledLuffa · AngledLuffa · commit cb50801f8292 · 2023-11-03T12:48:51.000-07:00
diff --git a/src/edu/stanford/nlp/ling/CoreAnnotations.java b/src/edu/stanford/nlp/ling/CoreAnnotations.java
@@ -1,6 +1,7 @@
 package edu.stanford.nlp.ling;
 
 import edu.stanford.nlp.ie.util.RelationTriple;
+import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
 import edu.stanford.nlp.util.*;
 
 import java.util.Calendar;
@@ -581,10 +582,10 @@ public Class<HashMap<String,String>> getType() {
   /**
    * CoNLL-U dep parsing - List of morphological features
    */
-  public static class CoNLLUFeats implements CoreAnnotation<TreeMap<String,String>> {
+  public static class CoNLLUFeats implements CoreAnnotation<CoNLLUFeatures> {
     @Override
-    public Class<TreeMap<String,String>> getType() {
-      return ErasureUtils.uncheckedCast(TreeMap.class);
+    public Class<CoNLLUFeatures> getType() {
+      return CoNLLUFeatures.class;
     }
   }
 
diff --git a/src/edu/stanford/nlp/ling/CoreLabel.java b/src/edu/stanford/nlp/ling/CoreLabel.java
@@ -6,7 +6,7 @@
 import java.util.TreeMap;
 import java.util.function.Consumer;
 
-import edu.stanford.nlp.trees.ud.CoNLLUUtils;
+import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
 import edu.stanford.nlp.util.ArrayCoreMap;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Generics;
@@ -201,7 +201,7 @@ private void initFromStrings(String[] keys, String[] values) {
           } else if (valueClass == Boolean.class) {
             this.set(coreKeyClass, Boolean.parseBoolean(values[i]));
           } else if (coreKeyClass == CoreAnnotations.CoNLLUFeats.class) {
-            this.set(coreKeyClass, CoNLLUUtils.parseFeatures(values[i]));
+            this.set(coreKeyClass, new CoNLLUFeatures(values[i]));
           } else {
             throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: " +
                                                     "Can't handle " + valueClass + " (key " + key + ")");
@@ -254,7 +254,7 @@ private void initFromStrings(Class[] keys, String[] values) {
         } else if (valueClass == Boolean.class) {
           this.set(coreKeyClass, Boolean.parseBoolean(values[i]));
         } else if (coreKeyClass == CoreAnnotations.CoNLLUFeats.class) {
-          this.set(coreKeyClass, CoNLLUUtils.parseFeatures(values[i]));
+          this.set(coreKeyClass, new CoNLLUFeatures(values[i]));
         } else {
           throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: " +
                                                   "Can't handle " + valueClass + " (key " + coreKeyClass + ")");
diff --git a/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java b/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java
@@ -25,6 +25,7 @@
 import edu.stanford.nlp.trees.LabeledScoredTreeNode;
 import edu.stanford.nlp.trees.Tree;
 import edu.stanford.nlp.trees.Trees;
+import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
 import edu.stanford.nlp.util.*;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreAnnotations.*;
@@ -1453,7 +1454,7 @@ public CoreLabel fromProto(CoreNLPProtos.Token proto) {
     if (proto.hasSpan()) { word.set(SpanAnnotation.class, new IntPair(proto.getSpan().getBegin(), proto.getSpan().getEnd())); }
     if (proto.hasSentiment()) { word.set(SentimentCoreAnnotations.SentimentClass.class, proto.getSentiment()); }
     if (proto.hasQuotationIndex()) { word.set(QuotationIndexAnnotation.class, proto.getQuotationIndex()); }
-    if (proto.hasConllUFeatures()) { word.set(CoNLLUFeats.class, new TreeMap<>(fromProto(proto.getConllUFeatures()))); }
+    if (proto.hasConllUFeatures()) { word.set(CoNLLUFeats.class, new CoNLLUFeatures(fromProto(proto.getConllUFeatures()))); }
     if (proto.hasConllUMisc()) { word.set(CoNLLUMisc.class, proto.getConllUMisc()); }
     if (proto.hasCoarseTag()) { word.set(CoarseTagAnnotation.class, proto.getCoarseTag()); }
     if (proto.hasConllUTokenSpan()) { word.set(CoNLLUTokenSpanAnnotation.class, new IntPair(proto.getConllUTokenSpan().getBegin(), proto.getSpan().getEnd())); }
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/EditNode.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/EditNode.java
@@ -9,7 +9,7 @@
 import edu.stanford.nlp.ling.IndexedWord;
 import edu.stanford.nlp.semgraph.SemanticGraph;
 import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
-import edu.stanford.nlp.trees.ud.CoNLLUUtils;
+import edu.stanford.nlp.trees.ud.CoNLLUFeatures;
 
 /**
  * Edit an existing node to have new attributes.
@@ -34,7 +34,7 @@ public EditNode(String nodeName, Map<String, String> attributes, String updateMo
     this.nodeName = nodeName;
     this.attributes = new TreeMap<>(attributes);
     if (updateMorphoFeatures != null) {
-      this.updateMorphoFeatures = CoNLLUUtils.parseFeatures(updateMorphoFeatures);
+      this.updateMorphoFeatures = new CoNLLUFeatures(updateMorphoFeatures);
     } else {
       this.updateMorphoFeatures = Collections.emptyMap();
     }
@@ -63,7 +63,7 @@ public String toEditString() {
     if (this.updateMorphoFeatures.size() > 0) {
       buf.append(Ssurgeon.UPDATE_MORPHO_FEATURES);
       buf.append(" ");
-      buf.append(CoNLLUUtils.toFeatureString(this.updateMorphoFeatures));
+      buf.append(CoNLLUFeatures.toFeatureString(this.updateMorphoFeatures));
     }
 
     return buf.toString();
@@ -93,10 +93,10 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     }
 
     for (String key : updateMorphoFeatures.keySet()) {
-      TreeMap<String, String> features = word.get(CoreAnnotations.CoNLLUFeats.class);
+      CoNLLUFeatures features = word.get(CoreAnnotations.CoNLLUFeats.class);
       if (features == null) {
         changed = true;
-        features = new TreeMap<>();
+        features = new CoNLLUFeatures();
         word.set(CoreAnnotations.CoNLLUFeats.class, features);
       }
 
diff --git a/src/edu/stanford/nlp/trees/ud/CoNLLUDocumentReader.java b/src/edu/stanford/nlp/trees/ud/CoNLLUDocumentReader.java
@@ -245,7 +245,7 @@ public IndexedWord apply(String line) {
 
 
         /* Parse features. */
-        TreeMap<String, String> features = CoNLLUUtils.parseFeatures(bits[5]);
+        CoNLLUFeatures features = new CoNLLUFeatures(bits[5]);
         word.set(CoreAnnotations.CoNLLUFeats.class, features);
 
 
@@ -268,7 +268,7 @@ public IndexedWord apply(String line) {
         word.setValue(bits[1]);
 
         /* Parse features. */
-        TreeMap<String, String> features = CoNLLUUtils.parseFeatures(bits[5]);
+        CoNLLUFeatures features = new CoNLLUFeatures(bits[5]);
         word.set(CoreAnnotations.CoNLLUFeats.class, features);
 
         /* Parse extra dependencies. */
diff --git a/src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java b/src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java
@@ -90,7 +90,7 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
 
             String additionalDepsString =  CoNLLUUtils.toExtraDepsString(enhancedDependencies);
             String word = token.word();
-            String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
+            String featuresString = CoNLLUFeatures.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
             String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
             String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
             String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
@@ -170,7 +170,7 @@ public String printPOSAnnotations(CoreMap sentence, boolean fakeDeps) {
           String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
           String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
           String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
-          String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
+          String featuresString = CoNLLUFeatures.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
           String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
           final String head;
           final String rel;
diff --git a/src/edu/stanford/nlp/trees/ud/CoNLLUFeatures.java b/src/edu/stanford/nlp/trees/ud/CoNLLUFeatures.java
@@ -0,0 +1,89 @@
+package edu.stanford.nlp.trees.ud;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * A subclass of TreeMap with a toString() that renders the features as a
+ * CoNLL-U feature column and a constructor that parses them from a CoNLL-U string
+ * <br>
+ * This is a TreeMap so that the features are sorted by their key,
+ * which is necessary for the CoNLL-U format
+ */
+public class CoNLLUFeatures extends TreeMap<String, String> {
+  /**
+   * Parses a CoNLL-U feature column: split on {@code |}, then each piece on {@code =}.
+   * NOTE(review): a piece lacking {@code =}, or a null argument, would throw - confirm inputs.
+   * @param featureString the raw column, e.g. {@code A=b|C=d}; a bare {@code _} means no features
+   */
+  public CoNLLUFeatures(String featureString) {
+    super();
+
+    if (!featureString.equals("_")) {
+      String[] featValPairs = featureString.split("\\|");
+      for (String p : featValPairs) {
+        String[] featValPair = p.split("=");
+        this.put(featValPair[0], featValPair[1]);
+      }
+    }
+  }
+
+  /** Creates a feature map containing the same mappings as the given map. */
+  public CoNLLUFeatures(Map<String, String> features) {
+    super(features);
+  }
+
+  /** Creates an empty feature map. */
+  public CoNLLUFeatures() {
+    super();
+  }
+
+
+  /** Compares feature names by their lower-cased forms. */
+  public static class FeatureNameComparator implements Comparator<String> {
+    @Override
+    public int compare(String featureName1, String featureName2) {
+      return featureName1.toLowerCase().compareTo(featureName2.toLowerCase());
+    }
+  }
+
+  /**
+   * Renders features as {@code key=value} pairs joined by {@code |}, keys sorted ignoring case.
+   * @param features the features to render; null is treated as empty
+   * @return The feature string for a CoNLL-U file, or {@code _} when there are no features.
+   */
+  public static String toFeatureString(Map<String,String> features) {
+    StringBuilder sb = new StringBuilder();
+    boolean first = true;
+    if (features != null) {
+      List<String> sortedKeys = new ArrayList<>(features.keySet());
+      Collections.sort(sortedKeys, new FeatureNameComparator());
+      for (String key : sortedKeys) {
+        if (!first) {
+          sb.append("|");
+        } else {
+          first = false;
+        }
+        sb.append(key)
+          .append("=")
+          .append(features.get(key));
+        
+      }
+    }
+    /* Empty feature list. */
+    if (first) {
+      sb.append("_");
+    }
+
+    return sb.toString();
+  }
+
+  /** Returns this map as a CoNLL-U feature string, as produced by toFeatureString. */
+  public String toString() {
+    return toFeatureString(this);
+  }
+}
diff --git a/src/edu/stanford/nlp/trees/ud/CoNLLUUtils.java b/src/edu/stanford/nlp/trees/ud/CoNLLUUtils.java
@@ -9,60 +9,6 @@
  */
 public class CoNLLUUtils {
 
-    /**
-     * Parses the value of the feature column in a CoNLL-U file
-     * and returns them in a HashMap with the feature names as keys
-     * and the feature values as values.
-     *
-     * @param featureString
-     * @return A {@code HashMap<String,String>} with the feature values.
-     */
-    public static TreeMap<String,String> parseFeatures(String featureString) {
-        TreeMap<String, String> features = new TreeMap<>();
-        if (! featureString.equals("_")) {
-            String[] featValPairs = featureString.split("\\|");
-            for (String p : featValPairs) {
-                String[] featValPair = p.split("=");
-                features.put(featValPair[0], featValPair[1]);
-            }
-        }
-        return features;
-    }
-
-    /**
-     * Converts a feature HashMap to a feature string to be used
-     * in a CoNLL-U file.
-     *
-     * @return The feature string.
-     */
-    public static String toFeatureString(Map<String,String> features) {
-        StringBuilder sb = new StringBuilder();
-        boolean first = true;
-        if (features != null) {
-            List<String> sortedKeys = new ArrayList<>(features.keySet());
-            Collections.sort(sortedKeys, new FeatureNameComparator());
-            for (String key : sortedKeys) {
-                if (!first) {
-                    sb.append("|");
-                } else {
-                    first = false;
-                }
-
-                sb.append(key)
-                        .append("=")
-                        .append(features.get(key));
-
-            }
-        }
-
-    /* Empty feature list. */
-        if (first) {
-            sb.append("_");
-        }
-
-        return sb.toString();
-    }
-
     /**
      * Parses the value of the extra dependencies column in a CoNLL-U file
      * and returns them in a HashMap with the governor indices as keys
@@ -118,14 +64,6 @@ public static String toExtraDepsString(HashMap<String,String> extraDeps) {
     }
 
 
-    public static class FeatureNameComparator implements Comparator<String> {
-
-        @Override
-        public int compare(String featureName1, String featureName2) {
-            return featureName1.toLowerCase().compareTo(featureName2.toLowerCase());
-        }
-    }
-
     public static class DepIndexComparator implements Comparator<String> {
 
         @Override
diff --git a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesFeatureAnnotator.java b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesFeatureAnnotator.java
@@ -53,8 +53,8 @@ public class UniversalDependenciesFeatureAnnotator  {
 
 
   private static final String FEATURE_MAP_FILE = "edu/stanford/nlp/models/ud/feature_map.txt";
-  private HashMap<String,TreeMap<String,String>> posFeatureMap;
-  private Map<String,TreeMap<String,String>> wordPosFeatureMap;
+  private HashMap<String,CoNLLUFeatures> posFeatureMap;
+  private Map<String,CoNLLUFeatures> wordPosFeatureMap;
 
   private final Morphology morphology = new Morphology();
 
@@ -78,9 +78,9 @@ private void loadFeatureMap() {
         if (parts.length < 3) continue;
 
         if (parts[0].equals("*")) {
-          posFeatureMap.put(parts[1], CoNLLUUtils.parseFeatures(parts[2]));
+          posFeatureMap.put(parts[1], new CoNLLUFeatures(parts[2]));
         } else {
-          wordPosFeatureMap.put(parts[0] + '_' + parts[1], CoNLLUUtils.parseFeatures(parts[2]));
+          wordPosFeatureMap.put(parts[0] + '_' + parts[1], new CoNLLUFeatures(parts[2]));
         }
       }
     } catch (IOException e) {
@@ -392,10 +392,10 @@ public void addFeatures(SemanticGraph sg, Tree tree, boolean addLemma, boolean a
       String posTag = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
       String token = word.get(CoreAnnotations.TextAnnotation.class);
       Integer index = word.get(CoreAnnotations.IndexAnnotation.class);
-      TreeMap<String, String> wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
+      CoNLLUFeatures wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
 
       if (wordFeatures == null) {
-        wordFeatures = new TreeMap<>();
+        wordFeatures = new CoNLLUFeatures();
         word.set(CoreAnnotations.CoNLLUFeats.class, wordFeatures);
       }