
Commit 022328a

Only run sentence splitting once, even if the user has manually added a redundant sentence splitter
Fix a couple of broken tests: the changed ssplit behavior makes the tests work differently. Removing the SentencesAnnotation when a tokenizer is run twice allows the sentence splitter to run again, although it should be pointed out that re-annotating a document is not a great idea in the first place.
1 parent 9cc32f2 commit 022328a
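
The pipeline-level effect, as a minimal sketch (hypothetical example, not code from this commit; it assumes the standard StanfordCoreNLP Properties interface and that, in this version, the tokenize annotator already produces the sentence split on its own): a redundant "ssplit" entry no longer causes the document to be split a second time.

    import java.util.List;
    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;
    import edu.stanford.nlp.util.CoreMap;

    public class RedundantSsplitSketch {
      public static void main(String[] args) {
        Properties props = new Properties();
        // "ssplit" is redundant here; tokenize already provides the sentence split
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation("This is one sentence. This is another.");
        pipeline.annotate(doc);

        // Sentence splitting runs once; the redundant splitter backs off
        // (see the WordsToSentencesAnnotator change below) instead of re-splitting.
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println("Sentences: " + sentences.size());
      }
    }

The updated tests below drop the explicit "ssplit" entry for the same reason: tokenize alone now satisfies the sentence-split requirement of the downstream annotators.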

File tree

itest/src/edu/stanford/nlp/pipeline/RequirementsCorrectSlowITest.java
src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
test/src/edu/stanford/nlp/pipeline/CleanXmlAnnotatorTest.java

4 files changed: +34 -13 lines changed


itest/src/edu/stanford/nlp/pipeline/RequirementsCorrectSlowITest.java

Lines changed: 7 additions & 7 deletions

@@ -119,27 +119,27 @@ private void testAnnotatorSequence(List<String> annotators) {
 
   @Test
   public void testDefaultPipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "lemma", "ner", "gender", "parse", "coref"));
+    testAnnotatorSequence(Arrays.asList("tokenize", "pos", "lemma", "ner", "gender", "parse", "coref"));
   }
 
   @Test
   public void testDepparsePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "depparse"));
+    testAnnotatorSequence(Arrays.asList("tokenize", "pos", "depparse"));
   }
 
   @Test
   public void testQuotePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","ner","depparse","coref","quote"));
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","ner","depparse","coref","quote"));
   }
 
-  @Test
-  public void testTrueCasePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","truecase"));
+  @Test
+  public void testTrueCasePipeline() {
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","truecase"));
   }
 
   @Test
   public void testOpenIEPipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","depparse","natlog","openie"));
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","depparse","natlog","openie"));
   }
 
   @Test

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

Lines changed: 4 additions & 0 deletions

@@ -446,6 +446,10 @@ public void annotate(Annotation annotation) {
       throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
     }
 
+    // If the annotation was already processed before and already has
+    // a SentenceAnnotation.class, recreating the tokenization
+    // invalidates any existing sentence annotation
+    annotation.remove(CoreAnnotations.SentencesAnnotation.class);
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
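
What the added remove() call means for re-annotation, as a rough sketch (hypothetical example; it uses only the TokenizerAnnotator constructor visible in the test diff below and assumes the tokenizer in this version also performs the sentence split unless tokenize.ssplit is disabled):

    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;

    public class RetokenizeSketch {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");

        Annotation annotation = new Annotation("One sentence here. And a second one.");
        tokenizer.annotate(annotation);
        // annotation now holds TokensAnnotation and, with the default tokenize.ssplit,
        // a SentencesAnnotation as well

        // Re-annotating a document is discouraged, but if the tokenizer does run again,
        // it now drops the stale SentencesAnnotation before re-tokenizing, so the
        // sentence split is rebuilt from the new tokens rather than left inconsistent.
        tokenizer.annotate(annotation);
        System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    }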

src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java

Lines changed: 11 additions & 1 deletion

@@ -35,6 +35,8 @@ public class WordsToSentencesAnnotator implements Annotator {
 
   private final boolean countLineNumbers;
 
+  private boolean loggedExtraSplit = false;
+
   public WordsToSentencesAnnotator() {
     this(false);
   }

@@ -177,10 +179,18 @@ public void annotate(Annotation annotation) {
     if (VERBOSE) {
       log.info("Sentence splitting ... " + annotation);
     }
-    if ( ! annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
+    if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
       throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
     }
 
+    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
+      if (!loggedExtraSplit) {
+        log.error("Multiple WordsToSentencesAnnotator or other sentence splitters are operating on this document!");
+        loggedExtraSplit = true;
+      }
+      return;
+    }
+
     // get text and tokens from the document
     String text = annotation.get(CoreAnnotations.TextAnnotation.class);
     List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
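
The guard in action, as a small sketch (hypothetical example, not part of the commit; the constructors are the ones shown in the diffs on this page): a second splitter call on a document that already has sentences logs the error once and returns without re-splitting.

    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;
    import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;

    public class ExtraSplitSketch {
      public static void main(String[] args) {
        // tokenize only, with its built-in split turned off so the splitter below owns it
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        props.setProperty("tokenize.ssplit", "false");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
        WordsToSentencesAnnotator splitter = new WordsToSentencesAnnotator();

        Annotation annotation = new Annotation("First sentence. Second sentence.");
        tokenizer.annotate(annotation);
        splitter.annotate(annotation);   // creates the SentencesAnnotation

        // Redundant second split: with this change it logs
        // "Multiple WordsToSentencesAnnotator or other sentence splitters ..." once
        // and returns, leaving the existing SentencesAnnotation untouched.
        splitter.annotate(annotation);
        System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
      }
    }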

test/src/edu/stanford/nlp/pipeline/CleanXmlAnnotatorTest.java

Lines changed: 12 additions & 5 deletions

@@ -37,14 +37,21 @@ public class CleanXmlAnnotatorTest {
   @Before
   public void setUp() throws Exception {
     synchronized(CleanXmlAnnotatorTest.class) {
+      // we create the TokenizerAnnotator without the ssplit so we can
+      // manually control the pieces
+      // another alternative would be to create TokenizerAnnotators
+      // with the CleanXML as part of it
       if (ptbInvertible == null) {
-        ptbInvertible =
-          new TokenizerAnnotator(false, "en", "invertible,ptb3Escaping=true");
+        Properties props = new Properties();
+        props.setProperty("tokenize.language", "en");
+        props.setProperty("tokenize.ssplit", "false");
+        ptbInvertible = new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
       }
       if (ptbNotInvertible == null) {
-        ptbNotInvertible =
-          new TokenizerAnnotator(false, "en",
-                                 "invertible=false,ptb3Escaping=true");
+        Properties props = new Properties();
+        props.setProperty("tokenize.language", "en");
+        props.setProperty("tokenize.ssplit", "false");
+        ptbNotInvertible = new TokenizerAnnotator(false, props, "invertible=false,ptb3Escaping=true");
       }
       if (cleanXmlAllTags == null) {
         cleanXmlAllTags = new CleanXmlAnnotator(".*", "", "", false);
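
Since the test now disables the tokenizer's built-in split, the remaining pieces have to be driven by hand. A sketch of that manual order (hypothetical, not code from the test file; the constructors are the ones appearing in this diff), which is presumably the point of the change: CleanXmlAnnotator can run on the raw tokens before the sentence split is created.

    import java.util.List;
    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.ling.CoreLabel;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.CleanXmlAnnotator;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;
    import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;

    public class ManualPiecesSketch {
      public static void main(String[] args) {
        // Tokenizer with its built-in sentence split disabled, as in the updated setUp()
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        props.setProperty("tokenize.ssplit", "false");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
        CleanXmlAnnotator cleanXml = new CleanXmlAnnotator(".*", "", "", false);
        WordsToSentencesAnnotator splitter = new WordsToSentencesAnnotator();

        Annotation annotation = new Annotation("<doc><p>Hello world.</p></doc>");
        tokenizer.annotate(annotation);   // tokens only, no sentences yet
        cleanXml.annotate(annotation);    // remove the XML tag tokens
        splitter.annotate(annotation);    // split the cleaned tokens into sentences

        List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
        System.out.println(tokens);
      }
    }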
