
Commit 022328a

Only run sentence splitting once, even if the user has manually added a redundant sentence splitter
Fix a couple of broken tests: the changed ssplit behavior makes the tests work differently. Removing the SentencesAnnotation when a tokenizer is run twice allows the sentence splitter to run again, although it should be pointed out that re-annotating a document is not a great idea in the first place.
1 parent 9cc32f2 commit 022328a
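
The pipeline-level effect, as a minimal sketch (hypothetical example, not code from this commit; it assumes the standard StanfordCoreNLP Properties interface and that, in this version, the tokenize annotator already produces the sentence split on its own): a redundant "ssplit" entry no longer causes the document to be split a second time.

    import java.util.List;
    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;
    import edu.stanford.nlp.util.CoreMap;

    public class RedundantSsplitSketch {
      public static void main(String[] args) {
        Properties props = new Properties();
        // "ssplit" is redundant here; tokenize already provides the sentence split
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation("This is one sentence. This is another.");
        pipeline.annotate(doc);

        // Sentence splitting runs once; the redundant splitter backs off
        // (see the WordsToSentencesAnnotator change below) instead of re-splitting.
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println("Sentences: " + sentences.size());
      }
    }

The updated tests below drop the explicit "ssplit" entry for the same reason: tokenize alone now satisfies the sentence-split requirement of the downstream annotators.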

File tree

itest/src/edu/stanford/nlp/pipeline/RequirementsCorrectSlowITest.java
src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
test/src/edu/stanford/nlp/pipeline/CleanXmlAnnotatorTest.java

4 files changed: +34 -13 lines changed


itest/src/edu/stanford/nlp/pipeline/RequirementsCorrectSlowITest.java

Lines changed: 7 additions & 7 deletions

@@ -119,27 +119,27 @@ private void testAnnotatorSequence(List<String> annotators) {
 
   @Test
   public void testDefaultPipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "lemma", "ner", "gender", "parse", "coref"));
+    testAnnotatorSequence(Arrays.asList("tokenize", "pos", "lemma", "ner", "gender", "parse", "coref"));
   }
 
   @Test
   public void testDepparsePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "depparse"));
+    testAnnotatorSequence(Arrays.asList("tokenize", "pos", "depparse"));
   }
 
   @Test
   public void testQuotePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","ner","depparse","coref","quote"));
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","ner","depparse","coref","quote"));
   }
 
-  @Test
-  public void testTrueCasePipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","truecase"));
+  @Test
+  public void testTrueCasePipeline() {
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","truecase"));
   }
 
   @Test
   public void testOpenIEPipeline() {
-    testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","depparse","natlog","openie"));
+    testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","depparse","natlog","openie"));
   }
 
   @Test

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

Lines changed: 4 additions & 0 deletions

@@ -446,6 +446,10 @@ public void annotate(Annotation annotation) {
       throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
     }
 
+    // If the annotation was already processed before and already has
+    // a SentenceAnnotation.class, recreating the tokenization
+    // invalidates any existing sentence annotation
+    annotation.remove(CoreAnnotations.SentencesAnnotation.class);
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
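
What the added remove() call means for re-annotation, as a rough sketch (hypothetical example; it uses only the TokenizerAnnotator constructor visible in the test diff below and assumes the tokenizer in this version also performs the sentence split unless tokenize.ssplit is disabled):

    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;

    public class RetokenizeSketch {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");

        Annotation annotation = new Annotation("One sentence here. And a second one.");
        tokenizer.annotate(annotation);
        // annotation now holds TokensAnnotation and, with the default tokenize.ssplit,
        // a SentencesAnnotation as well

        // Re-annotating a document is discouraged, but if the tokenizer does run again,
        // it now drops the stale SentencesAnnotation before re-tokenizing, so the
        // sentence split is rebuilt from the new tokens rather than left inconsistent.
        tokenizer.annotate(annotation);
        System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    }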

src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java

Lines changed: 11 additions & 1 deletion

@@ -35,6 +35,8 @@ public class WordsToSentencesAnnotator implements Annotator {
 
   private final boolean countLineNumbers;
 
+  private boolean loggedExtraSplit = false;
+
   public WordsToSentencesAnnotator() {
     this(false);
   }

@@ -177,10 +179,18 @@ public void annotate(Annotation annotation) {
     if (VERBOSE) {
       log.info("Sentence splitting ... " + annotation);
     }
-    if ( ! annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
+    if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
       throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
     }
 
+    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
+      if (!loggedExtraSplit) {
+        log.error("Multiple WordsToSentencesAnnotator or other sentence splitters are operating on this document!");
+        loggedExtraSplit = true;
+      }
+      return;
+    }
+
     // get text and tokens from the document
     String text = annotation.get(CoreAnnotations.TextAnnotation.class);
     List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
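
The guard in action, as a small sketch (hypothetical example, not part of the commit; the constructors are the ones shown in the diffs on this page): a second splitter call on a document that already has sentences logs the error once and returns without re-splitting.

    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;
    import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;

    public class ExtraSplitSketch {
      public static void main(String[] args) {
        // tokenize only, with its built-in split turned off so the splitter below owns it
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        props.setProperty("tokenize.ssplit", "false");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
        WordsToSentencesAnnotator splitter = new WordsToSentencesAnnotator();

        Annotation annotation = new Annotation("First sentence. Second sentence.");
        tokenizer.annotate(annotation);
        splitter.annotate(annotation);   // creates the SentencesAnnotation

        // Redundant second split: with this change it logs
        // "Multiple WordsToSentencesAnnotator or other sentence splitters ..." once
        // and returns, leaving the existing SentencesAnnotation untouched.
        splitter.annotate(annotation);
        System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
      }
    }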

test/src/edu/stanford/nlp/pipeline/CleanXmlAnnotatorTest.java

Lines changed: 12 additions & 5 deletions

@@ -37,14 +37,21 @@ public class CleanXmlAnnotatorTest {
   @Before
   public void setUp() throws Exception {
     synchronized(CleanXmlAnnotatorTest.class) {
+      // we create the TokenizerAnnotator without the ssplit so we can
+      // manually control the pieces
+      // another alternative would be to create TokenizerAnnotators
+      // with the CleanXML as part of it
       if (ptbInvertible == null) {
-        ptbInvertible =
-          new TokenizerAnnotator(false, "en", "invertible,ptb3Escaping=true");
+        Properties props = new Properties();
+        props.setProperty("tokenize.language", "en");
+        props.setProperty("tokenize.ssplit", "false");
+        ptbInvertible = new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
       }
       if (ptbNotInvertible == null) {
-        ptbNotInvertible =
-          new TokenizerAnnotator(false, "en",
-                                 "invertible=false,ptb3Escaping=true");
+        Properties props = new Properties();
+        props.setProperty("tokenize.language", "en");
+        props.setProperty("tokenize.ssplit", "false");
+        ptbNotInvertible = new TokenizerAnnotator(false, props, "invertible=false,ptb3Escaping=true");
       }
       if (cleanXmlAllTags == null) {
         cleanXmlAllTags = new CleanXmlAnnotator(".*", "", "", false);
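
Since the test now disables the tokenizer's built-in split, the remaining pieces have to be driven by hand. A sketch of that manual order (hypothetical, not code from the test file; the constructors are the ones appearing in this diff), which is presumably the point of the change: CleanXmlAnnotator can run on the raw tokens before the sentence split is created.

    import java.util.List;
    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.ling.CoreLabel;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.CleanXmlAnnotator;
    import edu.stanford.nlp.pipeline.TokenizerAnnotator;
    import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;

    public class ManualPiecesSketch {
      public static void main(String[] args) {
        // Tokenizer with its built-in sentence split disabled, as in the updated setUp()
        Properties props = new Properties();
        props.setProperty("tokenize.language", "en");
        props.setProperty("tokenize.ssplit", "false");
        TokenizerAnnotator tokenizer =
            new TokenizerAnnotator(false, props, "invertible,ptb3Escaping=true");
        CleanXmlAnnotator cleanXml = new CleanXmlAnnotator(".*", "", "", false);
        WordsToSentencesAnnotator splitter = new WordsToSentencesAnnotator();

        Annotation annotation = new Annotation("<doc><p>Hello world.</p></doc>");
        tokenizer.annotate(annotation);   // tokens only, no sentences yet
        cleanXml.annotate(annotation);    // remove the XML tag tokens
        splitter.annotate(annotation);    // split the cleaned tokens into sentences

        List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
        System.out.println(tokens);
      }
    }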
