1313import edu .stanford .nlp .ling .CoreAnnotations ;
1414import edu .stanford .nlp .objectbank .DelimitRegExIterator ;
1515import edu .stanford .nlp .objectbank .IteratorFromReaderFactory ;
16- import java . util . function . Function ;
16+ import edu . stanford . nlp . util . ArrayUtils ;
1717import edu .stanford .nlp .util .StringUtils ;
18+ import java .util .function .Function ;
1819
1920
2021/**
@@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
3435 //map can be something like "word=0,tag=1,answer=2"
// Holds the Class array that init(String) obtains from CoreLabel.parseStringKeys.
3536 @ SuppressWarnings ("rawtypes" )
3637 private Class [] map ; // = null;
// Position of CoreAnnotations.TextAnnotation within map, as computed in init(String).
// Starts at -1; the column parser only trims a column when this value is >= 0.
38+ private int wordColumn = -1 ;
// Assigned in init(String) from DelimitRegExIterator.getFactory.
3739 private IteratorFromReaderFactory <List <CoreLabel >> factory ;
3840
3941// public void init(SeqClassifierFlags flags) {
@@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
/**
 * Configures this reader from a column-mapping string.
 * Parses the mapping into the key array, records the index of the
 * CoreAnnotations.TextAnnotation column, and creates the iterator factory
 * that is stored in the factory field.
 *
 * @param map column specification; the field comment in this class gives
 *            "word=0,tag=1,answer=2" as an example of the format
 */
5153 public void init (String map ) {
5254 // this.flags = null;
5355 this .map = CoreLabel .parseStringKeys (StringUtils .mapStringToArray (map ));
// Remember which column holds the token text so it can be trimmed per line later.
// Presumably ArrayUtils.indexOf yields -1 when no column maps to TextAnnotation,
// matching the ">= 0" guard used by the parser -- TODO confirm against ArrayUtils.
56+ this .wordColumn = ArrayUtils .indexOf (this .map , CoreAnnotations .TextAnnotation .class );
// The first argument is the regex handed to DelimitRegExIterator; presumably it
// separates documents, each of which is then passed to a ColumnDocParser -- verify.
// NOTE(review): the spaces inside this pattern look like artifacts of the diff
// rendering rather than part of the real literal -- confirm against the actual file.
5457 factory = DelimitRegExIterator .getFactory ("\n (?:\\ s*\n )+" , new ColumnDocParser ());
5558 }
5659
@@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
8790 if (info .length == 1 ) {
8891 info = whitePattern .split (line );
8992 }
93+ // Trimming later rather than splitting on all whitespace
94+ // gives us the possibility of tokens with whitespace in them
95+ // although obviously not at the start or end...
96+ // doesn't slow the classifier down too much
97+ if (wordColumn >= 0 ) {
98+ info [wordColumn ] = info [wordColumn ].trim ();
99+ }
90100 CoreLabel wi ;
91101 try {
92102 wi = new CoreLabel (map , info );
0 commit comments