@@ -377,6 +377,8 @@ import edu.stanford.nlp.util.logging.Redwood;
377377 * If an apparent negative number is generated from inside a hyphenated word
378378 * (e.g., for "11-20", we first tokenize "11" and then appear to have generated "-20"),
379379 * then tokenize the hyphen separately as a hyphen or dash.
380+ * <p>
381+ * Note that this method has side effects: it may push back characters.
380382 */
381383 private void handleHyphenatedNumber(String in) {
382384 // Strip dashes from hyphenated words
@@ -573,9 +575,9 @@ SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
573575
574576/* Note that JFlex doesn't support {2,} pattern form. Only {j,k}. */
575577DATE = {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {2,4}| {DIGIT} {4} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2}
576- /* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
577- NUM = {DIGIT} *( [ ., \u066B\u066C ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
578- LEADING_NUM = {DIGIT} +( [ .,\u066B\u066C ] {DIGIT} +)+
578+ /* Note that NUM also includes times like 12:55. A NUM can start with a . or U+066B (a decimal separator), but not with a : or , */
579+ NUM = {DIGIT} *( [ .\u066B ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
580+ LEADING_NUM = {DIGIT} +( [ .: ,\u066B\u066C ] {DIGIT} +)+
579581/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
580582 years or times in parentheses), and having them in tokens messes up treebank parsing.
581583 NUMBER = [\-+]?{NUM}|\({NUM}\) */
@@ -1002,10 +1004,11 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
10021004 if (DEBUG ) { logger. info(" Used {DATE} to recognize " + origTxt + " as " + txt); }
10031005 return getNext(txt, origTxt);
10041006 }
1005- {NUMBER} { String txt = yytext();
1006- handleHyphenatedNumber(txt);
1007- if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
1008- return getNext(removeFromNumber(yytext()), yytext());
1007+ {NUMBER} { handleHyphenatedNumber(yytext());
1008+ String origTxt = yytext();
1009+ String txt = removeFromNumber(origTxt);
1010+ if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + origTxt + " as " + txt); }
1011+ return getNext(txt, origTxt);
10091012 }
10101013{SUBSUPNUM} { String txt = yytext();
10111014 if (DEBUG ) { logger. info(" Used {SUBSUPNUM} to recognize " + txt); }
0 commit comments