@@ -476,11 +476,6 @@ import edu.stanford.nlp.util.logging.Redwood;
476476 return - 1 ;
477477 }
478478
479- private Object getNext() {
480- String txt = yytext();
481- return getNext(txt, txt);
482- }
483-
484479 /* * Make the next token.
485480 * If the begin character offset exceeds what can be stored in 32 bits, it is
486481 * entered as Integer.MAX_VALUE and an error is logged.
@@ -509,10 +504,12 @@ import edu.stanford.nlp.util.logging.Redwood;
509504 }
510505 }
511506
507+ /*
512508 private void fixJFlex4SpaceAfterTokenBug() {
513509 // try to work around an apparent jflex bug where it
514510 // gets a space at the token end by getting
515511 // wrong the length of the trailing context.
512+ // cdm2022: This bug no longer seems to exist; tested on several megabytes of text
516513 while (yylength() > 0) {
517514 char last = yycharat(yylength()-1);
518515 if (last == ' ' || last == '\t' || (last >= '\n' && last <= '\r' || last == '\u0085')) {
@@ -523,9 +520,10 @@ import edu.stanford.nlp.util.logging.Redwood;
523520 }
524521 }
525522 }
523+ */
526524
527525 private Object processAcronym() {
528- fixJFlex4SpaceAfterTokenBug();
526+ // fixJFlex4SpaceAfterTokenBug();
529527 String s;
530528 if (yylength() == 2 ) { // "I.", etc. Treat as "I" + "."
531529 yypushback(1 ); // return a period next time;
@@ -543,7 +541,7 @@ import edu.stanford.nlp.util.logging.Redwood;
543541 }
544542
545543 private Object processAbbrev3() {
546- fixJFlex4SpaceAfterTokenBug();
544+ // fixJFlex4SpaceAfterTokenBug();
547545 String txt = yytext();
548546 if (DEBUG ) { logger. info(" Used {ABBREV3} to recognize " + txt); }
549547 return getNext(txt, txt);
@@ -595,6 +593,7 @@ DIGIT = [:digit:]|[\u07C0-\u07C9]
595593DATE = {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {2,4}| {DIGIT} {4} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2}
596594/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
597595NUM = {DIGIT} *( [ .,\u066B\u066C ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
596+ LEADING_NUM = {DIGIT} +( [ .,\u066B\u066C ] {DIGIT} +)+
598597/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
599598 years or times in parentheses), and having them in tokens messes up
600599 treebank parsing.
@@ -623,10 +622,14 @@ SEP_SUFFIX = ({SEP_CURRENCY}|{SEP_UNITS}|{SEP_OTHER})
623622LETTER = ([:letter:]| {SPLET} | [ \u00AD\u200C\u200D\u2060\u0237 - \u024F\u02C2 - \u02C5\u02D2 - \u02DF\u02E5 - \u02FF\u0300 - \u036F\u0370 - \u037D\u0384\u0385\u03CF\u03F6\u03FC - \u03FF\u0483 - \u0487\u04CF\u04F6 - \u04FF\u0510 - \u0525\u055A - \u055F\u0591 - \u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0615 - \u061A\u063B - \u063F\u064B - \u065E\u0670\u06D6 - \u06EF\u06FA - \u06FF\u070F\u0711\u0730 - \u074F\u0750 - \u077F\u07A6 - \u07B1\u07CA - \u07F5\u07FA\u0900 - \u0903\u093C\u093E - \u094E\u0951 - \u0955\u0962 - \u0963\u0981 - \u0983\u09BC - \u09C4\u09C7\u09C8\u09CB - \u09CD\u09D7\u09E2\u09E3\u0A01 - \u0A03\u0A3C\u0A3E - \u0A4F\u0A81 - \u0A83\u0ABC - \u0ACF\u0B82\u0BBE - \u0BC2\u0BC6 - \u0BC8\u0BCA - \u0BCD\u0C01 - \u0C03\u0C3E - \u0C56\u0D3E - \u0D44\u0D46 - \u0D48\u0E30 - \u0E3A\u0E47 - \u0E4E\u0EB1 - \u0EBC\u0EC8 - \u0ECD ] )
624623/* Allow in the zero-width (non-)joiner characters. Allow in Modifier non-spacing (= separated accent chars) */
625624WORD = {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*( [ .!?] {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*)*
625+ /* VARIANT THAT CAN'T END IN A NUMBER. Seemed needed for use with trailing number context, though unclear why */
626+ WORD_LETTER = {LETTER} | {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*( [ .!?] {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*)* {LETTER}
626627/* THING: The $ was for things like New$;
627628 WAS: only keep hyphens with short one side like co-ed. But (old) treebank just allows hyphenated things as words!
628629 THING allows d'Avignon or NUMBER before HYPHEN and the same things after it. Only first number can be negative. */
629630THING = ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUMBER} )( {HYPHEN} ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUM} ))*
631+ /* Variant of THING that must end in a letter; used with trailing number context to work around the trailing context bug */
632+ THING_LETTER = ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUMBER} )( {HYPHEN} ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUM} ))*\p{Alpha}
630633THINGA = [ A- Z] +(( [ +&] | {SPAMP} ) [ A- Z] +)+
631634THING3 = [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}( \\ ? \/ [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}){1,2}
632635APOS = [ '\u0092\u2019 ´] | ' /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
@@ -916,6 +919,16 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
916919 if (DEBUG ) { logger. info(" Used {DIGIT}/{SEP_SUFFIX} to recognize " + txt); }
917920 return getNext(txt, txt);
918921 }
922+ /* For a WORD directly in front of a decimal number or dotted number sequence, leave the latter alone. */
923+ /* Sometimes this is for currencies like RM = Malaysian currency, DM = Deutschmark, SK = Swedish Krona, etc. */
924+ {WORD_LETTER} / {LEADING_NUM} { final String origTxt = yytext(); // LEADING_NUM after '/' is trailing context: it must follow but is not part of the matched text
925+ String tok = LexerUtils . removeSoftHyphens(origTxt); // normalized form; presumably strips soft hyphens (per helper name) - confirm in LexerUtils
926+ if (americanize) {
927+ tok = Americanize . americanize(tok); // only rewritten when the americanize option is set
928+ }
929+ if (DEBUG ) { logger. info(" Used {WORD_LETTER} to recognize " + origTxt + " as " + tok); }
930+ return getNext(tok, origTxt); // normalized token first, original matched text second
931+ }
919932{WORD} { final String origTxt = yytext();
920933 String tok = LexerUtils . removeSoftHyphens(origTxt);
921934 if (americanize) {
@@ -982,11 +995,6 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
982995 if (DEBUG ) { logger. info(" Used {DATE} to recognize " + origTxt + " as " + txt); }
983996 return getNext(txt, origTxt);
984997 }
985- /* Malaysian currency */
986- RM/ {NUM} { String txt = yytext();
987- if (DEBUG ) { logger. info(" Used Malaysian currency to recognize " + txt); }
988- return getNext(txt, txt);
989- }
990998{NUMBER} { String txt = yytext();
991999 handleHyphenatedNumber(txt);
9921000 if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
@@ -1453,7 +1461,16 @@ RM/{NUM} { String txt = yytext();
14531461 " ; probablyLeft=" + false ); }
14541462 return getNext( norm, tok) ;
14551463 } */
1456- {THING} { breakByHyphensSlashes(yytext()); // this was causing fail of attempted to pushback too much!
1464+ {THING_LETTER} / {LEADING_NUM} { // THING ending in a letter, matched only when a LEADING_NUM follows as trailing context
1465+ breakByHyphensSlashes(yytext()); // this was causing a failure from attempting to push back too much!
1466+ String tok = yytext(); // re-read after breakByHyphensSlashes; NOTE(review): presumably that helper may push back characters, shortening the match - confirm
1467+ /* A THING can contain a quote, as in O'Malley */
1468+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle); // the literal false is what the debug message below reports as probablyLeft
1469+ if (DEBUG ) { logger. info(" Used {THING_LETTER} to recognize " + tok + " as " + norm +
1470+ " ; probablyLeft=" + false ); }
1471+ return getNext(norm, tok); // normalized form first, original text second
1472+ }
1473+ {THING} { breakByHyphensSlashes(yytext()); // this was causing fail of attempt to pushback too much!
14571474 String tok = yytext();
14581475 /* A THING can contain quote like O'Malley */
14591476 String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
@@ -1582,17 +1599,17 @@ RM/{NUM} { String txt = yytext();
15821599 this . seenUntokenizableCharacter = true ;
15831600 break ;
15841601 case NONE_KEEP :
1585- return getNext();
1602+ return getNext(str, str );
15861603 case FIRST_KEEP :
15871604 if ( ! this . seenUntokenizableCharacter) {
15881605 logger. warning(msg);
15891606 this . seenUntokenizableCharacter = true ;
15901607 }
1591- return getNext();
1608+ return getNext(str, str );
15921609 case ALL_KEEP :
15931610 logger. warning(msg);
15941611 this . seenUntokenizableCharacter = true ;
1595- return getNext();
1612+ return getNext(str, str );
15961613 }
15971614 }
15981615<<EOF>> { if (invertible) {
0 commit comments