@@ -266,7 +266,7 @@ import edu.stanford.nlp.util.logging.Redwood;
266266
267267
268268 /* * Turn on to find out how things were tokenized. */
269- private static final boolean DEBUG = false ;
269+ private static final boolean DEBUG = false ; // keep false in committed code; when true, the DEBUG-guarded rule actions each log their match via logger.info
270270
271271 /* * A logger for this class */
272272 private static final Redwood . RedwoodChannels logger = Redwood . channels(PTBLexer . class);
@@ -571,13 +571,13 @@ import edu.stanford.nlp.util.logging.Redwood;
571571SENTEND1 = {SPACENL} ( {SPACENL} |[:uppercase:]| {SGML1} )
572572SENTEND2 = {SPACE} ( {SPACE} |[:uppercase:]| {SGML2} )
573573
574+ /* Note that JFlex doesn't support {2,} pattern form. Only {j,k}. */
574575DATE = {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {2,4}| {DIGIT} {4} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2}
575576/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
576577NUM = {DIGIT} *( [ .,\u066B\u066C ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
577578LEADING_NUM = {DIGIT} +( [ .,\u066B\u066C ] {DIGIT} +)+
578579/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
579- years or times in parentheses), and having them in tokens messes up
580- treebank parsing.
580+ years or times in parentheses), and having them in tokens messes up treebank parsing.
581581 NUMBER = [\-+]?{NUM}|\({NUM}\) */
582582NUMBER = [ \- \u2212 +] ? {NUM}
583583SUBSUPNUM = [ \u207A\u207B\u208A\u208B ] ?( [ \u2070\u00B9\u00B2\u00B3\u2074 - \u2079 ] +| [ \u2080 - \u2089 ] +)
@@ -614,7 +614,7 @@ THING_LETTER = ([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}
614614THINGA = [ A- Z] +(( [ +&] | {SPAMP} ) [ A- Z] +)+
615615THING3 = [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}( \\ ? \/ [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}){1,2}
616616APOS = [ '\u0092\u2019 ´] | ' /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
617- /* Includes extra ones that may appear inside a word, rightly or wrongly */
617+ /* Includes extra ones that may appear inside a word, rightly or wrongly: ASCII backquote, CP1252 left curly quote, left curly quote, high upside down left curly quote */
618618APOSETCETERA = {APOS} | [ `\u0091\u2018\u201B ]
619619/* HTHING recognizes hyphenated words, including ones with various kinds of numbers in them. And with underscores. */
620620HTHING = [\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit} .,\u00AD\u200C\u200D\u2060 ] *( [- _] ( [\p{Alpha}\p{Digit} \u00AD\u200C\u200D\u2060 ] +( \. {DIGIT} +)?| {ACRO2} \. ))+
@@ -639,21 +639,24 @@ HTHINGEXCEPTIONPREFIXED = (e|a|u|x|agro|ante|anti|arch|be|bi|bio|co|counter|cros
639639HTHINGEXCEPTIONSUFFIXED = ( [\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit} .,\u00AD ] *)( -)( esque| ette| fest| fold| gate| itis| less| most| o-torium| rama| wise)( s| es| d| ed)?
640640HTHINGEXCEPTIONWHOLE = ( mm-hm| mm-mm| o-kay| uh-huh| uh-oh)( s| es| d| ed)?
641641
642- /* things like 'll and 'm */
643- REDAUX = {APOSETCETERA} ( [ msdMSD ] | re| ve| ll)
642+ /* things like 'll and 'm and 'em for them */
643+ REDAUX = {APOSETCETERA} ( m | s | d | re| ve| ll| em )
644644/* For things that will have n't on the end. They can't end in 'n' */
645645/* \u00AD is soft hyphen. \u2060 is word joiner */
646- SWORD = [\p{Alpha} \u00AD\u200C\u200D\u2060 ] * [ A- MO- Za- mo- z][ \u00AD\u200C\u200D\u2060 ] *
647- SREDAUX = n{APOSETCETERA} t
648- /* Tokens you want but already okay: C'mon 'n' '[2-9]0s '[eE]m 'till?
649- [Yy]'all 'Cause Shi'ite B'Gosh o'clock. Here now only need apostrophe
650- final words. */
651- /* Note that Jflex doesn't support {2,} form. Only {2,k}. */
652- /* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
653- /* Rest are for French borrowings. n allows n'ts in "don'ts" */
654- /* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
655- APOWORD = {APOS} n{APOS} ?| [ lLdDjJ] {APOS} |( Dunkin| somethin| ol) {APOS} | {APOS} em| diff{APOSETCETERA} rent| [ A- HJ- XZn] {APOSETCETERA} [:letter:]{2}[:letter:]*| {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s| {APOS} till?|[:letter:][:letter:]* [ aáeiouhlpyAEIOUY] {APOSETCETERA} [ aeiíoulA- Z] [:letter:]*| {APOS} cause| cont{APOSETCETERA} d\. ?| nor{APOSETCETERA} easter| c{APOSETCETERA} mon| e{APOSETCETERA} er| s{APOSETCETERA} mores| ev{APOSETCETERA} ry| li{APOSETCETERA} l| nat{APOSETCETERA} l| ass{APOSETCETERA} t| 'twixt| O{APOSETCETERA} o
656- APOWORD2 = y{APOS}
646+ WORD_NOT = [\p{Alpha} \u00AD\u200C\u200D\u2060 ] * [ A- MO- Za- mo- z][ \u00AD\u200C\u200D\u2060 ] *
647+ REDAUX_NOT = n{APOSETCETERA} ts?
648+
649+ /* 2022 tokenizer change. We generally allow apostrophes (including curly ones) into words. This is much better for
650+ * Hebrew, Arabic, Star Trek and some Black American names, etc. We only separate off word forms with apostrophes
651+ * that are known common word shortenings or clitics.
652+ */
653+ /* Tokens you want: 'n' '[2-9]0s '[eE]m 'till? 'Cause Shi'ite B'Gosh o'clock 'Twixt
654+ Here now only need apostrophe initial or final words listed. */
655+ /* Single letters are for French borrowings. */
656+ /* Arguably, c'mon should be split to "c'm" + "on", but not yet. */
657+ APOWORD = {WORD} ( {APOSETCETERA}{WORD} )+|\p{Script=Latin} {APOSETCETERA} [ A- Z] \. ( [ A- Z] \. )+| {APOS} n{APOS} ?|( [ lLdDjJ] | Dunkin| somethin| ol) {APOS} | {APOS} ( em| till?| cause| twixt| [ 1- 9] 0s)| [ 1- 9] 0{APOS} s
658+ /* APOWORD2 is things we will strip at beginning of word: th' shortening "the" (Th'enchanting) and y' shortening "you" (y'know, y'all) */
659+ APOWORD2 = ( th| y) {APOS}
657660/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
658661FULLURL = ( ftp| svn| svn\+ ssh| http| https| mailto) :\/\/ [^ \t\n\f\r <>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + [^ \t\n\f\r <>|.!?¡¿,·;:&`\"\'\* \p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
659662LIKELYURL = (( www\. ( [^ \t\n\f\r `<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + \. )+ [ a- zA- Z] {2,4})|(( [^ \t\n\f\r `<>|.!?,:\/ $\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + \. )+( com| net| org| edu)))( \/ [^ \t\n\f\r `<>|] + [^ \t\n\f\r `<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-] )?
@@ -769,26 +772,34 @@ INSENTP = [,;:\u3001\u0F0D]
769772QUOTES = {APOS} | [ `\u2018 - \u201F\u0082\u0084\u0091 - \u0094\u2039\u203A\u00AB\u00BB ] {1,2}
770773DBLQUOT = \" | "| [ `'\u0091\u0092\u2018\u2019 ] '
771774/* Cap'n for captain, c'est for french */
772- TBSPEC = -( RRB| LRB| RCB| LCB| RSB| LSB) -| C\. D\. s| pro-| anti-| S( &| &) P-500| S( &| &) Ls| Cap {APOS} n | c {APOS} est
775+ TBSPEC = -( RRB| LRB| RCB| LCB| RSB| LSB) -| C\. D\. s| pro-| anti-| S( &| &) P-500| S( &| &) Ls
773776SWEARING = f[- *][- c*] k( in[ g'] ?| e[ dr] )?| f[- *] ( in[ g'] ?| e[ dr] )|( bull| dip)? s[ h@][- \* #] t( ty| e| box| s)?| c[- *] nts?| p[- *] ss( e[ sd] | ing)?| c[- *] ck| b[- *] tch| t[- *] ts| tw[- *] ts?| cr[- *] p| d[- *] cks?| b[- *][- *s] t[- *] rds?| pr[- *] ck| d[- *] mn| bl[- *] {2,2} dy
774777TBSPEC2 = {APOS} [ 0- 9][ 0- 9]
775778BANGWORDS = ( E| Yahoo| Jeopardy) \!
776779BANGMAGAZINES = OK\!
777780
781+ /* Allows covid-19 variants (e.g., B.1.1.7) and other similar things: capital letters and a period, optionally a second group of capital letters and a period, then {CAP_NUM_REST}. Must filter out first p.500, No.17, etc. */
782+ CAP_NUM_REST = [ 0- 9] +( \. [ 0- 9] +)* [ A- Za- z] *
783+ CAP_NUM = [ A- Z] + \. ( [ A- Z] + \. )? {CAP_NUM_REST}
784+
778785/* Smileys (based on Chris Potts' sentiment tutorial, but much more restricted set - e.g., no "8)", "do:" or "):", too ambiguous) and simple Asian smileys */
779786SMILEY = [ <>] ? [ :;=][ \- o\* '] ? [ \(\) DPdpO\\ {@\|\[\] ]
780787ASIANSMILEY = [ \^ x=~<>] \.\[\^ x =~ <>]| [ \-\^ x=~<>'] _[ \-\^ x=~<>'] | \( [ \-\^ x=~<>'][ _.] ? [ \-\^ x=~<>'] \) | \( [ \^ x=~<>'] -[ \^ x=~<>'`] \) | ¯\\ _\( ツ\) _\/ ¯
781788
782-
783789/* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
784790/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
785791/* Math and other symbols that stand alone: °²× ∀; \u33A1 is m^2 in one char! */
786792/* Tibetan tsheg or tsek (U+0F0B) goes between syllables; words aren't space separated, so it may be a word or syllable marker; it indicates a possible line-break point. Treat as separate symbol. */
787793MISCSYMBOL = [ +%&~\^ |\\ ¦\u00A7 ¨\u00A9\u00AC\u00AE ¯\u00B0 - \u00B3\u00B4 - \u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600 - \u0603\u0606 - \u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703 - \u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u0F0B\u1FBD\u2016\u2017\u2020 - \u2025\u2030 - \u2038\u203B\u203C\u2043\u203E - \u2042\u2044\u2053\u207A - \u207F\u208A - \u208E\u2100 - \u214F\u2190 - \u21FF\u2200 - \u2BFF\u3001 - \u3006\u3008 - \u3020\u30FB\u33A1\uFF01 - \uFF0F\uFF1A - \uFF20\uFF3B - \uFF40\uFF5B - \uFF65\uFF65 ]
788794
789795PROG_LANGS = c[ +][ +] |( c| f) #
796+
797+ ONECHAR_APOS = [ '\u0092\u2019 ´`\u0091\u2018\u201B ]
798+ /* Assimilations5 leave 5 chars behind after division */
799+ ASSIMILATIONS5 = {ONECHAR_APOS} tain{ONECHAR_APOS} t| t{ONECHAR_APOS} ain{ONECHAR_APOS} t
790800/* Assimilations3 leave 3 chars behind after division */
791801ASSIMILATIONS3 = cannot| 'twas| dunno| [ '’] d[ '’] ve
802+ /* Assimilations2 leave 2 chars behind after division */
792803/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
793804/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
794805ASSIMILATIONS2 = {APOS} tis| gonna| gotta| lemme| gimme| wanna| nno| aint| dont| doesnt| didnt| theyre
@@ -806,6 +817,14 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
806817 if (DEBUG ) { logger. info(" Used {PROG_LANGS} to recognize " + tok); }
807818 return getNext(tok, tok);
808819 }
820+ {ASSIMILATIONS5} { if (splitAssimilations) {
821+ yypushback(5 );
822+ }
823+ String tok = yytext();
824+ if (DEBUG ) { logger. info(" Used {ASSIMILATIONS5} to recognize " + tok +
825+ " ; splitAssimilations=" + splitAssimilations); }
826+ return getNext(tok, tok);
827+ }
809828{ASSIMILATIONS3} { if (splitAssimilations) {
810829 yypushback(3 );
811830 }
@@ -860,6 +879,29 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
860879 if (DEBUG ) { logger. info(" Used {SPPUNC} to recognize " + tok); }
861880 return getNext(tok, tok);
862881 }
882+
883+ /* Allow for two {REDAUX} like I'd've or they'd've */
884+ {WORD} / {REDAUX}{REDAUX} { final String origTxt = yytext();
885+ String tok = LexerUtils . removeSoftHyphens(origTxt);
886+ if (americanize) {
887+ tok = Americanize . americanize(tok);
888+ }
889+ if (DEBUG ) { logger. info(" Used {WORD} (4) to recognize " + origTxt + " as " + tok); }
890+ return getNext(tok, origTxt);
891+ }
892+ {APOWORD} / {REDAUX}{REDAUX} { String tok = yytext();
893+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
894+ if (DEBUG ) { logger. info(" Used {APOWORD} (2) to recognize " + tok + " as " + norm +
895+ " ; quoteStyle=" + quoteStyle + " ; probablyLeft=" + false ); }
896+ return getNext(norm, tok);
897+ }
898+ {WORD_NOT} / {REDAUX_NOT}{REDAUX} { final String origTxt = yytext();
899+ String tok = LexerUtils . removeSoftHyphens(origTxt);
900+ if (DEBUG ) { logger. info(" Used {WORD_NOT} (2) to recognize " + origTxt + " as " + tok); }
901+ return getNext(tok, origTxt);
902+ }
903+
904+
863905{WORD} / {REDAUX} { final String origTxt = yytext();
864906 String tok = LexerUtils . removeSoftHyphens(origTxt);
865907 if (americanize) {
@@ -868,11 +910,21 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
868910 if (DEBUG ) { logger. info(" Used {WORD} to recognize " + origTxt + " as " + tok); }
869911 return getNext(tok, origTxt);
870912 }
871- {SWORD } / {SREDAUX} { final String origTxt = yytext();
913+ {WORD_NOT } / {REDAUX_NOT} { final String origTxt = yytext();
872914 String tok = LexerUtils . removeSoftHyphens(origTxt);
873- if (DEBUG ) { logger. info(" Used {SWORD } to recognize " + origTxt + " as " + tok); }
915+ if (DEBUG ) { logger. info(" Used {WORD_NOT } to recognize " + origTxt + " as " + tok); }
874916 return getNext(tok, origTxt);
875917 }
918+ {APOWORD} / {REDAUX} { String tok = yytext(); // an {APOWORD} followed (as lookahead only) by a single {REDAUX}
919+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
920+ if (DEBUG ) { logger. info(" Used {APOWORD} (3) to recognize " + tok + " as " + norm +
921+ " ; quoteStyle=" + quoteStyle + " ; probablyLeft=" + false ); }
922+ return getNext(norm, tok);
923+ }
924+ {APOWORD2} / {WORD} { String txt = yytext();
925+ if (DEBUG ) { logger. info(" Used {APOWORD2} to recognize " + txt); }
926+ return getNext(txt, txt);
927+ }
876928{DIGIT} +/ {SEP_SUFFIX} { String txt = yytext();
877929 if (DEBUG ) { logger. info(" Used {DIGIT}/{SEP_SUFFIX} to recognize " + txt); }
878930 return getNext(txt, txt);
@@ -897,14 +949,11 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
897949 }
898950{APOWORD} { String tok = yytext();
899951 String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
952+ norm = LexerUtils . removeSoftHyphens(norm);
900953 if (DEBUG ) { logger. info(" Used {APOWORD} to recognize " + tok + " as " + norm +
901- " ; probablyLeft=" + false ); }
954+ " ; quoteStyle= " + quoteStyle + " ; probablyLeft=" + false ); }
902955 return getNext(norm, tok);
903956 }
904- {APOWORD2} /[:letter:] { String txt = yytext();
905- if (DEBUG ) { logger. info(" Used {APOWORD2} to recognize " + txt); }
906- return getNext(txt, txt);
907- }
908957{FULLURL} { String txt = yytext();
909958 String norm = txt;
910959 if (escapeForwardSlashAsterisk) {
@@ -934,13 +983,13 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
934983{REDAUX} / [^\p{Latin} '’] { String tok = yytext();
935984 String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
936985 if (DEBUG ) { logger. info(" Used {REDAUX} to recognize " + tok + " as " + norm +
937- " ; probablyLeft=" + false ); }
986+ " ; quoteStyle= " + quoteStyle + " ; probablyLeft=" + false ); }
938987 return getNext(norm, tok);
939988 }
940- {SREDAUX } / [^\p{Latin} '’] { String tok = yytext();
989+ {REDAUX_NOT } / [^\p{Latin} '’] { String tok = yytext();
941990 String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
942- if (DEBUG ) { logger. info(" Used {SREDAUX } to recognize " + tok + " as " + norm +
943- " ; probablyLeft =" + false ); }
991+ if (DEBUG ) { logger. info(" Used {REDAUX_NOT } to recognize " + tok + " as " + norm +
992+ " ; quoteStyle =" + quoteStyle ); }
944993 return getNext(norm, tok);
945994 }
946995{DATE} { String origTxt = yytext();
@@ -1175,7 +1224,7 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
11751224 }
11761225{DBLQUOT} { String tok = yytext();
11771226 String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
1178- if (DEBUG ) { logger. info(" Used {SREDAUX } to recognize " + tok + " as " + norm +
1227+ if (DEBUG ) { logger. info(" Used {DBLQUOT } to recognize " + tok + " as " + norm +
11791228 " ; probablyLeft=" + false ); }
11801229 return getNext(norm, tok);
11811230 }
@@ -1185,6 +1234,18 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
11851234 if (DEBUG ) { logger. info(" Used {SMILEY} to recognize " + origText + " as " + txt); }
11861235 return getNext(txt, origText);
11871236 }
1237+
1238+ /* This rule doesn't seem to fire to block {CAP_NUM} when it could. I have no idea why. Ignoring for now as a rare case. */
1239+ {ABBREV3} / {CAP_NUM_REST} {
1240+ String txt = yytext();
1241+ if (DEBUG ) { logger. info(" Used {ABBREV3} (2) to recognize " + txt); }
1242+ return getNext(txt, txt);
1243+ }
1244+ {CAP_NUM} {
1245+ String txt = yytext();
1246+ if (DEBUG ) { logger. info(" Used {CAP_NUM} to recognize " + txt); }
1247+ return getNext(txt, txt);
1248+ }
11881249{ASIANSMILEY} { String txt = yytext();
11891250 String origText = txt;
11901251 txt = LexerUtils . pennNormalizeParens(txt, normalizeParentheses);
@@ -1457,7 +1518,7 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
14571518 " ; probablyLeft=" + false ); }
14581519 return getNext(norm, tok);
14591520 }
1460- /* This QUOTES must proceed (S) REDAUX (2) so it by preference matches straight quote before word.
1521+ /* This QUOTES must precede REDAUX (2) so it by preference matches straight quote before word.
14611522 Trying to collapse the first two cases seemed to break things (?!?). */
14621523{QUOTES} /[:letter:] {NOT_SPACENL_ONE_CHAR}
14631524 { // Extra context is to not match on ones like 'd but you do want words like "a"
@@ -1485,17 +1546,12 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
14851546 " ; probablyLeft=" + false ); }
14861547 return getNext(norm, tok);
14871548 }
1488- /* These (S) REDAUX (2) cases are needed in case string ends on "it's". See: testJacobEisensteinApostropheCase */
1549+ /* This REDAUX (2) case is needed in case string ends on "it's". See: testJacobEisensteinApostropheCase */
14891550{REDAUX} { String tok = yytext();
14901551 if (DEBUG ) { logger. info(" Used {REDAUX} (2) to recognize " + tok); }
14911552 return getNext(tok, tok);
14921553 }
1493- {SREDAUX} { String tok = yytext();
1494- String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
1495- if (DEBUG ) { logger. info(" Used {SREDAUX} (2) to recognize " + tok + " as " + norm +
1496- " ; probablyLeft=" + false ); }
1497- return getNext(norm, tok);
1498- }
1554+ /* Plain {REDAUX_NOT} is captured by {APOWORD} */
14991555
15001556{FAKEDUCKFEET} {
15011557 String tok = yytext();
0 commit comments