@@ -769,7 +769,8 @@ HYPHEN = [-\u058A\u2010\u2011\u2012]
769769HYPHENS = {HYPHEN} +
770770SSN = [ 0- 9] {3} {HYPHEN} [ 0- 9] {2} {HYPHEN} [ 0- 9] {4}
771771/* Phone numbers. Keep the multi-dot pattern separate so that it is not confused with decimal numbers. Also matches bare local numbers such as 346-8792 for the new treebank tokenization; the first digit can't be 0 or 1 in NANP. */
772- PHONE = ( \( [ 0- 9] {2,3} \) [ \u00A0\u2007 ] ?|( \+\+ ?)?( [ 0- 9] {1,4} [ \- \u00A0\u2007\u2012 ] )? [ 0- 9] {2,4} [ \- \u00A0\u2007\u2012 /] ) [ 0- 9] {3,4} [ \- \u00A0\u2007\u2012 ] ? [ 0- 9] {3,5}|(( \+\+ ?)? [ 0- 9] {1,4} \. )? [ 0- 9] {2,4} \. [ 0- 9] {3,4} \. [ 0- 9] {3,5}| [ 2- 9][ 0- 9] {2} [- \u2012 ][ 0- 9] {4}
772+ /* 2022: Also allow a hyphen between the area code and the number; allow French numbers like 47-42-17-11. */
/* PHONE alternatives, in order:
   (1) either a parenthesized 2-3 digit code with an optional separator, or an optional + / ++ prefix, an optional 1-4 digit group and a 2-4 digit group (each followed by a separator); then 3-4 digits, an optional separator and 3-5 digits;
   (2) dot-separated groups: optional prefixed 1-4 digit group, then 2-4, 2-4 and 2-5 digits;
   (3) hyphen-separated groups with the same shape as (2);
   (4) three digits starting with 2-9, a separator, then four digits.
   NOTE(review): alternative (3) also matches SSN-shaped strings (3-2-4 digits) and date-shaped strings such as 2022-01-15 (4-2-2 digits) - confirm that rule ordering elsewhere in the file gives the intended token type. */
773+ PHONE = ( \( [ 0- 9] {2,3} \) [- \u00A0\u2007 ] ?|( \+\+ ?)?( [ 0- 9] {1,4} [- \u00A0\u2007\u2012 ] )? [ 0- 9] {2,4} [- \u00A0\u2007\u2012 /] ) [ 0- 9] {3,4} [- \u00A0\u2007\u2012 ] ? [ 0- 9] {3,5}|(( \+\+ ?)? [ 0- 9] {1,4} \. )? [ 0- 9] {2,4} \. [ 0- 9] {2,4} \. [ 0- 9] {2,5}|(( \+\+ ?)? [ 0- 9] {1,4} -)? [ 0- 9] {2,4} -[ 0- 9] {2,4} -[ 0- 9] {2,5}| [ 2- 9][ 0- 9] {2} [- \u2012 ][ 0- 9] {4}
773774/* Fake duck feet appear sometimes in WSJ, and aren't likely to be SGML, less than, etc., so group. */
774775FAKEDUCKFEET = <<| >>
775776LESSTHAN = <| <
@@ -1356,8 +1357,8 @@ RM/{NUM} { String txt = yytext();
13561357 if (escapeForwardSlashAsterisk) {
13571358 String normTok = LexerUtils . escapeChar(yytext(), ' *' );
13581359 if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt + " as " + normTok); }
1359- return getNext(normTok, yytext()); }
1360- else {
1360+ return getNext(normTok, txt);
1361+ } else {
13611362 if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt); }
13621363 return getNext(txt, txt);
13631364 }
@@ -1382,20 +1383,55 @@ RM/{NUM} { String txt = yytext();
13821383 if (DEBUG ) { logger. info(" Used {=} to recognize " + txt); }
13831384 return getNext(txt, txt);
13841385 }
1385- \/ { if (escapeForwardSlashAsterisk) {
1386- return getNext(LexerUtils . escapeChar(yytext(), ' /' ), yytext()); }
1387- else {
1388- return getNext();
/* Rule for a lone forward slash. When escapeForwardSlashAsterisk is set, the matched text is passed through LexerUtils.escapeChar for the slash character and getNext receives (escaped text, raw text); otherwise getNext receives the raw text twice. Both branches log the decision when DEBUG is on. */
1386+ \/ {
1387+ String txt = yytext();
1388+ if (escapeForwardSlashAsterisk) {
/* yytext() is read again here; nothing between the two reads changes the match, so it is the same text already held in txt. */
1389+ String normTok = LexerUtils . escapeChar(yytext(), ' /' );
1390+ if (DEBUG ) { logger. info(" Used {/} to recognize " + txt + " as " + normTok); }
1391+ return getNext(normTok, txt);
1392+ } else {
1393+ if (DEBUG ) { logger. info(" Used {/} to recognize " + txt); }
1394+ return getNext(txt, txt);
13891395 }
13901396 }
13911397/* {HTHING}/[^\p{Alpha}\p{Digit}.+] { return getNext(LexerUtils.removeSoftHyphens(yytext()),
13921398 yytext()); } */
1393- {HTHINGEXCEPTIONWHOLE} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1394- {HTHINGEXCEPTIONWHOLE} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1395- {HTHINGEXCEPTIONPREFIXED} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1396- {HTHINGEXCEPTIONPREFIXED} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1397- {HTHINGEXCEPTIONSUFFIXED} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1398- {HTHINGEXCEPTIONSUFFIXED} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
/* {HTHINGEXCEPTIONWHOLE} match: the raw text is run through LexerUtils.removeSoftHyphens, and getNext receives (processed text, raw text). Logs both forms when DEBUG is on. */
1399+ {HTHINGEXCEPTIONWHOLE} {
1400+ String tok = yytext();
1401+ String norm = LexerUtils . removeSoftHyphens(tok);
1402+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONWHOLE} to recognize " + tok + " as " + norm); }
1403+ return getNext(norm, tok);
1404+ }
/* Variant of the {HTHINGEXCEPTIONWHOLE} rule that also consumes a trailing period, but only when {INSENTP} follows (JFlex trailing context: the lookahead is not part of yytext()). Handling is otherwise the same; the log message is tagged "(2)". */
1405+ {HTHINGEXCEPTIONWHOLE} \. / {INSENTP} {
1406+ String tok = yytext();
1407+ String norm = LexerUtils . removeSoftHyphens(tok);
1408+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONWHOLE} (2) to recognize " + tok + " as " + norm); }
1409+ return getNext(norm, tok);
1410+ }
/* {HTHINGEXCEPTIONPREFIXED} match: the raw text is run through LexerUtils.removeSoftHyphens, and getNext receives (processed text, raw text). Logs both forms when DEBUG is on. */
1411+ {HTHINGEXCEPTIONPREFIXED} {
1412+ String tok = yytext();
1413+ String norm = LexerUtils . removeSoftHyphens(tok);
1414+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONPREFIXED} to recognize " + tok + " as " + norm); }
1415+ return getNext(norm, tok);
1416+ }
/* Variant of the {HTHINGEXCEPTIONPREFIXED} rule that also consumes a trailing period, but only when {INSENTP} follows (JFlex trailing context: the lookahead is not part of yytext()). Handling is otherwise the same; the log message is tagged "(2)". */
1417+ {HTHINGEXCEPTIONPREFIXED} \. / {INSENTP} {
1418+ String tok = yytext();
1419+ String norm = LexerUtils . removeSoftHyphens(tok);
1420+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONPREFIXED} (2) to recognize " + tok + " as " + norm); }
1421+ return getNext(norm, tok);
1422+ }
/* {HTHINGEXCEPTIONSUFFIXED} match: the raw text is run through LexerUtils.removeSoftHyphens, and getNext receives (processed text, raw text). Logs both forms when DEBUG is on. */
1423+ {HTHINGEXCEPTIONSUFFIXED} {
1424+ String tok = yytext();
1425+ String norm = LexerUtils . removeSoftHyphens(tok);
1426+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONSUFFIXED} to recognize " + tok + " as " + norm); }
1427+ return getNext(norm, tok);
1428+ }
/* Variant of the {HTHINGEXCEPTIONSUFFIXED} rule that also consumes a trailing period, but only when {INSENTP} follows (JFlex trailing context: the lookahead is not part of yytext()). Handling is otherwise the same; the log message is tagged "(2)". */
1429+ {HTHINGEXCEPTIONSUFFIXED} \. / {INSENTP} {
1430+ String tok = yytext();
1431+ String norm = LexerUtils . removeSoftHyphens(tok);
1432+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONSUFFIXED} (2) to recognize " + tok + " as " + norm); }
1433+ return getNext(norm, tok);
1434+ }
13991435{HTHING} { String tok = yytext();
14001436 breakByHyphensSlashes(tok);
14011437 tok = yytext();
@@ -1492,7 +1528,11 @@ RM/{NUM} { String txt = yytext();
14921528 return getNext(norm, tok);
14931529 }
14941530
1495- {FAKEDUCKFEET} { return getNext(); }
/* {FAKEDUCKFEET} match: no normalization is applied; the raw text is passed to getNext as both arguments, with a trace logged when DEBUG is on. */
1531+ {FAKEDUCKFEET} {
1532+ String tok = yytext();
1533+ if (DEBUG ) { logger. info(" Used {FAKEDUCKFEET} to recognize " + tok); }
1534+ return getNext(tok, tok);
1535+ }
14961536{MISCSYMBOL} {
14971537 String tok = yytext();
14981538 if (DEBUG ) { logger. info(" Used {MISCSYMBOL} to recognize " + tok); }
0 commit comments