@@ -14,13 +14,22 @@ def ptt_df():
1414 df = pd .DataFrame (
1515 {
1616 "input" : [
17- "address: 123 Main St Anytown, NY 12345" ,
17+ "address: 123 Main St." ,
18+ "2255 140th Ave NE" ,
19+ "535 Bellevue Sq" ,
20+ "15220 SE 37th St" ,
1821 "anemail@address.com" ,
1922 "my phone is +1 309-404-7587" ,
2023 "credit card 4556205848969759" ,
24+ "credit card 3851-6256-0926-7271" ,
25+ "Visa Card Number: 4929 5423 7528 1067 \n Expiration Date: 03/24 \n CVV: 348" ,
26+ "622202049892743 - this is a credit card number" ,
2127 "my ssn is 856-45-6789" ,
28+ "ssn - 702-02-9921" ,
29+ "ssn is 702 02 9921" ,
30+ "702029921 (SSN)" ,
31+ "no patterns here." ,
2232 ],
23- "output" : ["a" , "b" , "c" , "d" , "e" ],
2433 }
2534 )
2635 return df
@@ -55,13 +64,16 @@ def test_ptt(ptt_df, user_defined_json):
5564 fi_input_list = result .view ().to_pandas ()[
5665 "udf/has_patterns:frequent_items/frequent_strings"
5766 ]["input" ]
58- fi_output_list = result .view ().to_pandas ()[
59- "udf/has_patterns:frequent_items/frequent_strings"
60- ]["output" ]
6167 if not user_defined_json :
62- group_names = {"phone number" , "email address" , "SSN" , "mailing address" , "credit card number" }
68+ group_names = {
69+ "" ,
70+ "credit card number" ,
71+ "email address" ,
72+ "SSN" ,
73+ "phone number" ,
74+ "mailing address" ,
75+ }
6376 else :
6477 group_names = {"custom_group" , "" }
6578
6679 assert set ([x .value for x in fi_input_list ]) == group_names
67- assert set ([x .value for x in fi_output_list ]) == {"" }
0 commit comments