Skip to content

Commit c3605ab

Browse files
authored
Merge pull request jaybaird#17 from whylabs/dev/felipe/patterns
improve patterns and update test
2 parents 3159f8b + 0d7ffc6 commit c3605ab

File tree

2 files changed

22 additions & 10 deletions (lines changed)

2 files changed

22 additions & 10 deletions (lines changed)

langkit/pattern_groups.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,13 +7,13 @@
77
},
88
{
99
"expressions": [
10-
"\\b(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\\d{3})\\d{11})\\b"
10+
"\\b(?:\\d[ -]*?){13,16}\\b"
1111
],
1212
"name": "credit card number"
1313
},
1414
{
1515
"expressions": [
16-
"\\b(?!000|.+0{4})(?:\\d{9}|\\d{3}-\\d{2}-\\d{4})\\b"
16+
"(?!(\\d){3}(-| |)\\1{2}\\2\\1{4})(?!666|000|9\\d{2})(\\b\\d{3}(-| |)(?!00)\\d{2}\\4(?!0{4})\\d{4}\\b)"
1717
],
1818
"name": "SSN"
1919
},
@@ -25,7 +25,7 @@
2525
},
2626
{
2727
"expressions": [
28-
"\\b\\d{1,8}\\b[\\s\\S]{10,100}?\\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VT|WA|WI|WV|WY)\\b\\s\\d{5}\\b"
28+
"\\b\\d+[ ](?:[A-Za-z0-9.-]+[ ]?)+(Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St|Sq)\\b"
2929
],
3030
"name": "mailing address"
3131
}

langkit/tests/test_patterns.py

Lines changed: 19 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,22 @@ def ptt_df():
1414
df = pd.DataFrame(
1515
{
1616
"input": [
17-
"address: 123 Main St Anytown, NY 12345",
17+
"address: 123 Main St.",
18+
"2255 140th Ave NE",
19+
"535 Bellevue Sq",
20+
"15220 SE 37th St",
1821
"anemail@address.com",
1922
"my phone is +1 309-404-7587",
2023
"credit card 4556205848969759",
24+
"credit card 3851-6256-0926-7271",
25+
"Visa Card Number: 4929 5423 7528 1067 \nExpiration Date: 03/24 \nCVV: 348",
26+
"622202049892743 - this is a credit card number",
2127
"my ssn is 856-45-6789",
28+
"ssn - 702-02-9921",
29+
"ssn is 702 02 9921",
30+
"702029921 (SSN)",
31+
"no patterns here.",
2232
],
23-
"output": ["a", "b", "c", "d", "e"],
2433
}
2534
)
2635
return df
@@ -55,13 +64,16 @@ def test_ptt(ptt_df, user_defined_json):
5564
fi_input_list = result.view().to_pandas()[
5665
"udf/has_patterns:frequent_items/frequent_strings"
5766
]["input"]
58-
fi_output_list = result.view().to_pandas()[
59-
"udf/has_patterns:frequent_items/frequent_strings"
60-
]["output"]
6167
if not user_defined_json:
62-
group_names = {"phone number", "email address", "SSN", "mailing address", "credit card number"}
68+
group_names = {
69+
"",
70+
"credit card number",
71+
"email address",
72+
"SSN",
73+
"phone number",
74+
"mailing address",
75+
}
6376
else:
6477
group_names = {"custom_group", ""}
6578

6679
assert set([x.value for x in fi_input_list]) == group_names
67-
assert set([x.value for x in fi_output_list]) == {""}

0 commit comments

Comments (0)