Skip to content

Commit c6699b5

Browse files
committed
updated DomainTrie to handle overlapping domains in the list
1 parent 3f002ca commit c6699b5

File tree

2 files changed

+23
-9
lines changed

2 files changed

+23
-9
lines changed

ml/utils/domain_trie.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -51,13 +51,17 @@ def _add_node(domain_pieces, tree, idx=0):
5151
if idx >= len(domain_pieces):
5252
tree['.'] = None
5353
return
54-
# stop at "." because it is only building topper levels
54+
part = domain_pieces[idx]
55+
node = tree.get(part)
56+
if node is None:
57+
tree[part] = {} # adding a new node
58+
# NOTE: it is possible to have both "a.b.com" and "b.com" in the list,
59+
# and the order in domains list is not guaranteed.
5560
if "." not in tree:
56-
part = domain_pieces[idx]
57-
node = tree.get(part, {})
58-
if node == {}:
59-
tree[part] = node # adding a new node
60-
DomainTrie._add_node(domain_pieces, node, idx + 1)
61+
DomainTrie._add_node(domain_pieces, tree[part], idx + 1)
62+
else:
63+
# only need to keep higher level domains
64+
del tree[part]
6165

6266
@staticmethod
6367
def _build_tree(domains):
@@ -68,7 +72,7 @@ def _build_tree(domains):
6872
if len(domain_pieces) > 1:
6973
DomainTrie._add_node(domain_pieces[::-1], tree)
7074
else:
71-
LOGGER.warn('Invalid domain: "%s"', domain)
75+
LOGGER.warning('Invalid domain: "%s"', domain)
7276
return tree
7377

7478
@staticmethod

tests/test_utils_domain_trie.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,15 @@ def test_domain_trie(self):
1414
'xn--2342390.cn',
1515
'google.com',
1616
'test.google.com',
17+
'abc.def.ghi.xyz.com',
18+
'xyz.com',
19+
'example.stuff'
20+
]
21+
expected = [
22+
'132.22.21.47',
23+
'xn--2342390.cn',
24+
'google.com',
25+
'xyz.com',
1726
'example.stuff'
1827
]
1928

@@ -26,16 +35,17 @@ def test_domain_trie(self):
2635
self.assertIn('goo.example.stuff', trie)
2736
self.assertIn('google.com', trie)
2837
self.assertIn('test.google.com', trie)
38+
self.assertIn('test.xyz.com', trie)
2939

3040
self.assertNotIn('133.22.21.47', trie)
3141
self.assertNotIn('googl.com', trie)
3242
self.assertNotIn('google.api.com', trie)
3343
self.assertNotIn('google.cn', trie)
44+
self.assertNotIn('xyz.ghi.com', trie)
3445
self.assertNotIn('stuff', trie)
3546

36-
domains.remove('test.google.com')
3747
result = ast.literal_eval(str(trie))
38-
self.assertEqual(result, domains)
48+
self.assertEqual(result, expected)
3949

4050
def test_domain_trie_empty(self):
4151
domains = [

0 commit comments

Comments
 (0)