Skip to content

Commit 0b229d9

Browse files
committed
wip tests
1 parent 4d4038c commit 0b229d9

File tree

2 files changed

+210
-3
lines changed

2 files changed

+210
-3
lines changed

src/script/is_table_sorted.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ def are_parquet_file_row_groups_sorted(pf: pq.ParquetFile, column_name: str) ->
2525
# internally unsorted
2626
print(f"row group {row_group_index} is not sorted on {column_name}: '{column.statistics.min}' <= '{prev_max}' ; stopping")
2727
return False, None, None
28-
whole_min = column.statistics.min if whole_min is None else column.statistics.min
29-
whole_max = column.statistics.max if whole_max is None else column.statistics.max
28+
whole_min = column.statistics.min if whole_min is None else min(column.statistics.min, whole_min)
29+
whole_max = column.statistics.max if whole_max is None else max(column.statistics.max, whole_max)
3030
return True, whole_min, whole_max
3131

3232

src/test/test_is_table_sorted.py

Lines changed: 208 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,210 @@
1-
import pytest
1+
from unittest.mock import MagicMock
2+
import os
3+
import sys
4+
5+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'script'))
6+
from is_table_sorted import are_parquet_file_row_groups_sorted
7+
8+
9+
def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple]):
10+
"""
11+
Helper to create a mock ParquetFile with specified row group statistics.
12+
13+
Args:
14+
column_name: Name of the column to sort by
15+
row_groups_stats: List of (min, max) tuples for each row group
16+
"""
17+
mock_pf = MagicMock()
18+
mock_pf.schema.names = [column_name, 'data']
19+
mock_pf.num_row_groups = len(row_groups_stats)
20+
21+
mock_row_groups = []
22+
for min_val, max_val in row_groups_stats:
23+
mock_row_group = MagicMock()
24+
mock_column = MagicMock()
25+
mock_column.statistics.min = min_val
26+
mock_column.statistics.max = max_val
27+
mock_row_group.column.return_value = mock_column
28+
mock_row_groups.append(mock_row_group)
29+
30+
mock_pf.metadata.row_group.side_effect = lambda i: mock_row_groups[i]
31+
return mock_pf
32+
33+
34+
# Tests for sorted row groups
35+
36+
def test_single_row_group_sorted():
    """A file with a single row group is sorted and reports that group's bounds."""
    mock_pf = _create_mock_parquet_file(
        'url_surtkey',
        [('com,example)/page1', 'com,example)/page3')],
    )

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')

    assert is_sorted is True
    # Pin the exact bounds; "is not None" would accept any value at all.
    assert min_val == 'com,example)/page1'
    assert max_val == 'com,example)/page3'
48+
49+
50+
def test_multiple_row_groups_strictly_increasing():
    """Three strictly increasing row groups are sorted; bounds span the whole file."""
    mock_pf = _create_mock_parquet_file(
        'url_surtkey',
        [
            ('com,aaa)/', 'com,bbb)/'),
            ('com,ccc)/', 'com,ddd)/'),
            ('com,eee)/', 'com,fff)/'),
        ],
    )

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')

    assert is_sorted is True
    # The overall minimum must come from the first group, not the last one
    # visited, so assert exact values instead of mere non-None-ness.
    assert min_val == 'com,aaa)/'
    assert max_val == 'com,fff)/'
66+
67+
68+
def test_boundary_case_adjacent_values():
    """Adjacent, non-overlapping row groups are sorted; bounds span both groups."""
    mock_pf = _create_mock_parquet_file(
        'url',
        [
            # ')' sorts before ',' so the first group's max is below the second's min.
            ('com,example)/a', 'com,example)/z'),
            ('com,example,aaa)/', 'com,example,zzz)/'),
        ],
    )

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url')

    assert is_sorted is True
    # Exact bounds: min of the first group, max of the second.
    assert min_val == 'com,example)/a'
    assert max_val == 'com,example,zzz)/'
83+
84+
85+
def test_two_row_groups_strictly_increasing_strings():
    """Two row groups with strictly increasing string values are sorted."""
    mock_pf = _create_mock_parquet_file(
        'url_surtkey',
        [
            ('com,apple)/', 'com,banana)/'),
            ('com,cherry)/', 'com,date)/'),
        ],
    )

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')

    assert is_sorted is True
    # Exact bounds: the min must still be the first group's min after the
    # second group has been processed.
    assert min_val == 'com,apple)/'
    assert max_val == 'com,date)/'
100+
101+
102+
def test_many_row_groups_strictly_increasing():
    """Five strictly increasing row groups are sorted; bounds span the whole file."""
    row_groups = [
        ('com,aaa)/', 'com,aaa,zzz)/'),
        ('com,bbb)/', 'com,bbb,zzz)/'),
        ('com,ccc)/', 'com,ccc,zzz)/'),
        ('com,ddd)/', 'com,ddd,zzz)/'),
        ('com,eee)/', 'com,eee,zzz)/'),
    ]
    mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')

    assert is_sorted is True
    # Derive the expected bounds from the fixture so the assertions stay in
    # step with it: first group's min, last group's max.
    assert min_val == row_groups[0][0]
    assert max_val == row_groups[-1][1]
118+
119+
120+
# Tests for non-sorted row groups
121+
122+
def test_two_row_groups_overlapping():
    """The second group's min ('b') is below the first group's max ('d'): not sorted."""
    overlapping_stats = [
        ('a', 'd'),
        ('b', 'e'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', overlapping_stats)

    outcome = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = outcome

    # An unsorted file reports no bounds.
    assert sorted_flag is False
    assert lowest is None
    assert highest is None
137+
138+
139+
def test_row_groups_completely_out_of_order():
    """Row groups supplied in descending order are reported as not sorted."""
    descending_stats = [
        ('z', 'zz'),
        ('a', 'b'),  # lies entirely before the first group
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', descending_stats)

    outcome = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = outcome

    # An unsorted file reports no bounds.
    assert sorted_flag is False
    assert lowest is None
    assert highest is None
154+
155+
156+
def test_multiple_row_groups_with_middle_unsorted():
    """Three row groups where the third drops back below the second: not sorted."""
    stats = [
        ('a', 'b'),
        ('z', 'zz'),  # still in order relative to the first group
        ('c', 'd'),   # min 'c' is below the previous group's max 'zz'
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    outcome = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = outcome

    # An unsorted file reports no bounds.
    assert sorted_flag is False
    assert lowest is None
    assert highest is None
172+
173+
174+
def test_row_groups_equal_boundary_allowed():
    """A second group whose min equals the first group's max still counts as sorted."""
    mock_pf = _create_mock_parquet_file(
        'url_surtkey',
        [
            ('a', 'b'),
            ('b', 'c'),  # min equals the previous max - this is allowed
        ],
    )

    is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')

    assert is_sorted is True
    # Exact bounds across both groups, rather than only "not None".
    assert min_val == 'a'
    assert max_val == 'c'
189+
190+
191+
def test_slight_overlap_in_middle():
    """One out-of-place group partway through a longer run is detected as not sorted."""
    stats = [
        ('a', 'az'),
        ('b', 'bz'),
        ('c', 'cz'),
        ('ba', 'baz'),  # 'ba' sorts before 'c', so this group overlaps what came before
        ('d', 'dz'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    outcome = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = outcome

    # An unsorted file reports no bounds.
    assert sorted_flag is False
    assert lowest is None
    assert highest is None
2209

3210

0 commit comments

Comments
 (0)