|
1 | | -import pytest |
| 1 | +from unittest.mock import MagicMock |
| 2 | +import os |
| 3 | +import sys |
| 4 | + |
| 5 | +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'script')) |
| 6 | +from is_table_sorted import are_parquet_file_row_groups_sorted |
| 7 | + |
| 8 | + |
| 9 | +def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple]): |
| 10 | + """ |
| 11 | + Helper to create a mock ParquetFile with specified row group statistics. |
| 12 | +
|
| 13 | + Args: |
| 14 | + column_name: Name of the column to sort by |
| 15 | + row_groups_stats: List of (min, max) tuples for each row group |
| 16 | + """ |
| 17 | + mock_pf = MagicMock() |
| 18 | + mock_pf.schema.names = [column_name, 'data'] |
| 19 | + mock_pf.num_row_groups = len(row_groups_stats) |
| 20 | + |
| 21 | + mock_row_groups = [] |
| 22 | + for min_val, max_val in row_groups_stats: |
| 23 | + mock_row_group = MagicMock() |
| 24 | + mock_column = MagicMock() |
| 25 | + mock_column.statistics.min = min_val |
| 26 | + mock_column.statistics.max = max_val |
| 27 | + mock_row_group.column.return_value = mock_column |
| 28 | + mock_row_groups.append(mock_row_group) |
| 29 | + |
| 30 | + mock_pf.metadata.row_group.side_effect = lambda i: mock_row_groups[i] |
| 31 | + return mock_pf |
| 32 | + |
| 33 | + |
| 34 | +# Tests for sorted row groups |
| 35 | + |
def test_single_row_group_sorted():
    """A file with exactly one row group is reported as sorted.

    There is no neighbouring group to compare against, so the check is
    expected to succeed and to return non-None min and max values.
    """
    stats = [('com,example)/page1', 'com,example)/page3')]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 48 | + |
| 49 | + |
def test_multiple_row_groups_strictly_increasing():
    """Three row groups whose ranges strictly ascend are reported as sorted."""
    stats = [
        ('com,aaa)/', 'com,bbb)/'),
        ('com,ccc)/', 'com,ddd)/'),
        ('com,eee)/', 'com,fff)/'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 66 | + |
| 67 | + |
def test_boundary_case_adjacent_values():
    """Row groups with adjacent, non-overlapping ranges are reported as sorted.

    In Python string ordering ')' sorts before ',', so the first group's max
    'com,example)/z' is below the second group's min 'com,example,aaa)/'.
    """
    stats = [
        ('com,example)/a', 'com,example)/z'),
        ('com,example,aaa)/', 'com,example,zzz)/'),
    ]
    parquet_file = _create_mock_parquet_file('url', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 83 | + |
| 84 | + |
def test_two_row_groups_strictly_increasing_strings():
    """Two row groups with strictly ascending string ranges are sorted."""
    stats = [
        ('com,apple)/', 'com,banana)/'),
        ('com,cherry)/', 'com,date)/'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 100 | + |
| 101 | + |
def test_many_row_groups_strictly_increasing():
    """Five row groups, each starting above the previous max, are sorted."""
    stats = [
        ('com,aaa)/', 'com,aaa,zzz)/'),
        ('com,bbb)/', 'com,bbb,zzz)/'),
        ('com,ccc)/', 'com,ccc,zzz)/'),
        ('com,ddd)/', 'com,ddd,zzz)/'),
        ('com,eee)/', 'com,eee,zzz)/'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 118 | + |
| 119 | + |
| 120 | +# Tests for non-sorted row groups |
| 121 | + |
def test_two_row_groups_overlapping():
    """Overlapping ranges are rejected: the second min 'b' is below the first max 'd'.

    On failure the check is expected to return None for both min and max.
    """
    stats = [
        ('a', 'd'),
        ('b', 'e'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is False
    assert lowest is None
    assert highest is None
| 137 | + |
| 138 | + |
def test_row_groups_completely_out_of_order():
    """Row groups listed in descending order are reported as not sorted."""
    stats = [
        ('z', 'zz'),
        ('a', 'b'),  # lies entirely before the first group
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is False
    assert lowest is None
    assert highest is None
| 154 | + |
| 155 | + |
def test_multiple_row_groups_with_middle_unsorted():
    """A violation after an initially valid pair is still detected."""
    stats = [
        ('a', 'b'),
        ('z', 'zz'),  # still in order relative to the first group
        ('c', 'd'),   # min 'c' is below the previous max 'zz'
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is False
    assert lowest is None
    assert highest is None
| 172 | + |
| 173 | + |
def test_row_groups_equal_boundary_allowed():
    """A row group may start exactly at the previous group's max (>=, not >)."""
    stats = [
        ('a', 'b'),
        ('b', 'c'),  # min equals the previous max, which is accepted
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is True
    assert lowest is not None
    assert highest is not None
| 189 | + |
| 190 | + |
def test_slight_overlap_in_middle():
    """An overlap buried in the middle of several row groups is detected."""
    stats = [
        ('a', 'az'),
        ('b', 'bz'),
        ('c', 'cz'),
        ('ba', 'baz'),  # min 'ba' is below the previous max 'cz'
        ('d', 'dz'),
    ]
    parquet_file = _create_mock_parquet_file('url_surtkey', stats)

    result = are_parquet_file_row_groups_sorted(parquet_file, 'url_surtkey')
    sorted_flag, lowest, highest = result

    assert sorted_flag is False
    assert lowest is None
    assert highest is None
2 | 209 |
|
3 | 210 |
|
0 commit comments