This repository was archived by the owner on Nov 1, 2024. It is now read-only.

Commit 0f1b40b

andrewaikens87 authored and facebook-github-bot committed
Improve icolumn.py cc 4/4 test_string_column (#414)
Summary: Pull Request resolved: #414

Improves test coverage for icolumn.py through test_string_column. icolumn.py currently has ~55% code coverage; this diff brings it to 78.3%.

Reviewed By: wenleix

Differential Revision: D37289878

fbshipit-source-id: d7de16030ec77cdd39c06976305a9162be375b86
1 parent 2ee2f5c commit 0f1b40b

5 files changed: 247 additions & 26 deletions
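The coverage figures quoted in the summary come from internal tooling; a rough local approximation with the open-source coverage.py package might look like the sketch below. The package, module path, and include pattern are assumptions for illustration, not part of this diff.

# Hedged sketch: approximate icolumn.py line coverage locally with coverage.py
# (assumes `pip install coverage` and a torcharrow development checkout).
import coverage
import unittest

cov = coverage.Coverage(include=["*/torcharrow/icolumn.py"])
cov.start()

# Load and run the CPU string-column tests added in this commit.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "torcharrow.test.test_string_column_cpu"
)
unittest.TextTestRunner(verbosity=1).run(suite)

cov.stop()
cov.report()  # prints per-file statement coverage, e.g. icolumn.py at ~78%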

torcharrow/icolumn.py

Lines changed: 9 additions & 7 deletions
@@ -270,7 +270,9 @@ def __len__(self):

    def __str__(self):
        item_padding = "'" if dt.is_string(self.dtype) else ""
-        return f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}], id = {self.id})"
+        return (
+            f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}])"
+        )

    def __repr__(self):
        item_padding = "'" if dt.is_string(self.dtype) else ""

@@ -705,7 +707,7 @@ def filter(
        dtype: boolean, length: 2, null_count: 0
        """
        if columns is not None:
-            raise TypeError(f"columns parameter for flat columns not supported")
+            raise TypeError("columns parameter for flat columns not supported")

        if not isinstance(predicate, ty.Iterable) and not callable(predicate):
            raise TypeError(

@@ -1006,8 +1008,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
        """
        self._prototype_support_warning("fill_null")

-        if not isinstance(fill_value, Column._scalar_types):
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
        if isinstance(fill_value, Column._scalar_types):
            res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
            for m, i in self._items():

@@ -1017,7 +1017,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
                res._append_value(fill_value)
            return res._finalize()
        else:
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
+            raise TypeError(
+                f"fill_null with {type(fill_value).__name__} is not supported"
+            )

    @trace
    @expression

@@ -1050,7 +1052,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None):

        if how is not None:
            # "any or "all" is only used for DataFrame
-            raise TypeError(f"how parameter for flat columns not supported")
+            raise TypeError("how parameter for flat columns not supported")

        if dt.is_primitive(self.dtype):
            res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))

@@ -1076,7 +1078,7 @@ def drop_duplicates(
        # TODO Add functionality for first and last
        assert keep == "first"
        if subset is not None:
-            raise TypeError(f"subset parameter for flat columns not supported")
+            raise TypeError("subset parameter for flat columns not supported")
        res = Scope._EmptyColumn(self._dtype)
        res._extend(list(OrderedDict.fromkeys(self)))
        return res._finalize()
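For quick reference, the user-visible effect of the icolumn.py changes above is sketched below; this assumes the default CPU device and `import torcharrow as ta`, and the expected strings simply mirror the updated test expectations later in this commit.

# Minimal sketch of the behavior change (assumes `import torcharrow as ta`
# and the default CPU device; mirrors base_test_str in the updated tests).
import torcharrow as ta

c = ta.column([0, 1, 2, 3, 4])

# Before this diff: str(c) also embedded the column id, e.g. "Column([0, 1, 2, 3, 4], id = <self.id>)"
# After this diff:  str(c) == "Column([0, 1, 2, 3, 4])"
print(str(c))

# The fill_null error path now names the offending type, e.g. passing bytes
# raises: TypeError: fill_null with bytes is not supported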

torcharrow/test/test_numerical_column.py

Lines changed: 1 addition & 2 deletions
@@ -839,9 +839,8 @@ def base_test_batch_collate(self):

    def base_test_str(self):
        c = ta.column(list(range(5)), device=self.device)
-        c.id = 123

-        expected = "Column([0, 1, 2, 3, 4], id = 123)"
+        expected = "Column([0, 1, 2, 3, 4])"
        self.assertEqual(expected, str(c))

    def base_test_repr(self):

torcharrow/test/test_string_column.py

Lines changed: 189 additions & 0 deletions
@@ -345,6 +345,195 @@ def base_test_regular_expressions(self):
            ],
        )

+    def base_test_is_unique(self):
+        unique_column = ta.column(
+            [f"test{x}" for x in range(3)],
+            device=self.device,
+        )
+
+        self.assertTrue(unique_column.is_unique)
+
+        non_unique_column = ta.column(
+            [
+                "test",
+                "test",
+            ],
+            device=self.device,
+        )
+
+        self.assertFalse(non_unique_column.is_unique)
+
+    def base_test_is_monotonic_increasing(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        self.assertTrue(c.is_monotonic_increasing)
+        self.assertFalse(c.is_monotonic_decreasing)
+
+    def base_test_is_monotonic_decreasing(self):
+        c = ta.column([f"test{x}" for x in range(5, 0, -1)], device=self.device)
+        self.assertFalse(c.is_monotonic_increasing)
+        self.assertTrue(c.is_monotonic_decreasing)
+
+    def base_test_if_else(self):
+        left_repr = ["a1", "a2", "a3", "a4"]
+        right_repr = ["b1", "b2", "b3", "b4"]
+        float_type = ta.column(
+            [1.22, 2.22, 3.22, 4.22], dtype=dt.float32, device=self.device
+        )
+        cond_repr = [True, False, True, False]
+        cond = ta.column(cond_repr, device=self.device)
+        left = ta.column(left_repr, device=self.device)
+        right = ta.column(right_repr, device=self.device)
+
+        # Ensure py-iterables work as intended
+        expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]]
+        result = ta.if_else(cond, left_repr, right_repr)
+        self.assertEqual(expected, list(result))
+
+        # Non common dtype
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="then and else branches must have compatible types, got.*and.*, respectively",
+        ):
+            ta.if_else(cond, left, float_type)
+
+        # Invalid condition input
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="condition must be a boolean vector",
+        ):
+            ta.if_else(
+                cond=left,
+                left=left,
+                right=right,
+            )
+
+    def base_test_str(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        expected = "Column(['test0', 'test1', 'test2', 'test3', 'test4'])"
+        self.assertEqual(expected, str(c))
+
+    def base_test_repr(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        expected = (
+            "0 'test0'\n"
+            "1 'test1'\n"
+            "2 'test2'\n"
+            "3 'test3'\n"
+            "4 'test4'\n"
+            f"dtype: string, length: 5, null_count: 0, device: {self.device}"
+        )
+        self.assertEqual(expected, repr(c))
+
+    def base_test_is_valid_at(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        self.assertTrue(all(c.is_valid_at(x) for x in range(5)))
+
+    def base_test_cast(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c_repr_after_cast = [0, 1, 2, 3, 4, None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.cast(dt.int64)
+        self.assertEqual(c_repr_after_cast, list(result))
+
+    def base_test_drop_null(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_null()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="how parameter for flat columns not supported",
+        ):
+            c.drop_null(how="any")
+
+    def base_test_drop_duplicates(self):
+        c_repr = ["test", "test2", "test3", "test"]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_duplicates()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        # TODO: Add functionality for last
+        with self.assertRaises(expected_exception=AssertionError):
+            c.drop_duplicates(keep="last")
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="subset parameter for flat columns not supported",
+        ):
+            c.drop_duplicates(subset=c_repr[:2])
+
+    def base_test_fill_null(self):
+        c_repr = ["0", "1", None, "3", "4", None]
+        expected_fill = "TEST"
+        expected_repr = ["0", "1", expected_fill, "3", "4", expected_fill]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.fill_null(expected_fill)
+
+        self.assertEqual(expected_repr, list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="fill_null with bytes is not supported",
+        ):
+            c.fill_null(expected_fill.encode())
+
+    def base_test_isin(self):
+        c_repr = [f"test{x}" for x in range(5)]
+        c = ta.column(c_repr, device=self.device)
+        self.assertTrue(all(c.isin(values=c_repr + ["test_123"])))
+        self.assertFalse(any(c.isin(values=["test5", "test6", "test7"])))
+
+    def base_test_bool(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        with self.assertRaisesRegex(
+            expected_exception=ValueError,
+            expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).",
+        ):
+            bool(c)
+
+    def base_test_flatmap(self):
+        c = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        expected_result = [
+            "test1",
+            "test1",
+            "test2",
+            "test2",
+            None,
+            None,
+            None,
+            None,
+            "test3",
+            "test3",
+        ]
+        result = c.flatmap(lambda xs: [xs, xs])
+        self.assertEqual(expected_result, list(result))
+
+    def base_test_any(self):
+        c_some = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(c_some.any())
+        self.assertFalse(c_none.any())
+
+    def base_test_all(self):
+        c_all = ta.column(["test", "test2", "test3"], device=self.device)
+        c_partial = ta.column(["test", "test2", None, None], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(c_all.all())
+        self.assertTrue(c_partial.all())
+        self.assertTrue(c_none.all())
+

if __name__ == "__main__":
    unittest.main()

torcharrow/test/test_string_column_cpu.py

Lines changed: 48 additions & 0 deletions
@@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self):
    def test_regular_expressions(self):
        self.base_test_regular_expressions()

+    def test_is_unique(self):
+        self.base_test_is_unique()
+
+    def test_is_monotonic_increasing(self):
+        self.base_test_is_monotonic_increasing()
+
+    def test_is_monotonic_decreasing(self):
+        self.base_test_is_monotonic_decreasing()
+
+    def test_if_else(self):
+        self.base_test_if_else()
+
+    def test_repr(self):
+        self.base_test_repr()
+
+    def test_str(self):
+        self.base_test_str()
+
+    def test_is_valid_at(self):
+        self.base_test_is_valid_at()
+
+    def test_cast(self):
+        self.base_test_cast()
+
+    def test_drop_null(self):
+        self.base_test_drop_null()
+
+    def test_drop_duplicates(self):
+        self.base_test_drop_duplicates()
+
+    def test_fill_null(self):
+        self.base_test_fill_null()
+
+    def test_isin(self):
+        self.base_test_isin()
+
+    def test_bool(self):
+        self.base_test_bool()
+
+    def test_flatmap(self):
+        self.base_test_flatmap()
+
+    def test_any(self):
+        self.base_test_any()
+
+    def test_all(self):
+        self.base_test_all()
+

if __name__ == "__main__":
    unittest.main()

torcharrow/velox_rt/string_column_cpu.py

Lines changed: 0 additions & 17 deletions
@@ -187,23 +187,6 @@ def __gt__(self, other):
    def __ge__(self, other):
        return self._checked_binary_op_call(other, "gte")

-    # printing ----------------------------------------------------------------
-
-    def __str__(self):
-        def quote(x):
-            return f"'{x}'"
-
-        return f"Column([{', '.join('None' if i is None else quote(i) for i in self)}])"
-
-    def __repr__(self):
-        tab = tabulate(
-            [["None" if i is None else f"'{i}'"] for i in self],
-            tablefmt="plain",
-            showindex=True,
-        )
-        typ = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu"
-        return tab + dt.NL + typ
-
    # interop
    def _to_tensor_default(self):
        # there are no string tensors, so we're using regular python list conversion