Accept a single column name (str) in drop/drop_duplicates/groupby

waitingkuo · wenleix · commit a700ae1da664 · 2022-07-11T11:13:10.000-07:00
Also update _check_columns to raise TypeError when given a bare str, so that the columns it receives are always an iterable of column names rather than a single string
diff --git a/torcharrow/idataframe.py b/torcharrow/idataframe.py
@@ -425,7 +425,7 @@ def any(self):
     # column alnternating
     @trace
     @expression
-    def drop(self, columns: List[str]):
+    def drop(self, columns: Union[str, List[str]]):
         """
         Returns DataFrame without the removed columns.
         """
diff --git a/torcharrow/test/test_dataframe.py b/torcharrow/test/test_dataframe.py
@@ -833,6 +833,24 @@ def base_test_describe_dataframe(self):
             ],
         )
 
+    def base_test_drop_by_str_as_columns(self):
+        df = ta.dataframe(device=self.device)
+        df["aa"] = [1, 2, 3]
+        df["ab"] = [11, 22, 33]
+        df["ac"] = [111, 222, 333]
+        self.assertEqual(list(df.drop("aa")), [(11, 111), (22, 222), (33, 333)])
+        self.assertEqual(list(df.drop("ab")), [(1, 111), (2, 222), (3, 333)])
+        self.assertEqual(list(df.drop("ac")), [(1, 11), (2, 22), (3, 33)])
+
+    def base_test_drop_by_list_of_str_as_columns(self):
+        df = ta.dataframe(device=self.device)
+        df["aa"] = [1, 2, 3]
+        df["ab"] = [11, 22, 33]
+        df["ac"] = [111, 222, 333]
+        self.assertEqual(list(df.drop(["aa", "ab"])), [(111,), (222,), (333,)])
+        self.assertEqual(list(df.drop(["aa", "ac"])), [(11,), (22,), (33,)])
+        self.assertEqual(list(df.drop(["ab", "ac"])), [(1,), (2,), (3,)])
+
     def base_test_drop_keep_rename_reorder_pipe(self):
         df = ta.dataframe(device=self.device)
         df["a"] = [1, 2, 3]
@@ -895,6 +913,21 @@ def base_test_locals_and_me_equivalence(self):
         )
         self.assertEqual(list(df.select("*", d=me["a"] + me["b"])), list(gf))
 
+
+    def base_test_groupby_str(self):
+        df = ta.dataframe(
+            {"a": [1, 1, 2], "b": [1, 2, 3], "c": [2, 2, 1]}, device=self.device
+        )
+        self.assertEqual(list(df.groupby("a").size), [(1, 2), (2, 1)])
+
+
+    def base_test_groupby_list_of_str(self):
+        df = ta.dataframe(
+            {"a": [1, 1, 2], "b": [1, 2, 3], "c": [2, 2, 1]}, device=self.device
+        )
+        self.assertEqual(list(df.groupby(["a"]).size), [(1, 2), (2, 1)])
+
+
     def base_test_groupby_size_pipe(self):
         df = ta.dataframe(
             {"a": [1, 1, 2], "b": [1, 2, 3], "c": [2, 2, 1]}, device=self.device
diff --git a/torcharrow/test/test_dataframe_cpu.py b/torcharrow/test/test_dataframe_cpu.py
@@ -64,6 +64,12 @@ def test_isin2(self):
     def test_describe_dataframe(self):
         return self.base_test_describe_dataframe()
 
+    def test_drop_by_str_as_columns(self):
+        return self.base_test_drop_by_str_as_columns()
+
+    def test_drop_by_list_of_str_as_columns(self):
+        return self.base_test_drop_by_list_of_str_as_columns()
+
     def test_drop_keep_rename_reorder_pipe(self):
         return self.base_test_drop_keep_rename_reorder_pipe()
 
@@ -73,6 +79,12 @@ def test_me_on_str(self):
     def test_locals_and_me_equivalence(self):
         return self.base_test_locals_and_me_equivalence()
 
+    def test_groupby_str(self):
+        return self.base_test_groupby_str()
+
+    def test_groupby_list_of_str(self):
+        return self.base_test_groupby_list_of_str()
+
     def test_groupby_size_pipe(self):
         return self.base_test_groupby_size_pipe()
 
diff --git a/torcharrow/velox_rt/dataframe_cpu.py b/torcharrow/velox_rt/dataframe_cpu.py
@@ -308,6 +308,9 @@ def append(self, values: Iterable[Union[None, dict, tuple]]):
             return self
 
     def _check_columns(self, columns: Iterable[str]):
+        if isinstance(columns, str):
+            raise TypeError(f"columns should be Iterable of str but not str")
+
         valid_names = {f.name for f in self.dtype.fields}
         for n in columns:
             if n not in valid_names:
@@ -1597,11 +1600,13 @@ def drop_null(self, how="any"):
     @expression
     def drop_duplicates(
         self,
-        subset: Optional[List[str]] = None,
+        subset: Optional[Union[str, List[str]]] = None,
         keep="first",
     ):
         self._prototype_support_warning("drop_duplicates")
 
+        if isinstance(subset, str):
+            subset = [subset]
         columns = subset if subset is not None else self.columns
         self._check_columns(columns)
 
@@ -1857,7 +1862,9 @@ def describe(
 
     @trace
     @expression
-    def drop(self, columns: List[str]):
+    def drop(self, columns: Union[str, List[str]]):
+        if isinstance(columns, str):
+            columns = [columns]
         self._check_columns(columns)
         return self._fromdata(
             {
@@ -2105,7 +2112,7 @@ def pipe(self, func, *args, **kwargs):
     @expression
     def groupby(
         self,
-        by: List[str],
+        by: Union[str, List[str]],
         sort=False,
         drop_null=True,
     ):
@@ -2181,6 +2188,8 @@ def groupby(
         # TODO implement
         assert not sort
         assert drop_null
+        if isinstance(by, str):
+            by = [by]
         self._check_columns(by)
 
         key_columns = by