
Commit 25b0c6c

Nayef211 authored and facebook-github-bot committed

Create add_tokens UDF (#384)

Summary:

## Description
- Adding and registering a UDF for `add_tokens`, which can operate on lists of `int` and `str`.

# Testing
`pytest torcharrow/test/transformation/test_text_ops.py`

Pull Request resolved: #384
Reviewed By: wenleix, parmeet
Differential Revision: D37315924
Pulled By: Nayef211
fbshipit-source-id: 81ce4fa6b45ea0681e87e9807a3b2f49337bdd0d

1 parent 5a8d58f · commit 25b0c6c
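
For orientation, here is a minimal usage sketch of the new operator, put together from the docstring and tests added in this commit; it assumes a torcharrow build with torch support so the Velox UDF is registered.

    # Sketch based on the docstring/tests in this commit; assumes a torch-enabled build.
    import torcharrow as ta
    from torcharrow import functional

    # Prepend a BOS index (0), then append an EOS index (6), to each row.
    indices = ta.column([[1, 2], [3, 4, 5]])
    out = functional.add_tokens(indices, [0], begin=True)   # [[0, 1, 2], [0, 3, 4, 5]]
    out = functional.add_tokens(out, [6], begin=False)      # [[0, 1, 2, 6], [0, 3, 4, 5, 6]]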

File tree (6 files changed: +126, -1 lines)

- csrc/velox/functions/CMakeLists.txt
- csrc/velox/functions/functions.h
- csrc/velox/functions/text/add_tokens.h
- docs/source/functional.rst
- torcharrow/functional.py
- torcharrow/test/transformation/test_text_ops.py

csrc/velox/functions/CMakeLists.txt
Lines changed: 2 additions & 1 deletion

@@ -28,11 +28,12 @@ if (USE_TORCH)
   list(
     APPEND
     TORCHARROW_UDF_SOURCES
+    text/add_tokens.h
+    text/bpe_tokenize.h
     text/gpt2_bpe_tokenizer.h
     text/gpt2_bpe_tokenizer.cpp
     text/regex.h
     text/regex.cpp
-    text/bpe_tokenize.h
     text/vocab.h
     text/vocab.cpp
     text/vocab_ops.h

csrc/velox/functions/functions.h
Lines changed: 14 additions & 0 deletions

@@ -16,6 +16,7 @@
 #include "rec/sigrid_hash.h" // @manual
 #include "string_functions.h"
 #ifdef USE_TORCH
+#include "text/add_tokens.h" // @manual
 #include "text/bpe_tokenize.h" // @manual
 #include "text/vocab_ops.h" // @manual
 #endif

@@ -262,6 +263,19 @@ inline void registerTorchArrowFunctions() {
       velox::ArrayWriterT<velox::Varchar>,
       std::shared_ptr<GPT2BPEEncoder>,
       velox::Varchar>({"bpe_tokenize"});
+
+  velox::registerFunction<
+      add_tokens,
+      velox::Array<int64_t>,
+      velox::Array<int64_t>,
+      velox::Array<int64_t>,
+      bool>({"add_tokens"});
+  velox::registerFunction<
+      add_tokens,
+      velox::Array<velox::Varchar>,
+      velox::Array<velox::Varchar>,
+      velox::Array<velox::Varchar>,
+      bool>({"add_tokens"});
 #endif

   // sigrid_hash
csrc/velox/functions/text/add_tokens.h (new file)
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "velox/functions/Udf.h"
+#include "velox/type/Type.h"
+
+namespace facebook::torcharrow::functions {
+template <typename T>
+struct add_tokens {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  template <typename TInput, typename TOutput, typename TTokens>
+  FOLLY_ALWAYS_INLINE void call(
+      TOutput& output,
+      const TInput& input,
+      const TTokens& tokens,
+      bool begin = true) {
+    output.reserve(input.size() + tokens.size());
+    if (begin) {
+      output.add_items(tokens);
+      output.add_items(input);
+    } else {
+      output.add_items(input);
+      output.add_items(tokens);
+    }
+  }
+};
+} // namespace facebook::torcharrow::functions
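
The per-row logic above is a plain prepend-or-append controlled by the `begin` flag: the writer reserves space for both lists and emits `tokens` before or after `input`. A rough Python equivalent of that row-level behavior, for intuition only (a hypothetical helper, not the actual Velox binding or part of torcharrow's API):

    # Illustrative row-level equivalent of add_tokens::call.
    from typing import List, TypeVar

    T = TypeVar("T", int, str)

    def add_tokens_row(row: List[T], tokens: List[T], begin: bool = True) -> List[T]:
        # begin=True prepends the tokens, begin=False appends them,
        # mirroring the two add_items() branches in the C++ struct.
        return tokens + row if begin else row + tokens

    assert add_tokens_row([1, 2], [0], begin=True) == [0, 1, 2]
    assert add_tokens_row(["Hello"], ["<eos>"], begin=False) == ["Hello", "<eos>"]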

docs/source/functional.rst
Lines changed: 9 additions & 0 deletions

@@ -32,6 +32,15 @@ Here is another example usage of Velox array function `array\_except <https://fa
 3 [2]
 dtype: List(Int64(nullable=True), nullable=True), length: 4, null_count: 0

+Text Operations
+-----------------------------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: function.rst
+
+    add_tokens
+
 Recommendation Operations
 -----------------------------------
 .. autosummary::

torcharrow/functional.py
Lines changed: 29 additions & 0 deletions

@@ -110,6 +110,35 @@ def __getattr__(op_name: str):
     return wrapper


+### operations in for text domain
+def add_tokens(
+    input_col: Union[ListColumn, List[Union[int, str]]],
+    tokens: Union[ListColumn, List[Union[int, str]]],
+    begin: bool,
+) -> NumericalColumn:
+    """
+    Append or prepend a list of tokens/indices to a column.
+    This is a common operation to add EOS and BOS tokens to text.
+
+    Parameters
+    ----------
+    input_col: List of input tokens/indices
+    tokens: List of tokens/indices to append or prepend
+    begin: Boolean to determine whether to prepend or append the tokens/indices
+
+    Examples
+    --------
+    >>> import torcharrow as ta
+    >>> from torcharrow import functional
+    >>> a = ta.column([[1, 2], [3, 4, 5]])
+    >>> functional.add_tokens(a, [0], begin=True)
+    0 [0, 1, 2]
+    1 [0, 3, 4, 5]
+    dtype: List(Int64(nullable=True), nullable=True), length: 2, null_count: 0
+    """
+    return _dispatch("add_tokens", input_col, tokens, begin)
+
+
 # Velox core functions
 # Not a comprehensive list yet
 def array_constructor(*args) -> ListColumn:
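
The docstring example only shows integer indices, but because the UDF is registered for both `velox::Array<int64_t>` and `velox::Array<velox::Varchar>`, the same Python entry point also accepts string columns, with the matching overload picked from the input type. A short sketch of the string case, mirroring the new test below (assumes a torch-enabled build):

    # Sketch of the string overload, mirroring test_add_tokens.
    import torcharrow as ta
    from torcharrow import functional

    text = ta.column([["Hello", "world"], ["How", "are", "you!"]])
    tagged = functional.add_tokens(text, ["<bos>"], begin=True)
    tagged = functional.add_tokens(tagged, ["<eos>"], begin=False)
    # [["<bos>", "Hello", "world", "<eos>"], ["<bos>", "How", "are", "you!", "<eos>"]]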

torcharrow/test/transformation/test_text_ops.py
Lines changed: 37 additions & 0 deletions

@@ -115,6 +115,19 @@ def setUpClass(cls):
                 ]
             ),
         )
+
+        cls.base_df_add_token = ta.dataframe(
+            {
+                "text": [["Hello", "world"], ["How", "are", "you!"]],
+                "indices": [[1, 2], [3, 4, 5]],
+            },
+            dtype=dt.Struct(
+                fields=[
+                    dt.Field("text", dt.List(dt.string)),
+                    dt.Field("indices", dt.List(dt.int64)),
+                ]
+            ),
+        )
         cls.setUpTestCaseData()

     @classmethod

@@ -141,6 +154,30 @@ def test_vocab_lookup_indices(self):
         out_df = functional.lookup_indices(vocab, self.df_vocab["text"])
         self.assertEqual(indices, list(out_df))

+    @unittest.skipUnless(
+        pytorch_available and _ta.is_built_with_torch(), "Requires PyTorch"
+    )
+    def test_add_tokens(self):
+        tokens = [
+            ["<bos>", "Hello", "world", "<eos>"],
+            ["<bos>", "How", "are", "you!", "<eos>"],
+        ]
+        indices = [[0, 1, 2, 6], [0, 3, 4, 5, 6]]
+
+        # adding indices
+        out_indices = functional.add_tokens(
+            self.base_df_add_token["indices"], [0], begin=True
+        )
+        out_indices = functional.add_tokens(out_indices, [6], begin=False)
+        self.assertEqual(indices, list(out_indices))
+
+        # adding tokens
+        out_tokens = functional.add_tokens(
+            self.base_df_add_token["text"], ["<bos>"], begin=True
+        )
+        out_tokens = functional.add_tokens(out_tokens, ["<eos>"], begin=False)
+        self.assertEqual(tokens, list(out_tokens))
+

 class TestTextOpsCpu(_TestTextOpsBase):
     @classmethod
