
Commit 25b0c6c

Nayef211 authored and facebook-github-bot committed

Create add_tokens UDF (#384)

Summary:

## Description
- Adding and registering a UDF for `add_tokens`, which can operate on lists of `int` and `str`.

# Testing
`pytest torcharrow/test/transformation/test_text_ops.py`

Pull Request resolved: #384
Reviewed By: wenleix, parmeet
Differential Revision: D37315924
Pulled By: Nayef211
fbshipit-source-id: 81ce4fa6b45ea0681e87e9807a3b2f49337bdd0d

1 parent 5a8d58f · commit 25b0c6c
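
For orientation, here is a minimal usage sketch of the new operator, put together from the docstring and tests added in this commit; it assumes a torcharrow build with torch support so the Velox UDF is registered.

    # Sketch based on the docstring/tests in this commit; assumes a torch-enabled build.
    import torcharrow as ta
    from torcharrow import functional

    # Prepend a BOS index (0), then append an EOS index (6), to each row.
    indices = ta.column([[1, 2], [3, 4, 5]])
    out = functional.add_tokens(indices, [0], begin=True)   # [[0, 1, 2], [0, 3, 4, 5]]
    out = functional.add_tokens(out, [6], begin=False)      # [[0, 1, 2, 6], [0, 3, 4, 5, 6]]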

File tree (6 files changed: +126, -1 lines)

- csrc/velox/functions/CMakeLists.txt
- csrc/velox/functions/functions.h
- csrc/velox/functions/text/add_tokens.h
- docs/source/functional.rst
- torcharrow/functional.py
- torcharrow/test/transformation/test_text_ops.py

csrc/velox/functions/CMakeLists.txt
Lines changed: 2 additions & 1 deletion

@@ -28,11 +28,12 @@ if (USE_TORCH)
   list(
     APPEND
     TORCHARROW_UDF_SOURCES
+    text/add_tokens.h
+    text/bpe_tokenize.h
     text/gpt2_bpe_tokenizer.h
     text/gpt2_bpe_tokenizer.cpp
     text/regex.h
     text/regex.cpp
-    text/bpe_tokenize.h
     text/vocab.h
     text/vocab.cpp
     text/vocab_ops.h

csrc/velox/functions/functions.h
Lines changed: 14 additions & 0 deletions

@@ -16,6 +16,7 @@
 #include "rec/sigrid_hash.h" // @manual
 #include "string_functions.h"
 #ifdef USE_TORCH
+#include "text/add_tokens.h" // @manual
 #include "text/bpe_tokenize.h" // @manual
 #include "text/vocab_ops.h" // @manual
 #endif

@@ -262,6 +263,19 @@ inline void registerTorchArrowFunctions() {
       velox::ArrayWriterT<velox::Varchar>,
       std::shared_ptr<GPT2BPEEncoder>,
       velox::Varchar>({"bpe_tokenize"});
+
+  velox::registerFunction<
+      add_tokens,
+      velox::Array<int64_t>,
+      velox::Array<int64_t>,
+      velox::Array<int64_t>,
+      bool>({"add_tokens"});
+  velox::registerFunction<
+      add_tokens,
+      velox::Array<velox::Varchar>,
+      velox::Array<velox::Varchar>,
+      velox::Array<velox::Varchar>,
+      bool>({"add_tokens"});
 #endif

   // sigrid_hash
csrc/velox/functions/text/add_tokens.h (new file)
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "velox/functions/Udf.h"
+#include "velox/type/Type.h"
+
+namespace facebook::torcharrow::functions {
+template <typename T>
+struct add_tokens {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  template <typename TInput, typename TOutput, typename TTokens>
+  FOLLY_ALWAYS_INLINE void call(
+      TOutput& output,
+      const TInput& input,
+      const TTokens& tokens,
+      bool begin = true) {
+    output.reserve(input.size() + tokens.size());
+    if (begin) {
+      output.add_items(tokens);
+      output.add_items(input);
+    } else {
+      output.add_items(input);
+      output.add_items(tokens);
+    }
+  }
+};
+} // namespace facebook::torcharrow::functions
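
The per-row logic above is a plain prepend-or-append controlled by the `begin` flag: the writer reserves space for both lists and emits `tokens` before or after `input`. A rough Python equivalent of that row-level behavior, for intuition only (a hypothetical helper, not the actual Velox binding or part of torcharrow's API):

    # Illustrative row-level equivalent of add_tokens::call.
    from typing import List, TypeVar

    T = TypeVar("T", int, str)

    def add_tokens_row(row: List[T], tokens: List[T], begin: bool = True) -> List[T]:
        # begin=True prepends the tokens, begin=False appends them,
        # mirroring the two add_items() branches in the C++ struct.
        return tokens + row if begin else row + tokens

    assert add_tokens_row([1, 2], [0], begin=True) == [0, 1, 2]
    assert add_tokens_row(["Hello"], ["<eos>"], begin=False) == ["Hello", "<eos>"]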

docs/source/functional.rst
Lines changed: 9 additions & 0 deletions

@@ -32,6 +32,15 @@ Here is another example usage of Velox array function `array\_except <https://fa
 3 [2]
 dtype: List(Int64(nullable=True), nullable=True), length: 4, null_count: 0

+Text Operations
+-----------------------------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: function.rst
+
+    add_tokens
+
 Recommendation Operations
 -----------------------------------
 .. autosummary::

torcharrow/functional.py
Lines changed: 29 additions & 0 deletions

@@ -110,6 +110,35 @@ def __getattr__(op_name: str):
     return wrapper


+### operations in for text domain
+def add_tokens(
+    input_col: Union[ListColumn, List[Union[int, str]]],
+    tokens: Union[ListColumn, List[Union[int, str]]],
+    begin: bool,
+) -> NumericalColumn:
+    """
+    Append or prepend a list of tokens/indices to a column.
+    This is a common operation to add EOS and BOS tokens to text.
+
+    Parameters
+    ----------
+    input_col: List of input tokens/indices
+    tokens: List of tokens/indices to append or prepend
+    begin: Boolean to determine whether to prepend or append the tokens/indices
+
+    Examples
+    --------
+    >>> import torcharrow as ta
+    >>> from torcharrow import functional
+    >>> a = ta.column([[1, 2], [3, 4, 5]])
+    >>> functional.add_tokens(a, [0], begin=True)
+    0 [0, 1, 2]
+    1 [0, 3, 4, 5]
+    dtype: List(Int64(nullable=True), nullable=True), length: 2, null_count: 0
+    """
+    return _dispatch("add_tokens", input_col, tokens, begin)
+
+
 # Velox core functions
 # Not a comprehensive list yet
 def array_constructor(*args) -> ListColumn:
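
The docstring example only shows integer indices, but because the UDF is registered for both `velox::Array<int64_t>` and `velox::Array<velox::Varchar>`, the same Python entry point also accepts string columns, with the matching overload picked from the input type. A short sketch of the string case, mirroring the new test below (assumes a torch-enabled build):

    # Sketch of the string overload, mirroring test_add_tokens.
    import torcharrow as ta
    from torcharrow import functional

    text = ta.column([["Hello", "world"], ["How", "are", "you!"]])
    tagged = functional.add_tokens(text, ["<bos>"], begin=True)
    tagged = functional.add_tokens(tagged, ["<eos>"], begin=False)
    # [["<bos>", "Hello", "world", "<eos>"], ["<bos>", "How", "are", "you!", "<eos>"]]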

torcharrow/test/transformation/test_text_ops.py
Lines changed: 37 additions & 0 deletions

@@ -115,6 +115,19 @@ def setUpClass(cls):
                 ]
             ),
         )
+
+        cls.base_df_add_token = ta.dataframe(
+            {
+                "text": [["Hello", "world"], ["How", "are", "you!"]],
+                "indices": [[1, 2], [3, 4, 5]],
+            },
+            dtype=dt.Struct(
+                fields=[
+                    dt.Field("text", dt.List(dt.string)),
+                    dt.Field("indices", dt.List(dt.int64)),
+                ]
+            ),
+        )
         cls.setUpTestCaseData()

     @classmethod

@@ -141,6 +154,30 @@ def test_vocab_lookup_indices(self):
         out_df = functional.lookup_indices(vocab, self.df_vocab["text"])
         self.assertEqual(indices, list(out_df))

+    @unittest.skipUnless(
+        pytorch_available and _ta.is_built_with_torch(), "Requires PyTorch"
+    )
+    def test_add_tokens(self):
+        tokens = [
+            ["<bos>", "Hello", "world", "<eos>"],
+            ["<bos>", "How", "are", "you!", "<eos>"],
+        ]
+        indices = [[0, 1, 2, 6], [0, 3, 4, 5, 6]]
+
+        # adding indices
+        out_indices = functional.add_tokens(
+            self.base_df_add_token["indices"], [0], begin=True
+        )
+        out_indices = functional.add_tokens(out_indices, [6], begin=False)
+        self.assertEqual(indices, list(out_indices))
+
+        # adding tokens
+        out_tokens = functional.add_tokens(
+            self.base_df_add_token["text"], ["<bos>"], begin=True
+        )
+        out_tokens = functional.add_tokens(out_tokens, ["<eos>"], begin=False)
+        self.assertEqual(tokens, list(out_tokens))
+

 class TestTextOpsCpu(_TestTextOpsBase):
     @classmethod
