Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,9 @@ http_archive(

http_archive(
name = "org_tensorflow",
strip_prefix = "tensorflow-40998f44c0c500ce0f6e3b1658dfbc54f838a82a",
sha256 = "5a5bc4599964c71277dcac0d687435291e5810d2ac2f6283cc96736febf73aaf",
strip_prefix = "tensorflow-2.20.0",
urls = [
"https://github.com/tensorflow/tensorflow/archive/40998f44c0c500ce0f6e3b1658dfbc54f838a82a.zip"
"https://github.com/tensorflow/tensorflow/archive/v2.20.0.zip"
],
)

Expand Down
4 changes: 3 additions & 1 deletion oss_scripts/configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ else
if [[ "$IS_NIGHTLY" == "nightly" ]]; then
pip install tf-nightly
else
pip install tensorflow==2.18.0
pip install tensorflow==2.20.0
fi
fi

Expand Down Expand Up @@ -85,3 +85,5 @@ if is_macos; then
fi

write_action_env_to_bazelrc "TF_CXX11_ABI_FLAG" ${TF_ABIFLAG}
#write_to_bazelrc "build --define=TENSORFLOW_TEXT_BUILD_TFLITE_OPS=1"
#write_to_bazelrc "build --define=with_tflite_ops=true"
6 changes: 2 additions & 4 deletions oss_scripts/pip_package/requirements.in
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
setuptools==70.0.0
setuptools==78.1.1
dm-tree==0.1.8 # Limit for macos support.
numpy
protobuf==4.25.3 # b/397977335 - Fix crash on python 3.9, 3.10.
tensorflow
tf-keras
tensorflow-datasets
tensorflow-metadata
tensorflow-datasets
2 changes: 1 addition & 1 deletion oss_scripts/pip_package/setup.nightly.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from setuptools.dist import Distribution

project_name = 'tensorflow-text-nightly'
project_version = 'REPLACE_ME'
project_version = '2.20.0'


class BinaryDistribution(Distribution):
Expand Down
8 changes: 4 additions & 4 deletions oss_scripts/pip_package/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from setuptools.dist import Distribution

project_name = 'tensorflow-text'
project_version = '2.18.0'
project_version = '2.20.0'


class BinaryDistribution(Distribution):
Expand Down Expand Up @@ -81,19 +81,19 @@ def finalize_options(self):
author_email='packages@tensorflow.org',
url='http://github.com/tensorflow/text',
license='Apache 2.0',
packages=find_packages(),
packages=find_packages() + ['tensorflow_text.core.pybinds'],
include_package_data=True,
zip_safe=False,
cmdclass={'install': InstallPlatlib},
distclass=BinaryDistribution,
install_requires=[
(
'tensorflow>=2.18.0, <2.19',
'tensorflow==2.20',
),
],
extras_require={
'tensorflow_cpu': [
'tensorflow-cpu>=2.18.0, <2.19',
'tensorflow-cpu==2.20',
],
'tests': [
'absl-py',
Expand Down
10 changes: 5 additions & 5 deletions oss_scripts/prepare_tf_dep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ else
fi

# Update TF dependency to installed tensorflow.
echo "Updating WORKSPACE file to use TensorFlow commit $commit_slug"
sed -E -i $ext "s/strip_prefix = \"tensorflow-.+\",/strip_prefix = \"tensorflow-${commit_slug}\",/" WORKSPACE
sed -E -i $ext "s|\"https://github.com/tensorflow/tensorflow/archive/.+\.zip\"|\"https://github.com/tensorflow/tensorflow/archive/${commit_slug}.zip\"|" WORKSPACE
prev_shasum=$(grep -A 1 -e "strip_prefix.*tensorflow-" WORKSPACE | tail -1 | awk -F '"' '{print $2}')
sed -i $ext "s/sha256 = \"${prev_shasum}\",//" WORKSPACE
# echo "Updating WORKSPACE file to use TensorFlow commit $commit_slug"
# sed -E -i $ext "s/strip_prefix = \"tensorflow-.+\",/strip_prefix = \"tensorflow-${commit_slug}\",/" WORKSPACE
# sed -E -i $ext "s|\"https://github.com/tensorflow/tensorflow/archive/.+\.zip\"|\"https://github.com/tensorflow/tensorflow/archive/${commit_slug}.zip\"|" WORKSPACE
# prev_shasum=$(grep -A 1 -e "strip_prefix.*tensorflow-" WORKSPACE | tail -1 | awk -F '"' '{print $2}')
# sed -i $ext "s/sha256 = \"${prev_shasum}\",//" WORKSPACE
2 changes: 1 addition & 1 deletion oss_scripts/run_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [[ $osname != "darwin" ]] || [[ ! $(sysctl -n machdep.cpu.brand_string) =~ "A
fi

# Build the pip package.
bazel run ${BUILD_ARGS[@]} --enable_runfiles //oss_scripts/pip_package:build_pip_package -- "$(realpath .)"
bazel run --jobs=6 ${BUILD_ARGS[@]} --enable_runfiles //oss_scripts/pip_package:build_pip_package -- "$(realpath .)"

if [ -n "${AUDITWHEEL_PLATFORM}" ]; then
echo $(date) : "=== Auditing wheel"
Expand Down
3 changes: 2 additions & 1 deletion tensorflow_text/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ py_library(
"python/keras/layers/__init__.py",
"python/metrics/__init__.py",
"python/numpy/__init__.py",
"//tensorflow_text/core/pybinds:__init__.py",
"python/ops/__init__.py",
"tools/__init__.py",
],
Expand Down Expand Up @@ -123,7 +124,7 @@ py_library(
":wordpiece_tokenizer",
":wordshape_ops",
# python/util:all_util tensorflow dep,
"//tensorflow_text/core/pybinds:tflite_registrar",
"//tensorflow_text/core",
"//tensorflow_text/tools/wordpiece_vocab",
] + extra_py_deps(),
)
Expand Down
2 changes: 1 addition & 1 deletion tensorflow_text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,4 @@
]

remove_undocumented(__name__, _allowed_symbols)
__version__ = "2.13.0"
__version__ = "2.20.0"
15 changes: 15 additions & 0 deletions tensorflow_text/core/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])

py_library(
name = "core",
deps = [
"//tensorflow_text/core/pybinds:pywrap_fast_bert_normalizer_model_builder",
"//tensorflow_text/core/pybinds:pywrap_fast_wordpiece_tokenizer_model_builder",
"//tensorflow_text/core/pybinds:pywrap_model_converter",
"//tensorflow_text/core/pybinds:pywrap_phrase_tokenizer_model_builder",
"//tensorflow_text/core/pybinds:pywrap_whitespace_tokenizer_config_builder",
"//tensorflow_text/core/pybinds:tflite_registrar",
],
)
Empty file.
5 changes: 5 additions & 0 deletions tensorflow_text/core/kernels/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ cc_library(
deps = [
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@darts_clone",
],
)
Expand Down Expand Up @@ -873,6 +874,10 @@ tf_cc_library(
deps = [
"@com_google_absl//absl/strings",
"@icu//:common",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/base:core_headers",
"@icu//:nfkc",
],
)

Expand Down
1 change: 1 addition & 0 deletions tensorflow_text/core/kernels/darts_clone_trie_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <numeric>

#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_cat.h"
#include "include/darts.h"

namespace tensorflow {
Expand Down
63 changes: 36 additions & 27 deletions tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,62 @@

#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h"

#include <iterator>
#include <string>

#include "icu4c/source/common/unicode/appendable.h"
#include "icu4c/source/common/unicode/bytestream.h"
#include "icu4c/source/common/unicode/edits.h"
#include "icu4c/source/common/unicode/normalizer2.h"
#include "icu4c/source/common/unicode/schriter.h"
#include "icu4c/source/common/unicode/stringoptions.h"
#include "icu4c/source/common/unicode/stringpiece.h"
#include "icu4c/source/common/unicode/uchar.h"
#include "icu4c/source/common/unicode/ucnv.h"
#include "icu4c/source/common/unicode/ucnv_err.h"
#include "icu4c/source/common/unicode/umachine.h"
#include "icu4c/source/common/unicode/uniset.h"
#include "icu4c/source/common/unicode/unistr.h"
#include "icu4c/source/common/unicode/uset.h"
#include "icu4c/source/common/unicode/utf.h"
#include "icu4c/source/common/unicode/utf8.h"
#include "icu4c/source/common/unicode/utypes.h"

namespace tensorflow {
namespace text {

namespace {

const icu::UnicodeSet& WhiteSpaceSet() {
// Will not fail because the data is hardcoded in the ICU library.
UErrorCode error_code = U_ZERO_ERROR;
const USet* c_set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &error_code);
// assert(U_SUCCESS(error_code));
const icu::UnicodeSet* set = icu::UnicodeSet::fromUSet(c_set);
return *set;
}

} // namespace

std::string BuildWhitespaceString() {
std::string str;
char buf[U8_MAX_LENGTH];
for (auto cp : WhiteSpaceSet().codePoints()) {
int len = 0;
U8_APPEND_UNSAFE(buf, len, cp);
str.append(buf, len);
icu::UnicodeString unicode_string;
icu::UnicodeStringAppendable appendable_unicode_string(unicode_string);
// The maximum codepoint in Unicode is 0x0010FFFF.
for (UChar32 cp = 0; cp <= 0x0010FFFF; ++cp) {
if (U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp)) {
appendable_unicode_string.appendCodePoint(cp);
}
}
std::string str;
unicode_string.toUTF8String(str);
return str;
}

std::string BuildWhitespaceTokenizerConfig() {
const icu::UnicodeSet& set = WhiteSpaceSet();
int range_count = set.getRangeCount();
UChar32 largest_whitespace = set.getRangeEnd(range_count - 1);
// The maximum codepoint in Unicode is 0x0010FFFF.
UChar32 max_unicode_char = 0x0010FFFF;
// The string will hold our bit array
std::string bitset((largest_whitespace >> 3) + 1, 0);
for (auto cp : set.codePoints()) {
int index = cp >> 3;
bitset[index] |= 1 << (cp & 7);
std::string bitset((max_unicode_char >> 3) + 1, 0);
auto bitdata = bitset.begin();
UChar32 largest_whitespace = 0;
int shift = 0;
for (UChar32 cp = 0; cp <= max_unicode_char; ++cp, ++shift) {
if (shift == 8) {
++bitdata;
shift = 0;
}
bool is_whitespace = U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp);
largest_whitespace = is_whitespace ? cp : largest_whitespace;
*bitdata |= is_whitespace << shift;
}
return bitset;
return bitset.substr(0, (largest_whitespace >> 3) + 1);
}

} // namespace text
Expand Down
Loading