Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 27c6185

Browse files
Cleanup in text_encoder.py
1 parent 6d4e7b4 commit 27c6185

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

tensor2tensor/data_generators/text_encoder.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,13 @@
3737

3838

3939
# Conversion between Unicode and UTF-8, if required (on Python2)
40-
def native_to_unicode(s):
41-
return s.decode("utf-8") if (PY2 and not isinstance(s, unicode)) else s
42-
unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
40+
if PY2:
41+
native_to_unicode = lambda s: s if isinstance(s, unicode) else s.decode("utf-8")
42+
unicode_to_native = lambda s: s.encode("utf-8")
43+
else:
44+
# No conversion required on Python3
45+
native_to_unicode = lambda s: s
46+
unicode_to_native = lambda s: s
4347

4448

4549
# Reserved tokens for things like padding and EOS symbols.
@@ -346,7 +350,7 @@ def build_from_token_counts(self,
346350
token_counts,
347351
min_count,
348352
num_iterations=4,
349-
num_reserved_ids=2):
353+
num_reserved_ids=NUM_RESERVED_TOKENS):
350354
"""Train a SubwordTextEncoder based on a dictionary of word counts.
351355
352356
Args:

0 commit comments

Comments (0)