@@ -54,7 +54,6 @@ describe("Tokenizer padding/truncation", () => {
   }, MAX_TOKENIZER_LOAD_TIME);
 
   describe("return_tensor=false (jagged array)", () => {
-
     test("jagged array output when return_tensor is false", () => {
       const output = tokenizer(inputs, {
         return_tensor: false,
@@ -105,7 +104,6 @@ describe("Tokenizer padding/truncation", () => {
       compare(output, expected);
     });
 
-
     test("No padding, max_length=3 (implicit truncation strategy)", () => {
       const output = tokenizer(inputs_2, {
         padding: false,
@@ -129,9 +127,18 @@ describe("Tokenizer padding/truncation", () => {
         return_tensor: false,
       });
       const expected = {
-        input_ids: [[1037, 0, 0, 0, 0], [1038, 1039, 1040, 1041, 1042]],
-        token_type_ids: [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
-        attention_mask: [[1, 0, 0, 0, 0], [1, 1, 1, 1, 1]],
+        input_ids: [
+          [1037, 0, 0, 0, 0],
+          [1038, 1039, 1040, 1041, 1042],
+        ],
+        token_type_ids: [
+          [0, 0, 0, 0, 0],
+          [0, 0, 0, 0, 0],
+        ],
+        attention_mask: [
+          [1, 0, 0, 0, 0],
+          [1, 1, 1, 1, 1],
+        ],
       };
       compare(output, expected);
     });
@@ -161,48 +168,75 @@ describe("Tokenizer padding/truncation", () => {
         return_tensor: false,
       });
       const expected = {
-        input_ids: [[1037, 0, 0], [1038, 1039, 1040]],
-        token_type_ids: [[0, 0, 0], [0, 0, 0]],
-        attention_mask: [[1, 0, 0], [1, 1, 1]],
+        input_ids: [
+          [1037, 0, 0],
+          [1038, 1039, 1040],
+        ],
+        token_type_ids: [
+          [0, 0, 0],
+          [0, 0, 0],
+        ],
+        attention_mask: [
+          [1, 0, 0],
+          [1, 1, 1],
+        ],
       };
       compare(output, expected);
     });
 
     test("Padding 'max_length' without truncation, max_length=3", () => {
       const output = tokenizer(inputs_2, {
-        padding: 'max_length',
+        padding: "max_length",
         truncation: false,
         max_length: 3,
         add_special_tokens: false,
         return_tensor: false,
       });
       const expected = {
-        input_ids: [[1037, 0, 0], [1038, 1039, 1040, 1041, 1042]],
-        token_type_ids: [[0, 0, 0], [0, 0, 0, 0, 0]],
-        attention_mask: [[1, 0, 0], [1, 1, 1, 1, 1]],
+        input_ids: [
+          [1037, 0, 0],
+          [1038, 1039, 1040, 1041, 1042],
+        ],
+        token_type_ids: [
+          [0, 0, 0],
+          [0, 0, 0, 0, 0],
+        ],
+        attention_mask: [
+          [1, 0, 0],
+          [1, 1, 1, 1, 1],
+        ],
       };
       compare(output, expected);
     });
 
     test("Padding 'max_length' with truncation, max_length=3", () => {
       const output = tokenizer(inputs_2, {
-        padding: 'max_length',
+        padding: "max_length",
         truncation: true,
         max_length: 3,
         add_special_tokens: false,
         return_tensor: false,
       });
       const expected = {
-        input_ids: [[1037, 0, 0], [1038, 1039, 1040]],
-        token_type_ids: [[0, 0, 0], [0, 0, 0]],
-        attention_mask: [[1, 0, 0], [1, 1, 1]],
+        input_ids: [
+          [1037, 0, 0],
+          [1038, 1039, 1040],
+        ],
+        token_type_ids: [
+          [0, 0, 0],
+          [0, 0, 0],
+        ],
+        attention_mask: [
+          [1, 0, 0],
+          [1, 1, 1],
+        ],
       };
       compare(output, expected);
     });
 
     test("Padding 'max_length' without truncation and max_length=null", () => {
       const output = tokenizer(inputs_2, {
-        padding: 'max_length',
+        padding: "max_length",
         truncation: false,
         max_length: null,
         add_special_tokens: false,
@@ -211,23 +245,22 @@ describe("Tokenizer padding/truncation", () => {
       const expected = {
         input_ids: [
           [1037, ...Array(511).fill(0)],
-          [1038, 1039, 1040, 1041, 1042, ...Array(507).fill(0)]
+          [1038, 1039, 1040, 1041, 1042, ...Array(507).fill(0)],
         ],
         token_type_ids: [
           [0, ...Array(511).fill(0)],
-          [0, 0, 0, 0, 0, ...Array(507).fill(0)]
+          [0, 0, 0, 0, 0, ...Array(507).fill(0)],
         ],
         attention_mask: [
           [1, ...Array(511).fill(0)],
-          [1, 1, 1, 1, 1, ...Array(507).fill(0)]
+          [1, 1, 1, 1, 1, ...Array(507).fill(0)],
         ],
       };
       compare(output, expected);
     });
   });
 
   describe("return_tensor=true", () => {
-
     test("throws error when tensor output is requested for a jagged array", () => {
       expect(() => tokenizer(inputs)).toThrow("Unable to create tensor");
     });
@@ -329,7 +362,7 @@ describe("Tokenizer padding/truncation", () => {
 
     test("padding:'max_length' pads to the specified max_length", () => {
       const { input_ids, attention_mask, token_type_ids } = tokenizer(inputs, {
-        padding: 'max_length',
+        padding: "max_length",
         truncation: true,
         add_special_tokens: false,
         max_length: 3,
@@ -347,7 +380,7 @@ describe("Tokenizer padding/truncation", () => {
         [0n, 0n, 0n],
       ]);
     });
-  })
+  });
 });
 
 describe("Token type ids", () => {