@@ -45,9 +45,10 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
4545 fun makeTokenizer (
4646 mode : SplitMode ,
4747 noPunctuation : Boolean = true,
48+ allowEmptyMorpheme : Boolean = false,
4849 capacity : Int = 0
4950 ): SudachiTokenizer {
50- val dict = ReloadableDictionary (config)
51+ val dict = ReloadableDictionary (config.allowEmptyMorpheme(allowEmptyMorpheme) )
5152 val extractor =
5253 if (capacity == 0 ) {
5354 NoopInputExtractor .INSTANCE
@@ -113,7 +114,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
113114
114115 @Test
115116 fun incrementTokenByPunctuationMode () {
116- val tokenizer = makeTokenizer(SplitMode .C , false )
117+ val tokenizer = makeTokenizer(SplitMode .C , noPunctuation = false )
117118 tokenizer.setReader(StringReader (" 東京都に行った。" ))
118119 assertTokenStreamContents(
119120 tokenizer,
@@ -128,7 +129,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
128129
129130 @Test
130131 fun incrementTokenWithPunctuationsByDefaultMode () {
131- val tokenizer = makeTokenizer(SplitMode .C , true )
132+ val tokenizer = makeTokenizer(SplitMode .C , noPunctuation = true )
132133 tokenizer.setReader(StringReader (" 東京都に行った。東京都に行った。" ))
133134 assertTokenStreamContents(
134135 tokenizer,
@@ -143,7 +144,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
143144
144145 @Test
145146 fun incrementTokenWithPunctuationsByPunctuationMode () {
146- val tokenizer = makeTokenizer(SplitMode .C , false )
147+ val tokenizer = makeTokenizer(SplitMode .C , noPunctuation = false )
147148 tokenizer.setReader(StringReader (" 東京都に行った。東京都に行った。" ))
148149 assertTokenStreamContents(
149150 tokenizer,
@@ -158,7 +159,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
158159
159160 @Test
160161 fun incrementTokenWithPunctuationsByPunctuationModeCached () {
161- val tokenizer = makeTokenizer(SplitMode .C , false , capacity = 10 )
162+ val tokenizer = makeTokenizer(SplitMode .C , noPunctuation = false , capacity = 10 )
162163 tokenizer.setReader(StringReader (" 東京都に行った。東京都に行った。" ))
163164 assertTokenStreamContents(
164165 tokenizer,
@@ -173,7 +174,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
173174
174175 @Test
175176 fun incrementTokenWithOOVByDefaultMode () {
176- val tokenizer = makeTokenizer(SplitMode .C , true )
177+ val tokenizer = makeTokenizer(SplitMode .C )
177178 tokenizer.setReader(StringReader (" アマゾンに行った。" ))
178179 assertTokenStreamContents(
179180 tokenizer,
@@ -188,7 +189,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
188189
189190 @Test
190191 fun incrementTokenWithOOVByPunctuationMode () {
191- val tokenizerPunctuation = makeTokenizer(SplitMode .C , false )
192+ val tokenizerPunctuation = makeTokenizer(SplitMode .C , noPunctuation = false )
192193 tokenizerPunctuation.setReader(StringReader (" アマゾンに行った。" ))
193194 assertTokenStreamContents(
194195 tokenizerPunctuation,
@@ -203,7 +204,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
203204
204205 @Test
205206 fun incrementTokenByAMode () {
206- val tokenizerA = makeTokenizer(SplitMode .A , true )
207+ val tokenizerA = makeTokenizer(SplitMode .A )
207208 tokenizerA.setReader(StringReader (" 東京都に行った。" ))
208209 assertTokenStreamContents(
209210 tokenizerA,
@@ -218,7 +219,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
218219
219220 @Test
220221 fun incrementTokenByBMode () {
221- val tokenizerB = makeTokenizer(SplitMode .B , true )
222+ val tokenizerB = makeTokenizer(SplitMode .B )
222223 tokenizerB.setReader(StringReader (" 東京都に行った。" ))
223224 assertTokenStreamContents(
224225 tokenizerB,
@@ -236,7 +237,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
236237 val builder = NormalizeCharMap .Builder ()
237238 builder.add(" 東京都" , " 京都" )
238239 val filter = MappingCharFilter (builder.build(), StringReader (" 東京都に行った。" ))
239- val tokenizer = makeTokenizer(SplitMode .C , true )
240+ val tokenizer = makeTokenizer(SplitMode .C )
240241 tokenizer.setReader(filter)
241242 assertTokenStreamContents(
242243 tokenizer,
@@ -249,9 +250,57 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
249250 )
250251 }
251252
253+ @Test
254+ fun incrementTokenWithCorrectSplitOffset () {
255+ val builder = NormalizeCharMap .Builder ()
256+ builder.add(" (株)" , " 株式会社" )
257+ val filter = MappingCharFilter (builder.build(), StringReader (" (株)に行った。" ))
258+ val tokenizer = makeTokenizer(SplitMode .A )
259+ tokenizer.setReader(filter)
260+ assertTokenStreamContents(
261+ tokenizer,
262+ arrayOf(" 株式" , " 会社" , " に" , " 行っ" , " た" ),
263+ intArrayOf(0 , 2 , 3 , 4 , 6 ),
264+ intArrayOf(2 , 3 , 4 , 6 , 7 ),
265+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
266+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
267+ 8 ,
268+ )
269+ }
270+
271+ @Test
272+ fun incrementTokenWithDisallowEmptyMorpheme () {
273+ val tokenizer = makeTokenizer(SplitMode .A , allowEmptyMorpheme = false )
274+ tokenizer.setReader(StringReader (" ㍿に行った。" ))
275+ assertTokenStreamContents(
276+ tokenizer,
277+ arrayOf(" ㍿" , " ㍿" , " に" , " 行っ" , " た" ),
278+ intArrayOf(0 , 0 , 1 , 2 , 4 ),
279+ intArrayOf(1 , 1 , 2 , 4 , 5 ),
280+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
281+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
282+ 6 ,
283+ )
284+ }
285+
286+ @Test
287+ fun incrementTokenWithAllowEmptyMorpheme () {
288+ val tokenizer = makeTokenizer(SplitMode .A , allowEmptyMorpheme = true )
289+ tokenizer.setReader(StringReader (" ㍿に行った。" ))
290+ assertTokenStreamContents(
291+ tokenizer,
292+ arrayOf(" ㍿" , " " , " に" , " 行っ" , " た" ),
293+ intArrayOf(0 , 1 , 1 , 2 , 4 ),
294+ intArrayOf(1 , 1 , 2 , 4 , 5 ),
295+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
296+ intArrayOf(1 , 1 , 1 , 1 , 1 ),
297+ 6 ,
298+ )
299+ }
300+
252301 @Test
253302 fun additionalSettings () {
254- val tokenizer = makeTokenizer(SplitMode .C , true )
303+ val tokenizer = makeTokenizer(SplitMode .C )
255304 tokenizer.setReader(StringReader (" 自然言語" ))
256305 assertTokenStreamContents(
257306 tokenizer,
@@ -268,7 +317,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
268317 config =
269318 Config .fromClasspath(ResourceUtil ::class .java.getResource(" additional.json" ), anchor)
270319 .withFallback(config)
271- val tokenizer2 = makeTokenizer(SplitMode .C , true )
320+ val tokenizer2 = makeTokenizer(SplitMode .C )
272321 tokenizer2.setReader(StringReader (" 自然言語" ))
273322 assertTokenStreamContents(
274323 tokenizer2,
@@ -283,8 +332,8 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
283332
284333 @Test
285334 fun equalsHashCodeCoverage () {
286- val tokenizerA = makeTokenizer(SplitMode .A , true )
287- val tokenizerB = makeTokenizer(SplitMode .B , true )
335+ val tokenizerA = makeTokenizer(SplitMode .A )
336+ val tokenizerB = makeTokenizer(SplitMode .B )
288337 assertNotEquals(tokenizerA, tokenizerB)
289338 assertNotEquals(tokenizerA.hashCode().toLong(), tokenizerB.hashCode().toLong())
290339 }