2323import com .worksap .nlp .lucene .sudachi .ja .attributes .*;
2424import com .worksap .nlp .lucene .sudachi .ja .util .Strings ;
2525import com .worksap .nlp .sudachi .Morpheme ;
26-
2726import com .worksap .nlp .sudachi .Tokenizer ;
2827import org .apache .lucene .analysis .TokenFilter ;
2928import org .apache .lucene .analysis .TokenStream ;
@@ -40,56 +39,18 @@ public enum Mode {
4039
4140 public static final Mode DEFAULT_MODE = Mode .SEARCH ;
4241
43- static class OovChars {
44- private int length ;
45- private char [] buffer = new char [0 ];
46- private int reserved ;
47- private int index ;
48- private int baseOffset ;
49-
50- public void setOov (int offset , char [] src , int length ) {
51- baseOffset = offset ;
52- this .length = length ;
53- if (reserved < length ) {
54- buffer = new char [length ];
55- reserved = length ;
56- }
57- System .arraycopy (src , 0 , buffer , 0 , length );
58- index = 0 ;
59- }
60-
61- public boolean hasNext () {
62- return index < length ;
63- }
64-
65- public char next () {
66- if (index < length ) {
67- return buffer [index ++];
68- } else {
69- throw new IllegalStateException ();
70- }
71- }
72-
73- public int index () {
74- return index ;
75- }
76-
77- public int offset () {
78- return baseOffset + index ;
79- }
80- }
81-
8242 private final Mode mode ;
8343 private final Tokenizer .SplitMode splitMode ;
44+
8445 private final CharTermAttribute termAtt ;
8546 private final OffsetAttribute offsetAtt ;
8647 private final PositionIncrementAttribute posIncAtt ;
8748 private final PositionLengthAttribute posLengthAtt ;
8849 private final MorphemeAttribute morphemeAtt ;
89- private ListIterator <Morpheme > aUnitIterator ;
90- private final OovChars oovChars = new OovChars ();
9150
92- private int aUnitOffset = 0 ;
51+ private final MorphemeSubunits subunits = new MorphemeSubunits ();
52+ private final OovChars oovChars = new OovChars ();
53+ private List <Integer > offsetMap ;
9354
9455 public SudachiSplitFilter (TokenStream input , Mode mode , Tokenizer .SplitMode splitMode ) {
9556 super (input );
@@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
10566
10667 @ Override
10768 public final boolean incrementToken () throws IOException {
69+ // continue to write current split
10870 if (oovChars .hasNext ()) {
10971 clearAttributes ();
11072 setOOVAttribute ();
11173 return true ;
11274 }
113- if (aUnitIterator != null && aUnitIterator .hasNext ()) {
75+ if (subunits .hasNext ()) {
11476 clearAttributes ();
115- setAUnitAttribute (aUnitIterator .next ());
77+ setAUnitAttribute ();
78+ return true ;
79+ }
80+
81+ // move to next morpheme
82+ if (!input .incrementToken ()) {
83+ return false ;
84+ }
85+
86+ Morpheme m = morphemeAtt .getMorpheme ();
87+ this .offsetMap = morphemeAtt .getOffsets ();
88+ if (m == null ) {
11689 return true ;
11790 }
11891
119- if (input .incrementToken ()) {
92+ // oov does not have splits
93+ // split into characters in extended mode
94+ if (m .isOOV ()) {
12095 int length = 0 ;
121- Morpheme m = morphemeAtt .getMorpheme ();
122- if (m == null ) {
123- return true ;
124- }
125- termAtt .setEmpty ().append (m .surface ());
126- if (mode == Mode .EXTENDED && m .isOOV () && (length = Strings .codepointCount (termAtt )) > 1 ) {
127- oovChars .setOov (offsetAtt .startOffset (), termAtt .buffer (), termAtt .length ());
96+ if (mode == Mode .EXTENDED && (length = Strings .codepointCount (termAtt )) > 1 ) {
97+ // OovChars requires character length
98+ oovChars .setOov (termAtt .buffer (), termAtt .length ());
99+ // Position length should be codepoint length
128100 posLengthAtt .setPositionLength (length );
129- } else if (splitMode != Tokenizer .SplitMode .C ) {
130- List <Morpheme > subUnits = m .split (splitMode );
131- if (subUnits .size () > 1 ) {
132- aUnitIterator = subUnits .listIterator ();
133- aUnitOffset = offsetAtt .startOffset ();
134- posLengthAtt .setPositionLength (subUnits .size ());
135- } else {
136- posLengthAtt .setPositionLength (1 );
137- }
138101 }
139102 return true ;
140- } else {
141- return false ;
142103 }
104+
105+ // C split is the longest split
106+ if (splitMode == Tokenizer .SplitMode .C ) {
107+ return true ;
108+ }
109+
110+ // split into A/B units
111+ List <Morpheme > subsplits = m .split (splitMode );
112+ if (subsplits .size () > 1 ) {
113+ subunits .setUnits (subsplits );
114+ posLengthAtt .setPositionLength (subunits .size ());
115+ }
116+
117+ return true ;
118+ }
119+
120+ private int correctOffset (int currectOff ) {
121+ // assert (0 <= currectOff && currectOff <= this.offsetMap.size());
122+ return this .offsetMap .get (currectOff );
143123 }
144124
145- private void setAUnitAttribute (Morpheme morpheme ) {
125+ private void setAUnitAttribute () {
146126 posLengthAtt .setPositionLength (1 );
147- if (aUnitIterator . previousIndex () == 0 ) {
127+ if (subunits . index () == 0 ) {
148128 posIncAtt .setPositionIncrement (0 );
149129 } else {
150130 posIncAtt .setPositionIncrement (1 );
151131 }
152- int length = morpheme .end () - morpheme .begin ();
153- offsetAtt .setOffset (aUnitOffset , aUnitOffset + length );
154- aUnitOffset += length ;
155- morphemeAtt .setMorpheme (morpheme );
156- termAtt .setEmpty ().append (morpheme .surface ());
132+
133+ MorphemeSubunits .Subunit su = subunits .next ();
134+ termAtt .setEmpty ().append (su .morpheme .surface ());
135+ morphemeAtt .setMorpheme (su .morpheme );
136+ morphemeAtt .setOffsets (offsetMap .subList (su .begin , su .end + 1 ));
137+ offsetAtt .setOffset (correctOffset (su .begin ), correctOffset (su .end ));
157138 }
158139
159140 private void setOOVAttribute () {
160- int offset = oovChars .offset ();
161141 posLengthAtt .setPositionLength (1 );
162142 if (oovChars .index () == 0 ) {
163143 posIncAtt .setPositionIncrement (0 );
164144 } else {
165145 posIncAtt .setPositionIncrement (1 );
166146 }
147+
148+ int startOffset = oovChars .offset ();
167149 char c = oovChars .next ();
168150 termAtt .setEmpty ().append (c );
169151 if (Character .isSurrogate (c ) && oovChars .hasNext ()) {
170152 termAtt .append (oovChars .next ());
171- offsetAtt .setOffset (offset , offset + 2 );
172- } else {
173- offsetAtt .setOffset (offset , offset + 1 );
153+ }
154+ int endOffset = oovChars .offset ();
155+ offsetAtt .setOffset (correctOffset (startOffset ), correctOffset (endOffset ));
156+ }
157+
158+ static class OovChars {
159+ private int reserved ;
160+ private char [] buffer = new char [0 ];
161+ private int length ;
162+ private int index ;
163+
164+ public void setOov (char [] src , int length ) {
165+ this .length = length ;
166+ if (reserved < length ) {
167+ buffer = new char [length ];
168+ reserved = length ;
169+ }
170+ System .arraycopy (src , 0 , buffer , 0 , length );
171+ index = 0 ;
172+ }
173+
174+ public boolean hasNext () {
175+ return index < length ;
176+ }
177+
178+ public char next () {
179+ if (index < length ) {
180+ return buffer [index ++];
181+ }
182+ throw new IllegalStateException ();
183+ }
184+
185+ public int index () {
186+ return index ;
187+ }
188+
189+ public int offset () {
190+ return index ;
191+ }
192+ }
193+
194+ static class MorphemeSubunits {
195+ static class Subunit {
196+ final Morpheme morpheme ;
197+ final int begin ;
198+ final int end ;
199+
200+ public Subunit (Morpheme morpheme , int begin , int end ) {
201+ this .morpheme = morpheme ;
202+ this .begin = begin ;
203+ this .end = end ;
204+ }
205+ }
206+
207+ private List <Morpheme > morphemes ;
208+ private int size ;
209+ private int index ;
210+ private int baseOffset ;
211+
212+ public void setUnits (List <Morpheme > morphemes ) {
213+ this .morphemes = morphemes ;
214+ size = morphemes .size ();
215+ index = 0 ;
216+ baseOffset = morphemes .get (0 ).begin ();
217+ }
218+
219+ public boolean hasNext () {
220+ return index < size ;
221+ }
222+
223+ public Subunit next () {
224+ if (!hasNext ()) {
225+ throw new IllegalStateException ();
226+ }
227+ Morpheme m = morphemes .get (index ++);
228+ return new Subunit (m , m .begin () - baseOffset , m .end () - baseOffset );
229+ }
230+
231+ public int size () {
232+ return size ;
233+ }
234+
235+ public int index () {
236+ return index ;
174237 }
175238 }
176239}
0 commit comments