Skip to content

Commit 4c89760

Browse files
authored
chore: use chunk_overlap as minimum atom size, if specified (#1160)
1 parent 276b185 commit 4c89760

File tree

1 file changed

+21
-10
lines changed

1 file changed

+21
-10
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 21 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -511,6 +511,7 @@ struct RecursiveChunker<'s> {
511511
chunk_size: usize,
512512
chunk_overlap: usize,
513513
min_chunk_size: usize,
514+
min_atom_chunk_size: usize,
514515
}
515516

516517
impl<'t, 's: 't> RecursiveChunker<'s> {
@@ -526,7 +527,7 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
526527
atom_collector.curr_level = iter_stack.len();
527528

528529
if let Some(current_chunk) = iter_stack.last_mut().unwrap().next() {
529-
if current_chunk.range.len() <= self.min_chunk_size {
530+
if current_chunk.range.len() <= self.min_atom_chunk_size {
530531
atom_collector.collect(current_chunk.range);
531532
} else {
532533
match current_chunk.kind {
@@ -819,19 +820,29 @@ impl SimpleFunctionExecutor for Executor {
819820
async fn evaluate(&self, input: Vec<Value>) -> Result<Value> {
820821
let full_text = self.args.text.value(&input)?.as_str()?;
821822
let chunk_size = self.args.chunk_size.value(&input)?.as_int64()?;
822-
let recursive_chunker = RecursiveChunker {
823-
full_text,
824-
chunk_size: chunk_size as usize,
825-
chunk_overlap: (self.args.chunk_overlap.value(&input)?)
823+
let min_chunk_size = (self.args.min_chunk_size.value(&input)?)
824+
.optional()
825+
.map(|v| v.as_int64())
826+
.transpose()?
827+
.unwrap_or(chunk_size / 2) as usize;
828+
let chunk_overlap = std::cmp::min(
829+
(self.args.chunk_overlap.value(&input)?)
826830
.optional()
827831
.map(|v| v.as_int64())
828832
.transpose()?
829833
.unwrap_or(0) as usize,
830-
min_chunk_size: (self.args.min_chunk_size.value(&input)?)
831-
.optional()
832-
.map(|v| v.as_int64())
833-
.transpose()?
834-
.unwrap_or(chunk_size / 2) as usize,
834+
min_chunk_size,
835+
);
836+
let recursive_chunker = RecursiveChunker {
837+
full_text,
838+
chunk_size: chunk_size as usize,
839+
chunk_overlap,
840+
min_chunk_size,
841+
min_atom_chunk_size: if chunk_overlap > 0 {
842+
chunk_overlap
843+
} else {
844+
min_chunk_size
845+
},
835846
};
836847

837848
let language = UniCase::new(

0 commit comments

Comments
 (0)