Skip to content

Commit 276b185

Browse files
authored
chore: add more punctuations for pure text split (#1159)
1 parent 08c456f commit 276b185

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,17 @@ static DEFAULT_LANGUAGE_CONFIG: LazyLock<SimpleLanguageConfig> =
4747
LazyLock::new(|| SimpleLanguageConfig {
4848
name: "_DEFAULT".to_string(),
4949
aliases: vec![],
50-
separator_regex: [r"\n\n+", r"\n", r"\s+"]
51-
.into_iter()
52-
.map(|s| Regex::new(s).unwrap())
53-
.collect(),
50+
separator_regex: [
51+
r"\n\n+",
52+
r"\n",
53+
r"[\.\?!]\s+|。|?|!",
54+
r"[;:—\-]\s+|;|:|—+",
55+
r",\s+|,",
56+
r"\s+",
57+
]
58+
.into_iter()
59+
.map(|s| Regex::new(s).unwrap())
60+
.collect(),
5461
});
5562

5663
struct TreesitterLanguageConfig {
@@ -1284,7 +1291,7 @@ mod tests {
12841291
custom_languages: vec![],
12851292
};
12861293
let factory = Arc::new(Factory);
1287-
let text = " \n First chunk. \n\n Second chunk with spaces at the end. \n";
1294+
let text = " \n First chunk \n\n Second chunk with spaces at the end \n";
12881295
let input_arg_schemas = &build_split_recursively_arg_schemas();
12891296

12901297
{
@@ -1312,9 +1319,9 @@ mod tests {
13121319
assert_eq!(table.len(), 3);
13131320

13141321
let expected_chunks = vec![
1315-
(RangeValue::new(3, 16), " First chunk."),
1322+
(RangeValue::new(3, 15), " First chunk"),
13161323
(RangeValue::new(19, 45), " Second chunk with spaces"),
1317-
(RangeValue::new(46, 57), "at the end."),
1324+
(RangeValue::new(46, 56), "at the end"),
13181325
];
13191326

13201327
for (range, expected_text) in expected_chunks {

0 commit comments

Comments
 (0)