@@ -47,10 +47,17 @@ static DEFAULT_LANGUAGE_CONFIG: LazyLock<SimpleLanguageConfig> =
4747 LazyLock :: new ( || SimpleLanguageConfig {
4848 name : "_DEFAULT" . to_string ( ) ,
4949 aliases : vec ! [ ] ,
50- separator_regex : [ r"\n\n+" , r"\n" , r"\s+" ]
51- . into_iter ( )
52- . map ( |s| Regex :: new ( s) . unwrap ( ) )
53- . collect ( ) ,
50+ separator_regex : [
51+ r"\n\n+" ,
52+ r"\n" ,
53+ r"[\.\?!]\s+|。|？|！" ,
54+ r"[;:—\-]\s+|；|：|—+" ,
55+ r",\s+|，" ,
56+ r"\s+" ,
57+ ]
58+ . into_iter ( )
59+ . map ( |s| Regex :: new ( s) . unwrap ( ) )
60+ . collect ( ) ,
5461 } ) ;
5562
5663struct TreesitterLanguageConfig {
@@ -1284,7 +1291,7 @@ mod tests {
12841291 custom_languages : vec ! [ ] ,
12851292 } ;
12861293 let factory = Arc :: new ( Factory ) ;
1287- let text = " \n First chunk. \n \n Second chunk with spaces at the end. \n " ;
1294+ let text = " \n First chunk \n \n Second chunk with spaces at the end \n " ;
12881295 let input_arg_schemas = & build_split_recursively_arg_schemas ( ) ;
12891296
12901297 {
@@ -1312,9 +1319,9 @@ mod tests {
13121319 assert_eq ! ( table. len( ) , 3 ) ;
13131320
13141321 let expected_chunks = vec ! [
1315- ( RangeValue :: new( 3 , 16 ) , " First chunk. " ) ,
1322+ ( RangeValue :: new( 3 , 15 ) , " First chunk" ) ,
13161323 ( RangeValue :: new( 19 , 45 ) , " Second chunk with spaces" ) ,
1317- ( RangeValue :: new( 46 , 57 ) , "at the end. " ) ,
1324+ ( RangeValue :: new( 46 , 56 ) , "at the end" ) ,
13181325 ] ;
13191326
13201327 for ( range, expected_text) in expected_chunks {
0 commit comments