diff --git a/.github/workflows/Bench.yaml b/.github/workflows/Bench.yaml index ef6ef1d2..da30b16d 100644 --- a/.github/workflows/Bench.yaml +++ b/.github/workflows/Bench.yaml @@ -29,7 +29,7 @@ jobs: - name: Install codspeed uses: taiki-e/install-action@v2 with: - tool: cargo-codspeed + tool: cargo-codspeed@4.0.5 - name: Build Benchmark run: cargo codspeed build --features codspeed diff --git a/benches/benchmark_repetitive_react_components.rs b/benches/benchmark_repetitive_react_components.rs index 7f571551..602282ee 100644 --- a/benches/benchmark_repetitive_react_components.rs +++ b/benches/benchmark_repetitive_react_components.rs @@ -9,9 +9,9 @@ pub use criterion::*; pub use codspeed_criterion_compat::*; use rspack_sources::{ - BoxSource, CachedSource, ConcatSource, MapOptions, ObjectPool, - OriginalSource, RawStringSource, ReplaceSource, ReplacementEnforce, Source, - SourceExt, SourceMap, SourceMapSource, SourceMapSourceOptions, + BoxSource, ConcatSource, MapOptions, ObjectPool, OriginalSource, + RawStringSource, ReplaceSource, ReplacementEnforce, Source, SourceExt, + SourceMap, SourceMapSource, SourceMapSourceOptions, }; static REPETITIVE_1K_REACT_COMPONENTS_SOURCE: LazyLock = diff --git a/src/cached_source.rs b/src/cached_source.rs index c7906445..36a2e0e2 100644 --- a/src/cached_source.rs +++ b/src/cached_source.rs @@ -1,6 +1,5 @@ use std::{ borrow::Cow, - cell::OnceCell, hash::{Hash, Hasher}, sync::{Arc, OnceLock}, }; @@ -21,6 +20,7 @@ use crate::{ struct CachedData { hash: OnceLock, size: OnceLock, + chunks: OnceLock>, columns_map: OnceLock>, line_only_map: OnceLock>, } @@ -79,11 +79,37 @@ impl CachedSource { cache: Arc::new(CachedData::default()), } } + + fn get_or_init_chunks(&self) -> &[&str] { + self.cache.chunks.get_or_init(|| { + let mut chunks = Vec::new(); + self.inner.rope(&mut |chunk| { + chunks.push(chunk); + }); + #[allow(unsafe_code)] + // SAFETY: CachedSource guarantees that the underlying source outlives the cache, + // so transmuting Vec<&str> to Vec<&'static str> is safe in this context. + // This allows us to store string slices in the cache without additional allocations. + unsafe { + std::mem::transmute::, Vec<&'static str>>(chunks) + } + }) + } } impl Source for CachedSource { fn source(&self) -> SourceValue { - self.inner.source() + let chunks = self.get_or_init_chunks(); + let mut string = String::with_capacity(self.size()); + for chunk in chunks { + string.push_str(chunk); + } + SourceValue::String(Cow::Owned(string)) + } + + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + let chunks = self.get_or_init_chunks(); + chunks.iter().for_each(|chunk| on_chunk(chunk)); } fn buffer(&self) -> Cow<[u8]> { @@ -91,7 +117,12 @@ impl Source for CachedSource { } fn size(&self) -> usize { - *self.cache.size.get_or_init(|| self.inner.size()) + *self.cache.size.get_or_init(|| { + if let Some(chunks) = self.cache.chunks.get() { + return chunks.iter().fold(0, |acc, chunk| acc + chunk.len()); + } + self.inner.size() + }) } fn map( @@ -114,10 +145,6 @@ impl Source for CachedSource { } } - fn write_to_string(&self, string: &mut String) { - self.inner.write_to_string(string); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { self.inner.to_writer(writer) } @@ -126,17 +153,17 @@ impl Source for CachedSource { struct CachedSourceChunks<'source> { chunks: Box, cache: Arc, - inner: &'source dyn Source, - source: OnceCell>, + source: Cow<'source, str>, } impl<'a> CachedSourceChunks<'a> { fn new(cache_source: &'a CachedSource) -> Self { + let source = cache_source.source().into_string_lossy(); + Self { chunks: cache_source.inner.stream_chunks(), cache: cache_source.cache.clone(), - inner: &cache_source.inner, - source: OnceCell::new(), + source, } } } @@ -157,14 +184,11 @@ impl Chunks for CachedSourceChunks<'_> { }; match cell.get() { Some(map) => { - let source = self - .source - .get_or_init(|| self.inner.source().into_string_lossy()); if let Some(map) = map { stream_chunks_of_source_map( options, object_pool, - source.as_ref(), + self.source.as_ref(), map, on_chunk, on_source, @@ -172,7 +196,7 @@ impl Chunks for CachedSourceChunks<'_> { ) } else { stream_chunks_of_raw_source( - source.as_ref(), + self.source.as_ref(), options, on_chunk, on_source, diff --git a/src/concat_source.rs b/src/concat_source.rs index bb1eb7c0..e3a34db5 100644 --- a/src/concat_source.rs +++ b/src/concat_source.rs @@ -165,14 +165,24 @@ impl Source for ConcatSource { fn source(&self) -> SourceValue { let children = self.optimized_children(); if children.len() == 1 { - children[0].source() - } else { - // Use to_writer to avoid multiple heap allocations that would occur - // when concatenating nested ConcatSource instances directly - let mut string = String::with_capacity(self.size()); - self.write_to_string(&mut string); - SourceValue::String(Cow::Owned(string)) + return children[0].source(); } + + let mut string = String::with_capacity(self.size()); + let mut on_chunk = |chunk| { + string.push_str(chunk); + }; + children.iter().for_each(|child| { + child.rope(&mut on_chunk); + }); + SourceValue::String(Cow::Owned(string)) + } + + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + let children = self.optimized_children(); + children.iter().for_each(|child| { + child.rope(on_chunk); + }); } fn buffer(&self) -> Cow<[u8]> { @@ -206,12 +216,6 @@ impl Source for ConcatSource { result } - fn write_to_string(&self, string: &mut String) { - for child in self.optimized_children() { - child.write_to_string(string); - } - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { for child in self.optimized_children() { child.to_writer(writer)?; diff --git a/src/helpers.rs b/src/helpers.rs index 8cbd2c13..cd45bfb7 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -460,10 +460,9 @@ fn stream_chunks_of_source_map_full<'a>( on_source: OnSource<'_, 'a>, on_name: OnName<'_, 'a>, ) -> GeneratedInfo { - let a = split_into_lines(source); - let lines: Vec> = a + let lines = split_into_lines(source) .map(|line| WithUtf16::new(object_pool, line)) - .collect::>(); + .collect::>>(); if lines.is_empty() { return GeneratedInfo { diff --git a/src/original_source.rs b/src/original_source.rs index 3824fc37..5aebdef5 100644 --- a/src/original_source.rs +++ b/src/original_source.rs @@ -56,6 +56,10 @@ impl Source for OriginalSource { SourceValue::String(Cow::Borrowed(&self.value)) } + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + on_chunk(self.value.as_ref()) + } + fn buffer(&self) -> Cow<[u8]> { Cow::Borrowed(self.value.as_bytes()) } @@ -73,10 +77,6 @@ impl Source for OriginalSource { get_map(object_pool, chunks.as_ref(), options) } - fn write_to_string(&self, string: &mut String) { - string.push_str(self.value.as_ref()); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { writer.write_all(self.value.as_bytes()) } diff --git a/src/raw_source.rs b/src/raw_source.rs index 3bc5abfd..3b82f69f 100644 --- a/src/raw_source.rs +++ b/src/raw_source.rs @@ -64,6 +64,10 @@ impl Source for RawStringSource { SourceValue::String(Cow::Borrowed(&self.0)) } + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + on_chunk(self.0.as_ref()) + } + fn buffer(&self) -> Cow<[u8]> { Cow::Borrowed(self.0.as_bytes()) } @@ -76,10 +80,6 @@ impl Source for RawStringSource { None } - fn write_to_string(&self, string: &mut String) { - string.push_str(self.0.as_ref()); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { writer.write_all(self.0.as_bytes()) } @@ -210,6 +210,10 @@ impl Source for RawBufferSource { SourceValue::Buffer(Cow::Borrowed(&self.value)) } + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + on_chunk(self.get_or_init_value_as_string()) + } + fn buffer(&self) -> Cow<[u8]> { Cow::Borrowed(&self.value) } @@ -222,10 +226,6 @@ impl Source for RawBufferSource { None } - fn write_to_string(&self, string: &mut String) { - string.push_str(self.get_or_init_value_as_string()); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { writer.write_all(&self.value) } diff --git a/src/replace_source.rs b/src/replace_source.rs index 2a196541..2ba91436 100644 --- a/src/replace_source.rs +++ b/src/replace_source.rs @@ -161,34 +161,115 @@ impl ReplaceSource { impl Source for ReplaceSource { fn source(&self) -> SourceValue { - let inner_source_code = self.inner.source().into_string_lossy(); + if self.replacements.is_empty() { + return self.inner.source(); + } + + let mut string = String::with_capacity(self.size()); + self.rope(&mut |chunk| { + string.push_str(chunk); + }); + SourceValue::String(Cow::Owned(string)) + } - // mut_string_push_str is faster that vec join - // concatenate strings benchmark, see https://github.com/hoodie/concatenation_benchmarks-rs + #[allow(unsafe_code)] + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { if self.replacements.is_empty() { - return SourceValue::String(inner_source_code); + return self.inner.rope(on_chunk); } - let capacity = self.size(); - let mut source_code = String::with_capacity(capacity); - let mut inner_pos = 0; - for replacement in &self.replacements { - if inner_pos < replacement.start { - let end_pos = (replacement.start as usize).min(inner_source_code.len()); - source_code.push_str(&inner_source_code[inner_pos as usize..end_pos]); + + let mut pos: usize = 0; + let mut replacement_idx: usize = 0; + let mut replacement_end: Option = None; + let mut next_replacement: Option = self + .replacements + .get(replacement_idx) + .map(|repl| repl.start as usize); + + self.inner.rope(&mut |chunk| { + let mut chunk_pos = 0; + let end_pos = pos + chunk.len(); + + // Skip over when it has been replaced + if let Some(replacement_end) = + replacement_end.filter(|replacement_end| *replacement_end > pos) + { + // Skip over the whole chunk + if replacement_end >= end_pos { + pos = end_pos; + return; + } + // Partially skip over chunk + chunk_pos = replacement_end - pos; + pos += chunk_pos; } - source_code.push_str(&replacement.content); - #[allow(clippy::manual_clamp)] + + // Is a replacement in the chunk? + while let Some(next_replacement_pos) = next_replacement + .filter(|next_replacement_pos| *next_replacement_pos < end_pos) { - inner_pos = inner_pos - .max(replacement.end) - .min(inner_source_code.len() as u32); + if next_replacement_pos > pos { + // Emit chunk until replacement + let offset = next_replacement_pos - pos; + let chunk_slice = + unsafe { chunk.get_unchecked(chunk_pos..(chunk_pos + offset)) }; + on_chunk(chunk_slice); + chunk_pos += offset; + pos = next_replacement_pos; + } + // Insert replacement content split into chunks by lines + let replacement = + unsafe { self.replacements.get_unchecked(replacement_idx) }; + on_chunk(&replacement.content); + + // Remove replaced content by settings this variable + replacement_end = if let Some(replacement_end) = replacement_end { + Some(replacement_end.max(replacement.end as usize)) + } else { + Some(replacement.end as usize) + }; + + // Move to next replacement + replacement_idx += 1; + next_replacement = self + .replacements + .get(replacement_idx) + .map(|repl| repl.start as usize); + + // Skip over when it has been replaced + let offset = chunk.len() as i64 - end_pos as i64 + + replacement_end.unwrap() as i64 + - chunk_pos as i64; + if offset > 0 { + // Skip over whole chunk + if replacement_end + .is_some_and(|replacement_end| replacement_end >= end_pos) + { + pos = end_pos; + return; + } + + // Partially skip over chunk + chunk_pos += offset as usize; + pos += offset as usize; + } } - } - source_code.push_str( - &inner_source_code[inner_pos as usize..inner_source_code.len()], - ); - SourceValue::String(Cow::Owned(source_code)) + // Emit remaining chunk + if chunk_pos < chunk.len() { + on_chunk(unsafe { chunk.get_unchecked(chunk_pos..) }); + } + pos = end_pos; + }); + + // Handle remaining replacements one by one + while replacement_idx < self.replacements.len() { + let replacement = + unsafe { self.replacements.get_unchecked(replacement_idx) }; + let content = &replacement.content; + on_chunk(content); + replacement_idx += 1; + } } fn buffer(&self) -> Cow<[u8]> { @@ -211,6 +292,10 @@ impl Source for ReplaceSource { if inner_pos < replacement.start { // This content is already counted in inner_source_size, so no change needed } + if replacement.start as usize >= inner_source_size { + size += replacement.content.len(); + continue; + } // Handle the replacement itself let original_length = replacement @@ -244,12 +329,15 @@ impl Source for ReplaceSource { get_map(&ObjectPool::default(), chunks.as_ref(), options) } - fn write_to_string(&self, string: &mut String) { - string.push_str(&self.source().into_string_lossy()); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { - writer.write_all(self.source().as_bytes()) + let mut result = Ok(()); + self.rope(&mut |chunk| { + if result.is_err() { + return; + } + result = writer.write_all(chunk.as_bytes()); + }); + result } } @@ -1421,6 +1509,7 @@ return
{data.foo}
None, ReplacementEnforce::Post, ); + source.replace(10000000, 20000000, "// end line", None); assert_eq!(source.size(), source.source().into_string_lossy().len()); } diff --git a/src/source.rs b/src/source.rs index c4ab26ff..00391196 100644 --- a/src/source.rs +++ b/src/source.rs @@ -114,6 +114,9 @@ pub trait Source: /// Get the source code. fn source(&self) -> SourceValue; + /// Return a lightweight "rope" view of the source as borrowed string slices. + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)); + /// Get the source buffer. fn buffer(&self) -> Cow<[u8]>; @@ -132,31 +135,32 @@ pub trait Source: self.dyn_hash(state); } - /// Appends the source content to the provided string buffer. - /// - /// This method efficiently writes the source content directly into an existing - /// string buffer, avoiding additional memory allocations when the buffer has - /// sufficient capacity. This is particularly useful for concatenating multiple - /// sources or building larger strings incrementally. - fn write_to_string(&self, string: &mut String); - /// Writes the source into a writer, preferably a `std::io::BufWriter`. fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()>; } impl Source for BoxSource { + #[inline] fn source(&self) -> SourceValue { self.as_ref().source() } + #[inline] + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + self.as_ref().rope(on_chunk) + } + + #[inline] fn buffer(&self) -> Cow<[u8]> { self.as_ref().buffer() } + #[inline] fn size(&self) -> usize { self.as_ref().size() } + #[inline] fn map( &self, object_pool: &ObjectPool, @@ -165,10 +169,7 @@ impl Source for BoxSource { self.as_ref().map(object_pool, options) } - fn write_to_string(&self, string: &mut String) { - self.as_ref().write_to_string(string) - } - + #[inline] fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { self.as_ref().to_writer(writer) } diff --git a/src/source_map_source.rs b/src/source_map_source.rs index 5982eabd..0be2723a 100644 --- a/src/source_map_source.rs +++ b/src/source_map_source.rs @@ -94,6 +94,10 @@ impl Source for SourceMapSource { SourceValue::String(Cow::Borrowed(&self.value)) } + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + on_chunk(&self.value) + } + fn buffer(&self) -> Cow<[u8]> { Cow::Borrowed(self.value.as_bytes()) } @@ -114,10 +118,6 @@ impl Source for SourceMapSource { get_map(object_pool, chunks.as_ref(), options) } - fn write_to_string(&self, string: &mut String) { - string.push_str(self.value.as_ref()); - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { writer.write_all(self.value.as_bytes()) } diff --git a/src/with_utf16.rs b/src/with_utf16.rs index f874de4e..74c58319 100644 --- a/src/with_utf16.rs +++ b/src/with_utf16.rs @@ -7,7 +7,7 @@ pub struct WithUtf16<'object_pool, 'text> { /// line is a string reference pub line: &'text str, /// the byte position of each `char` in `line` string slice . - pub utf16_byte_indices: OnceCell>, + pub utf16_byte_indices: OnceCell>>, object_pool: &'object_pool ObjectPool, } @@ -21,31 +21,63 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> { } /// substring::SubString with cache - pub fn substring(&self, start_index: usize, end_index: usize) -> &'text str { - if end_index <= start_index { + #[allow(unsafe_code)] + pub fn substring( + &self, + start_utf16_index: usize, + end_utf16_index: usize, + ) -> &'text str { + if end_utf16_index <= start_utf16_index { return ""; } let utf16_byte_indices = self.utf16_byte_indices.get_or_init(|| { + if self.line.is_ascii() { + return None; + } + let mut vec = self.object_pool.pull(self.line.len()); - for (byte_index, ch) in self.line.char_indices() { - match ch.len_utf16() { - 1 => vec.push(byte_index), - 2 => { - vec.push(byte_index); - vec.push(byte_index); - } - _ => unreachable!(), + let bytes = self.line.as_bytes(); + let mut byte_pos = 0; + while byte_pos < bytes.len() { + let byte = unsafe { *bytes.get_unchecked(byte_pos) }; + if byte < 0x80 { + // ASCII: 1 byte = 1 UTF-16 unit + vec.push(byte_pos); + byte_pos += 1; + } else if byte < 0xE0 { + // 2-byte UTF-8 = 1 UTF-16 unit + vec.push(byte_pos); + byte_pos += 2; + } else if byte < 0xF0 { + // 3-byte UTF-8 = 1 UTF-16 unit + vec.push(byte_pos); + byte_pos += 3; + } else { + // 4-byte UTF-8 = 2 UTF-16 units (surrogate pair) + vec.push(byte_pos); + vec.push(byte_pos); + byte_pos += 4; } } - vec + Some(vec) }); - let str_len = self.line.len(); - let start = *utf16_byte_indices.get(start_index).unwrap_or(&str_len); - let end = *utf16_byte_indices.get(end_index).unwrap_or(&str_len); + let utf8_len = self.line.len(); + + let Some(utf16_byte_indices) = utf16_byte_indices else { + let start_utf16_index = start_utf16_index.min(utf8_len); + let end_utf16_index = end_utf16_index.min(utf8_len); + return unsafe { + self.line.get_unchecked(start_utf16_index..end_utf16_index) + }; + }; + + let start = *utf16_byte_indices + .get(start_utf16_index) + .unwrap_or(&utf8_len); + let end = *utf16_byte_indices.get(end_utf16_index).unwrap_or(&utf8_len); - #[allow(unsafe_code)] unsafe { // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee // that the indices obtained from it will always be within the bounds of `self` and they diff --git a/tests/compat_source.rs b/tests/compat_source.rs index 8da41a8a..8048de76 100644 --- a/tests/compat_source.rs +++ b/tests/compat_source.rs @@ -19,6 +19,10 @@ impl Source for CompatSource { SourceValue::String(Cow::Borrowed(self.0)) } + fn rope<'a>(&'a self, on_chunk: &mut dyn FnMut(&'a str)) { + on_chunk(self.0) + } + fn buffer(&self) -> Cow<[u8]> { Cow::Borrowed(self.0.as_bytes()) } @@ -35,10 +39,6 @@ impl Source for CompatSource { self.1.clone() } - fn write_to_string(&self, string: &mut String) { - string.push_str(self.0.as_ref()) - } - fn to_writer(&self, writer: &mut dyn std::io::Write) -> std::io::Result<()> { writer.write_all(self.0.as_bytes()) }