revert: substring heuristic algorithm (#155)

SyMind · web-flow · commit 51c8677216bb · 2025-01-22T18:24:51.000+08:00
diff --git a/src/helpers.rs b/src/helpers.rs
@@ -1,7 +1,6 @@
 use std::{
   borrow::{BorrowMut, Cow},
   cell::{OnceCell, RefCell},
-  fmt::Debug,
   marker::PhantomData,
   ops::Range,
 };
@@ -1244,7 +1243,7 @@ pub fn stream_and_get_source_and_map<'a, S: StreamChunks>(
 }
 
 /// Represents a text source that can be manipulated for source mapping purposes.
-pub trait SourceText<'a>: Default + Clone + ToString + Debug {
+pub trait SourceText<'a>: Default + Clone + ToString {
   /// Splits the text into lines, returning an iterator over each line.
   /// Each line includes its line ending character if present.
   fn split_into_lines(&self) -> impl Iterator<Item = Self>;
@@ -1253,7 +1252,7 @@ pub trait SourceText<'a>: Default + Clone + ToString + Debug {
   fn ends_with(&self, value: &str) -> bool;
 
   /// Returns an iterator over the char indices in the text.
-  fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)>;
+  fn char_indices(&self) -> impl Iterator<Item = (usize, char)>;
 
   /// Gets the byte at the specified index, if it exists.
   fn get_byte(&self, byte_index: usize) -> Option<u8>;
@@ -1290,7 +1289,7 @@ impl<'a> SourceText<'a> for Rope<'a> {
     (*self).ends_with(value)
   }
 
-  fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
+  fn char_indices(&self) -> impl Iterator<Item = (usize, char)> {
     self.char_indices()
   }
 
@@ -1332,7 +1331,7 @@ impl<'a> SourceText<'a> for &'a str {
     (*self).ends_with(value)
   }
 
-  fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
+  fn char_indices(&self) -> impl Iterator<Item = (usize, char)> {
     (*self).char_indices()
   }
 
diff --git a/src/rope.rs b/src/rope.rs
@@ -2,6 +2,7 @@
 
 use std::{
   borrow::Cow,
+  collections::VecDeque,
   hash::Hash,
   ops::{Bound, RangeBounds},
   rc::Rc,
@@ -133,24 +134,13 @@ impl<'a> Rope<'a> {
           iter: s.char_indices(),
         },
       },
-      Repr::Full(vec) => {
-        let right_byte_offset = vec.iter().map(|(s, _)| s.len() as u32).sum();
-
-        CharIndices {
-          iter: CharIndicesEnum::Full {
-            iters: vec
-              .iter()
-              .map(|(s, _)| s.char_indices())
-              .collect::<Vec<_>>(),
-            left_chunk_index: 0,
-            left_byte_offset: 0,
-            last_left_indice: None,
-            right_chunk_index: (vec.len() - 1) as u32,
-            right_byte_offset,
-            right_byte_offset_for: vec.len() as u32,
-          },
-        }
-      }
+      Repr::Full(data) => CharIndices {
+        iter: CharIndicesEnum::Full {
+          chunks: data,
+          char_indices: VecDeque::new(),
+          chunk_index: 0,
+        },
+      },
     }
   }
 
@@ -668,13 +658,9 @@ enum CharIndicesEnum<'a, 'b> {
     iter: std::str::CharIndices<'b>,
   },
   Full {
-    iters: Vec<std::str::CharIndices<'a>>,
-    left_chunk_index: u32,
-    left_byte_offset: u32,
-    last_left_indice: Option<(usize, char)>,
-    right_chunk_index: u32,
-    right_byte_offset: u32,
-    right_byte_offset_for: u32,
+    chunks: &'a [(&'b str, usize)],
+    char_indices: VecDeque<(usize, char)>,
+    chunk_index: usize,
   },
 }
 
@@ -689,59 +675,29 @@ impl Iterator for CharIndices<'_, '_> {
     match &mut self.iter {
       CharIndicesEnum::Light { iter } => iter.next(),
       CharIndicesEnum::Full {
-        iters,
-        left_chunk_index,
-        left_byte_offset,
-        last_left_indice,
-        ..
+        chunks,
+        char_indices,
+        chunk_index,
       } => {
-        if (*left_chunk_index as usize) >= iters.len() {
-          return None;
+        if let Some(item) = char_indices.pop_front() {
+          return Some(item);
         }
-        if let Some((byte_index, char)) =
-          iters[*left_chunk_index as usize].next()
-        {
-          *last_left_indice = Some((byte_index, char));
-          Some((byte_index + (*left_byte_offset as usize), char))
-        } else {
-          *left_chunk_index += 1;
-          if let Some((byte_index, char)) = last_left_indice.take() {
-            *left_byte_offset =
-              *left_byte_offset + byte_index as u32 + char.len_utf8() as u32;
-          }
-          self.next()
+
+        if *chunk_index >= chunks.len() {
+          return None;
         }
-      }
-    }
-  }
-}
 
-impl DoubleEndedIterator for CharIndices<'_, '_> {
-  fn next_back(&mut self) -> Option<Self::Item> {
-    match &mut self.iter {
-      CharIndicesEnum::Light { iter } => iter.next_back(),
-      CharIndicesEnum::Full {
-        iters,
-        right_chunk_index,
-        right_byte_offset,
-        right_byte_offset_for,
-        ..
-      } => {
-        if let Some((byte_index, char)) =
-          iters[*right_chunk_index as usize].next_back()
-        {
-          if *right_byte_offset_for != *right_chunk_index {
-            *right_byte_offset =
-              *right_byte_offset - byte_index as u32 - char.len_utf8() as u32;
-            *right_byte_offset_for = *right_chunk_index;
-          }
-          Some((byte_index + (*right_byte_offset as usize), char))
-        } else if *right_chunk_index > 0 {
-          *right_chunk_index -= 1;
-          self.next_back()
-        } else {
-          None
+        // skip empty chunks
+        while *chunk_index < chunks.len() && chunks[*chunk_index].0.is_empty() {
+          *chunk_index += 1;
         }
+
+        let (chunk, start_pos) = chunks[*chunk_index];
+
+        char_indices
+          .extend(chunk.char_indices().map(|(i, c)| (start_pos + i, c)));
+        *chunk_index += 1;
+        char_indices.pop_front()
       }
     }
   }
@@ -1212,29 +1168,6 @@ mod tests {
     );
   }
 
-  #[test]
-  fn reverse_char_indices() {
-    let mut a = Rope::new();
-    a.add("abc");
-    a.add("def");
-    assert_eq!(
-      a.char_indices().rev().collect::<Vec<_>>(),
-      "abcdef".char_indices().rev().collect::<Vec<_>>()
-    );
-
-    let mut a = Rope::new();
-    a.add("こんにちは");
-    assert_eq!(
-      a.char_indices().rev().collect::<Vec<_>>(),
-      "こんにちは".char_indices().rev().collect::<Vec<_>>()
-    );
-    a.add("世界");
-    assert_eq!(
-      a.char_indices().rev().collect::<Vec<_>>(),
-      "こんにちは世界".char_indices().rev().collect::<Vec<_>>()
-    );
-  }
-
   #[test]
   fn lines1() {
     let rope = Rope::from("abc");
diff --git a/src/with_indices.rs b/src/with_indices.rs
@@ -1,4 +1,4 @@
-use std::{cell::Cell, marker::PhantomData};
+use std::{cell::OnceCell, marker::PhantomData};
 
 use crate::helpers::SourceText;
 
@@ -9,7 +9,8 @@ where
 {
   /// line is a string reference
   pub line: S,
-  last_char_index_to_byte_index: Cell<(u32, u32)>,
+  /// Byte offset of each `char` in `line`, indexed by char position; filled lazily on first use.
+  pub indices_indexes: OnceCell<Vec<usize>>,
   data: PhantomData<&'a S>,
 }
 
@@ -19,102 +20,32 @@ where
 {
   pub fn new(line: S) -> Self {
     Self {
+      indices_indexes: OnceCell::new(),
       line,
-      last_char_index_to_byte_index: Cell::new((0, 0)),
       data: PhantomData,
     }
   }
 
-  pub(crate) fn substring(
-    &self,
-    start_char_index: usize,
-    end_char_index: usize,
-  ) -> S {
-    if end_char_index <= start_char_index {
+  /// Char-indexed substring `[start_index, end_index)`; char-to-byte offsets are cached on first call.
+  pub(crate) fn substring(&self, start_index: usize, end_index: usize) -> S {
+    if end_index <= start_index {
       return S::default();
     }
 
-    let line_len = self.line.len();
-    let mut start_byte_index =
-      if start_char_index == 0 { Some(0) } else { None };
-    let mut end_byte_index = if end_char_index == usize::MAX {
-      Some(line_len)
-    } else {
-      None
-    };
+    let indices_indexes = self.indices_indexes.get_or_init(|| {
+      self.line.char_indices().map(|(i, _)| i).collect::<Vec<_>>()
+    });
 
-    if start_byte_index.is_some() && end_byte_index.is_some() {
-      return self.line.clone();
-    }
-
-    let (last_char_index, last_byte_index) =
-      self.last_char_index_to_byte_index.get();
-    let byte_index = last_byte_index as usize;
-
-    if start_char_index >= last_char_index as usize
-      || end_char_index >= last_char_index as usize
-    {
-      let mut char_index = last_char_index as usize;
-      #[allow(unsafe_code)]
-      let slice = unsafe {
-        // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
-        // that the indices obtained from it will always be within the bounds of `self` and they
-        // will always lie on UTF-8 sequence boundaries.
-        self.line.byte_slice_unchecked(byte_index..line_len)
-      };
-      for (byte_offset, _) in slice.char_indices() {
-        if char_index == start_char_index {
-          start_byte_index = Some(byte_index + byte_offset);
-          if end_byte_index.is_some() {
-            break;
-          }
-        } else if char_index == end_char_index {
-          end_byte_index = Some(byte_index + byte_offset);
-          self
-            .last_char_index_to_byte_index
-            .set((char_index as u32, (byte_index + byte_offset) as u32));
-          break;
-        }
-        char_index += 1;
-      }
-    }
-
-    if start_char_index < last_char_index as usize
-      || end_char_index < last_char_index as usize
-    {
-      let mut char_index = last_char_index as usize;
-      #[allow(unsafe_code)]
-      let slice = unsafe {
-        // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
-        // that the indices obtained from it will always be within the bounds of `self` and they
-        // will always lie on UTF-8 sequence boundaries.
-        self.line.byte_slice_unchecked(0..byte_index)
-      };
-      for (byte_index, char) in slice.char_indices().rev() {
-        if char_index == end_char_index {
-          end_byte_index = Some(byte_index + char.len_utf8());
-          if start_byte_index.is_some() {
-            break;
-          }
-        } else if char_index == start_char_index {
-          start_byte_index = Some(byte_index + char.len_utf8());
-          break;
-        }
-        char_index -= 1;
-      }
-    }
-
-    let start_byte_index = start_byte_index.unwrap_or(line_len);
-    let end_byte_index = end_byte_index.unwrap_or(line_len);
+    let str_len = self.line.len();
+    let start = *indices_indexes.get(start_index).unwrap_or(&str_len);
+    let end = *indices_indexes.get(end_index).unwrap_or(&str_len);
 
     #[allow(unsafe_code)]
     unsafe {
       // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
       // that the indices obtained from it will always be within the bounds of `self` and they
       // will always lie on UTF-8 sequence boundaries.
-      self
-        .line
-        .byte_slice_unchecked(start_byte_index..end_byte_index)
+      self.line.byte_slice_unchecked(start..end)
     }
   }
 }
@@ -159,19 +90,4 @@ mod tests {
       "øbα"
     );
   }
-
-  #[test]
-  fn test_last_char_index_to_byte_index() {
-    let rope_with_indices =
-      WithIndices::new(Rope::from("hello world 你好世界"));
-    assert_eq!(rope_with_indices.substring(10, 13), "d 你");
-    assert_eq!(rope_with_indices.substring(13, 15), "好世");
-    assert_eq!(rope_with_indices.substring(10, 13), "d 你");
-
-    let rope_with_indices =
-      WithIndices::new(Rope::from("export const answer = 42;\n"));
-    assert_eq!(rope_with_indices.substring(7, 13), "const ");
-    assert_eq!(rope_with_indices.substring(13, 19), "answer");
-    assert_eq!(rope_with_indices.substring(7, 22), "const answer = ");
-  }
 }