From 73d26d0d97bfc684266b5c8180d00783d6d84eba Mon Sep 17 00:00:00 2001
From: Pascal Kuthe <pascal.kuthe@semimod.de>
Date: Wed, 10 Apr 2024 17:14:08 +0200
Subject: [PATCH] don't manually grapheme align ts highlights (#10310)

---
 helix-core/src/graphemes.rs   | 17 ----------
 helix-stdx/src/rope.rs        | 62 +++++++++++++++++++++++++++++++++++
 helix-term/src/ui/document.rs | 23 ++++++++++++-
 helix-term/src/ui/editor.rs   | 22 +++----------
 4 files changed, 88 insertions(+), 36 deletions(-)
diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs
index 7cb5cd062..a02d6e4dd 100644
--- a/helix-core/src/graphemes.rs
+++ b/helix-core/src/graphemes.rs
@@ -278,23 +278,6 @@ pub fn ensure_grapheme_boundary_prev(slice: RopeSlice, char_idx: usize) -> usize
     }
 }
 
-/// Returns the passed byte index if it's already a grapheme boundary,
-/// or the next grapheme boundary byte index if not.
-#[must_use]
-#[inline]
-pub fn ensure_grapheme_boundary_next_byte(slice: RopeSlice, byte_idx: usize) -> usize {
-    if byte_idx == 0 {
-        byte_idx
-    } else {
-        // TODO: optimize so we're not constructing grapheme cursor twice
-        if is_grapheme_boundary_byte(slice, byte_idx) {
-            byte_idx
-        } else {
-            next_grapheme_boundary_byte(slice, byte_idx)
-        }
-    }
-}
-
 /// Returns whether the given char position is a grapheme boundary.
 #[must_use]
 pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool {
diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs
index 7e2549f5a..2695555e3 100644
--- a/helix-stdx/src/rope.rs
+++ b/helix-stdx/src/rope.rs
@@ -3,6 +3,7 @@
 pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
 pub use regex_cursor::regex_automata::util::syntax::Config;
 use regex_cursor::{Input as RegexInput, RopeyCursor};
+use ropey::str_utils::byte_to_char_idx;
 use ropey::RopeSlice;
 
 pub trait RopeSliceExt<'a>: Sized {
@@ -16,6 +17,23 @@ fn regex_input_at_bytes<R: RangeBounds<usize>>(
     fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
     fn first_non_whitespace_char(self) -> Option<usize>;
     fn last_non_whitespace_char(self) -> Option<usize>;
+    /// returns the char idx of `byte_idx`, if `byte_idx` is a char boundary
+    /// this function behaves the same as `byte_to_char` but if `byte_idx` is
+    /// not a valid char boundary (so within a char) this will return the next
+    /// char index.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use ropey::RopeSlice;
+    /// # use helix_stdx::rope::RopeSliceExt;
+    /// let text = RopeSlice::from("😆");
+    /// for i in 1..text.len_bytes() {
+    ///     assert_eq!(text.byte_to_char(i), 0);
+    ///     assert_eq!(text.byte_to_next_char(i), 1);
+    /// }
+    /// ```
+    fn byte_to_next_char(self, byte_idx: usize) -> usize;
 }
 
 impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
@@ -75,4 +93,48 @@ fn last_non_whitespace_char(self) -> Option<usize> {
             .position(|ch| !ch.is_whitespace())
             .map(|pos| self.len_chars() - pos - 1)
     }
+
+    /// returns the char idx of `byte_idx`, if `byte_idx` is
+    /// a char boundary this function behaves the same as `byte_to_char`
+    fn byte_to_next_char(self, mut byte_idx: usize) -> usize {
+        let (chunk, chunk_byte_off, chunk_char_off, _) = self.chunk_at_byte(byte_idx);
+        byte_idx -= chunk_byte_off;
+        let is_char_boundary =
+            is_utf8_char_boundary(chunk.as_bytes().get(byte_idx).copied().unwrap_or(0));
+        chunk_char_off + byte_to_char_idx(chunk, byte_idx) + !is_char_boundary as usize
+    }
+}
+
+// copied from std
+#[inline]
+const fn is_utf8_char_boundary(b: u8) -> bool {
+    // This is bit magic equivalent to: b < 128 || b >= 192
+    (b as i8) >= -0x40
+}
+
+#[cfg(test)]
+mod tests {
+    use ropey::RopeSlice;
+
+    use crate::rope::RopeSliceExt;
+
+    #[test]
+    fn next_char_at_byte() {
+        for i in 0..=6 {
+            assert_eq!(RopeSlice::from("foobar").byte_to_next_char(i), i);
+        }
+        for char_idx in 0..10 {
+            let len = "😆".len();
+            assert_eq!(
+                RopeSlice::from("😆😆😆😆😆😆😆😆😆😆").byte_to_next_char(char_idx * len),
+                char_idx
+            );
+            for i in 1..=len {
+                assert_eq!(
+                    RopeSlice::from("😆😆😆😆😆😆😆😆😆😆").byte_to_next_char(char_idx * len + i),
+                    char_idx + 1
+                );
+            }
+        }
+    }
 }
diff --git a/helix-term/src/ui/document.rs b/helix-term/src/ui/document.rs
index b571b83c2..bcbaa3519 100644
--- a/helix-term/src/ui/document.rs
+++ b/helix-term/src/ui/document.rs
@@ -7,6 +7,7 @@
 use helix_core::syntax::HighlightEvent;
 use helix_core::text_annotations::TextAnnotations;
 use helix_core::{visual_offset_from_block, Position, RopeSlice};
+use helix_stdx::rope::RopeSliceExt;
 use helix_view::editor::{WhitespaceConfig, WhitespaceRenderValue};
 use helix_view::graphics::Rect;
 use helix_view::theme::Style;
@@ -32,14 +33,27 @@ fn render_background(&mut self, renderer: &mut TextRenderer, pos: LinePos) {
     }
 }
 
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum StyleIterKind {
+    /// base highlights (usually emitted by TS), byte indices (potentially not codepoint aligned)
+    BaseHighlights,
+    /// overlay highlights (emitted by custom code from selections), char indices
+    Overlay,
+}
+
 /// A wrapper around a HighlightIterator
 /// that merges the layered highlights to create the final text style
 /// and yields the active text style and the char_idx where the active
 /// style will have to be recomputed.
+///
+/// TODO(ropey2): hopefully one day helix and ropey will operate entirely
+/// on byte ranges and we can remove this
 struct StyleIter<'a, H: Iterator<Item = HighlightEvent>> {
     text_style: Style,
     active_highlights: Vec<Highlight>,
     highlight_iter: H,
+    kind: StyleIterKind,
+    text: RopeSlice<'a>,
     theme: &'a Theme,
 }
 
@@ -54,7 +68,7 @@ fn next(&mut self) -> Option<(Style, usize)> {
                 HighlightEvent::HighlightEnd => {
                     self.active_highlights.pop();
                 }
-                HighlightEvent::Source { start, end } => {
+                HighlightEvent::Source { start, mut end } => {
                     if start == end {
                         continue;
                     }
@@ -64,6 +78,9 @@ fn next(&mut self) -> Option<(Style, usize)> {
                         .fold(self.text_style, |acc, span| {
                             acc.patch(self.theme.highlight(span.0))
                         });
+                    if self.kind == StyleIterKind::BaseHighlights {
+                        end = self.text.byte_to_next_char(end);
+                    }
                     return Some((style, end));
                 }
             }
@@ -185,13 +202,17 @@ pub fn render_text<'t>(
         text_style: renderer.text_style,
         active_highlights: Vec::with_capacity(64),
         highlight_iter: syntax_highlight_iter,
+        kind: StyleIterKind::BaseHighlights,
         theme,
+        text,
     };
     let mut overlay_styles = StyleIter {
         text_style: Style::default(),
         active_highlights: Vec::with_capacity(64),
         highlight_iter: overlay_highlight_iter,
+        kind: StyleIterKind::Overlay,
         theme,
+        text,
     };
 
     let mut last_line_pos = LinePos {
diff --git a/helix-term/src/ui/editor.rs b/helix-term/src/ui/editor.rs
index 2aac3d335..7b130a38a 100644
--- a/helix-term/src/ui/editor.rs
+++ b/helix-term/src/ui/editor.rs
@@ -12,9 +12,7 @@
 
 use helix_core::{
     diagnostic::NumberOrString,
-    graphemes::{
-        ensure_grapheme_boundary_next_byte, next_grapheme_boundary, prev_grapheme_boundary,
-    },
+    graphemes::{next_grapheme_boundary, prev_grapheme_boundary},
     movement::Direction,
     syntax::{self, HighlightEvent},
     text_annotations::TextAnnotations,
@@ -315,26 +313,14 @@ pub fn doc_syntax_highlights<'doc>(
                 let iter = syntax
                     // TODO: range doesn't actually restrict source, just highlight range
                     .highlight_iter(text.slice(..), Some(range), None)
-                    .map(|event| event.unwrap())
-                    .map(move |event| match event {
-                        // TODO: use byte slices directly
-                        // convert byte offsets to char offset
-                        HighlightEvent::Source { start, end } => {
-                            let start =
-                                text.byte_to_char(ensure_grapheme_boundary_next_byte(text, start));
-                            let end =
-                                text.byte_to_char(ensure_grapheme_boundary_next_byte(text, end));
-                            HighlightEvent::Source { start, end }
-                        }
-                        event => event,
-                    });
+                    .map(|event| event.unwrap());
 
                 Box::new(iter)
             }
             None => Box::new(
                 [HighlightEvent::Source {
-                    start: text.byte_to_char(range.start),
-                    end: text.byte_to_char(range.end),
+                    start: range.start,
+                    end: range.end,
                 }]
                 .into_iter(),
             ),