From 73d26d0d97bfc684266b5c8180d00783d6d84eba Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Wed, 10 Apr 2024 17:14:08 +0200 Subject: [PATCH] don't manually grapheme align ts highlights (#10310) --- helix-core/src/graphemes.rs | 17 ---------- helix-stdx/src/rope.rs | 62 +++++++++++++++++++++++++++++++++++ helix-term/src/ui/document.rs | 23 ++++++++++++- helix-term/src/ui/editor.rs | 22 +++---------- 4 files changed, 88 insertions(+), 36 deletions(-) diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs index 7cb5cd062..a02d6e4dd 100644 --- a/helix-core/src/graphemes.rs +++ b/helix-core/src/graphemes.rs @@ -278,23 +278,6 @@ pub fn ensure_grapheme_boundary_prev(slice: RopeSlice, char_idx: usize) -> usize } } -/// Returns the passed byte index if it's already a grapheme boundary, -/// or the next grapheme boundary byte index if not. -#[must_use] -#[inline] -pub fn ensure_grapheme_boundary_next_byte(slice: RopeSlice, byte_idx: usize) -> usize { - if byte_idx == 0 { - byte_idx - } else { - // TODO: optimize so we're not constructing grapheme cursor twice - if is_grapheme_boundary_byte(slice, byte_idx) { - byte_idx - } else { - next_grapheme_boundary_byte(slice, byte_idx) - } - } -} - /// Returns whether the given char position is a grapheme boundary. #[must_use] pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool { diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index 7e2549f5a..2695555e3 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -3,6 +3,7 @@ pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex}; pub use regex_cursor::regex_automata::util::syntax::Config; use regex_cursor::{Input as RegexInput, RopeyCursor}; +use ropey::str_utils::byte_to_char_idx; use ropey::RopeSlice; pub trait RopeSliceExt<'a>: Sized { @@ -16,6 +17,23 @@ fn regex_input_at_bytes>( fn regex_input_at>(self, char_range: R) -> RegexInput>; fn first_non_whitespace_char(self) -> Option; fn last_non_whitespace_char(self) -> Option; + /// returns the char idx of `byte_idx`, if `byte_idx` is a char boundary + /// this function behaves the same as `byte_to_char` but if `byte_idx` is + /// not a valid char boundary (so within a char) this will return the next + /// char index. + /// + /// # Example + /// + /// ``` + /// # use ropey::RopeSlice; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = RopeSlice::from("😆"); + /// for i in 1..text.len_bytes() { + /// assert_eq!(text.byte_to_char(i), 0); + /// assert_eq!(text.byte_to_next_char(i), 1); + /// } + /// ``` + fn byte_to_next_char(self, byte_idx: usize) -> usize; } impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { @@ -75,4 +93,48 @@ fn last_non_whitespace_char(self) -> Option { .position(|ch| !ch.is_whitespace()) .map(|pos| self.len_chars() - pos - 1) } + + /// returns the char idx of `byte_idx`, if `byte_idx` is + /// a char boundary this function behaves the same as `byte_to_char` + fn byte_to_next_char(self, mut byte_idx: usize) -> usize { + let (chunk, chunk_byte_off, chunk_char_off, _) = self.chunk_at_byte(byte_idx); + byte_idx -= chunk_byte_off; + let is_char_boundary = + is_utf8_char_boundary(chunk.as_bytes().get(byte_idx).copied().unwrap_or(0)); + chunk_char_off + byte_to_char_idx(chunk, byte_idx) + !is_char_boundary as usize + } +} + +// copied from std +#[inline] +const fn is_utf8_char_boundary(b: u8) -> bool { + // This is bit magic equivalent to: b < 128 || b >= 192 + (b as i8) >= -0x40 +} + +#[cfg(test)] +mod tests { + use ropey::RopeSlice; + + use crate::rope::RopeSliceExt; + + #[test] + fn next_char_at_byte() { + for i in 0..=6 { + assert_eq!(RopeSlice::from("foobar").byte_to_next_char(i), i); + } + for char_idx in 0..10 { + let len = "😆".len(); + assert_eq!( + RopeSlice::from("😆😆😆😆😆😆😆😆😆😆").byte_to_next_char(char_idx * len), + char_idx + ); + for i in 1..=len { + assert_eq!( + RopeSlice::from("😆😆😆😆😆😆😆😆😆😆").byte_to_next_char(char_idx * len + i), + char_idx + 1 + ); + } + } + } } diff --git a/helix-term/src/ui/document.rs b/helix-term/src/ui/document.rs index b571b83c2..bcbaa3519 100644 --- a/helix-term/src/ui/document.rs +++ b/helix-term/src/ui/document.rs @@ -7,6 +7,7 @@ use helix_core::syntax::HighlightEvent; use helix_core::text_annotations::TextAnnotations; use helix_core::{visual_offset_from_block, Position, RopeSlice}; +use helix_stdx::rope::RopeSliceExt; use helix_view::editor::{WhitespaceConfig, WhitespaceRenderValue}; use helix_view::graphics::Rect; use helix_view::theme::Style; @@ -32,14 +33,27 @@ fn render_background(&mut self, renderer: &mut TextRenderer, pos: LinePos) { } } +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +enum StyleIterKind { + /// base highlights (usually emitted by TS), byte indices (potentially not codepoint aligned) + BaseHighlights, + /// overlay highlights (emitted by custom code from selections), char indices + Overlay, +} + /// A wrapper around a HighlightIterator /// that merges the layered highlights to create the final text style /// and yields the active text style and the char_idx where the active /// style will have to be recomputed. +/// +/// TODO(ropey2): hopefully one day helix and ropey will operate entirely +/// on byte ranges and we can remove this struct StyleIter<'a, H: Iterator> { text_style: Style, active_highlights: Vec, highlight_iter: H, + kind: StyleIterKind, + text: RopeSlice<'a>, theme: &'a Theme, } @@ -54,7 +68,7 @@ fn next(&mut self) -> Option<(Style, usize)> { HighlightEvent::HighlightEnd => { self.active_highlights.pop(); } - HighlightEvent::Source { start, end } => { + HighlightEvent::Source { start, mut end } => { if start == end { continue; } @@ -64,6 +78,9 @@ fn next(&mut self) -> Option<(Style, usize)> { .fold(self.text_style, |acc, span| { acc.patch(self.theme.highlight(span.0)) }); + if self.kind == StyleIterKind::BaseHighlights { + end = self.text.byte_to_next_char(end); + } return Some((style, end)); } } @@ -185,13 +202,17 @@ pub fn render_text<'t>( text_style: renderer.text_style, active_highlights: Vec::with_capacity(64), highlight_iter: syntax_highlight_iter, + kind: StyleIterKind::BaseHighlights, theme, + text, }; let mut overlay_styles = StyleIter { text_style: Style::default(), active_highlights: Vec::with_capacity(64), highlight_iter: overlay_highlight_iter, + kind: StyleIterKind::Overlay, theme, + text, }; let mut last_line_pos = LinePos { diff --git a/helix-term/src/ui/editor.rs b/helix-term/src/ui/editor.rs index 2aac3d335..7b130a38a 100644 --- a/helix-term/src/ui/editor.rs +++ b/helix-term/src/ui/editor.rs @@ -12,9 +12,7 @@ use helix_core::{ diagnostic::NumberOrString, - graphemes::{ - ensure_grapheme_boundary_next_byte, next_grapheme_boundary, prev_grapheme_boundary, - }, + graphemes::{next_grapheme_boundary, prev_grapheme_boundary}, movement::Direction, syntax::{self, HighlightEvent}, text_annotations::TextAnnotations, @@ -315,26 +313,14 @@ pub fn doc_syntax_highlights<'doc>( let iter = syntax // TODO: range doesn't actually restrict source, just highlight range .highlight_iter(text.slice(..), Some(range), None) - .map(|event| event.unwrap()) - .map(move |event| match event { - // TODO: use byte slices directly - // convert byte offsets to char offset - HighlightEvent::Source { start, end } => { - let start = - text.byte_to_char(ensure_grapheme_boundary_next_byte(text, start)); - let end = - text.byte_to_char(ensure_grapheme_boundary_next_byte(text, end)); - HighlightEvent::Source { start, end } - } - event => event, - }); + .map(|event| event.unwrap()); Box::new(iter) } None => Box::new( [HighlightEvent::Source { - start: text.byte_to_char(range.start), - end: text.byte_to_char(range.end), + start: range.start, + end: range.end, }] .into_iter(), ),