From 1047f06de60147b0890d68afb6911d83971fe0f5 Mon Sep 17 00:00:00 2001
From: Shun-ichi Goto
Date: Thu, 2 Apr 2026 16:31:13 +0900
Subject: [PATCH] feature: better word division for highlighting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each line is divided into several chunks to highlight the changes.

The previous implementation splits text at a fixed set of delimiter
characters (spaces, tabs, and common ASCII symbols such as `+-*/=!,;`).
Non-delimiter characters — including CJK ideographs, Hiragana, and
Katakana — are never treated as boundaries, so they tend to form large,
coarse chunks in languages like Japanese or Chinese that do not use
spaces to separate words. A small change within such text causes the
entire surrounding phrase to be highlighted.

This new implementation classifies each character into one of three
categories and groups consecutive characters of the same category into
one chunk, except for the Other category, which is always split
character by character:

- Letter (Unicode Ll/Lu/Lt/Lm + digits):
  ASCII letters, digits, and letters with diacritics such as é, ü, ß,
  ñ, ё. Consecutive Letter characters form one chunk, keeping European
  words intact.

- OtherLetter (Unicode Lo):
  CJK, Hiragana, Katakana, Hangul, Thai, Arabic, Hebrew, etc.
  Consecutive OtherLetter characters form one chunk. CJK punctuation
  (。、「」…) falls into the Other category and therefore acts as a
  natural boundary between chunks.

- Other (default):
  whitespace, control characters, punctuation, and symbols. This
  category corresponds to the delimiter characters of the previous
  implementation. Each character is always its own chunk, preserving
  the same per-character precision as before for operators, spaces,
  and punctuation.

Category values for all 65,536 char values are pre-computed once, by a
static field initializer, into a static read-only array (i.e. when the
class is first initialized rather than at application startup), which
gives a lock-free O(1) lookup per character.
---
Review notes (ignored by git am):
 - Surrogate code units (Cs) are now classified as OtherLetter so that the
   two halves of a supplementary-plane character (CJK extensions, emoji)
   are never emitted as separate one-char chunks.
 - Combining marks (Mn/Mc/Me) are now classified as OtherLetter so that
   Thai, Devanagari, Arabic, etc. runs are not split at every vowel sign.
 - The post-image blob id in the "index" line below is carried over from
   the original patch and does not reflect these review changes.

 src/Models/TextInlineChange.cs | 72 ++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/src/Models/TextInlineChange.cs b/src/Models/TextInlineChange.cs
index bc1873e25..e49a6d032 100644
--- a/src/Models/TextInlineChange.cs
+++ b/src/Models/TextInlineChange.cs
@@ -1,4 +1,5 @@
 using System.Collections.Generic;
+using System.Globalization;
 
 namespace SourceGit.Models
 {
@@ -26,6 +27,13 @@ private enum Edit
             AddedLeft,
         }
 
+        private enum CharCategory : byte
+        {
+            Other,       // default: whitespace, control, punctuation, symbols, etc.
+            Letter,      // Ll/Lu/Lt/Lm + digit: ASCII and euro letters (latin, greek, cyrillic, etc.)
+            OtherLetter, // Lo (+ Cs surrogates, Mn/Mc/Me marks): CJK, hiragana, katakana, hangul, Thai, Arabic, etc.
+        }
+
         private class EditResult
         {
             public Edit State;
@@ -100,22 +108,26 @@ private static List<Chunk> MakeChunks(Dictionary<string, int> hashes, string tex
             var start = 0;
             var size = text.Length;
             var chunks = new List<Chunk>();
-            var delims = new HashSet<char>(" \t+-*/=!,:;.'\"/?|&#@%`<>()[]{}\\".ToCharArray());
+            if (size == 0)
+                return chunks;
 
-            for (int i = 0; i < size; i++)
+            var prev = GetCategory(text[0]);
+
+            for (var i = 1; i < size; i++)
             {
                 var ch = text[i];
-                if (delims.Contains(ch))
+                var category = GetCategory(ch);
+                // A chunk ends when the category changes; Other chars never merge with a neighbor.
+                if (prev != category || category == CharCategory.Other)
                 {
-                    if (start != i)
-                        AddChunk(chunks, hashes, text.Substring(start, i - start), start);
-                    AddChunk(chunks, hashes, text.Substring(i, 1), i);
-                    start = i + 1;
+                    AddChunk(chunks, hashes, text[start..i], start);
+                    start = i;
                 }
+                prev = category;
             }
 
             if (start < size)
-                AddChunk(chunks, hashes, text.Substring(start), start);
+                AddChunk(chunks, hashes, text[start..], start);
 
             return chunks;
         }
@@ -302,5 +314,49 @@ private static void AddChunk(List<Chunk> chunks, Dictionary<string, int> hashes,
             }
             chunks.Add(new Chunk(hash, start, data.Length));
         }
+
+        // Builds the lookup table that maps every UTF-16 code unit to its CharCategory.
+        // Called once, by the static initializer of s_charCategoryCache.
+        private static CharCategory[] BuildCategoryCache()
+        {
+            // Pre-compute category for all char values.
+            // All entries default to Other (0).
+            var cache = new CharCategory[char.MaxValue + 1];
+            for (int i = 0; i < cache.Length; i++)
+            {
+                var ch = (char)i;
+                switch (char.GetUnicodeCategory(ch))
+                {
+                    // Unicode Lo: CJK, hiragana, katakana, hangul, Thai, Arabic, Hebrew, etc.
+                    // → group consecutive chars into one chunk (no space delimiter in these languages)
+                    case UnicodeCategory.OtherLetter:
+                    // Unicode Cs: halves of a surrogate pair (CJK extensions, emoji)
+                    // → never emit half a pair as its own chunk
+                    case UnicodeCategory.Surrogate:
+                    // Unicode Mn/Mc/Me: vowel signs and diacritics (Thai, Devanagari, Arabic, etc.)
+                    // → stay attached to the Lo letters they follow
+                    case UnicodeCategory.NonSpacingMark:
+                    case UnicodeCategory.SpacingCombiningMark:
+                    case UnicodeCategory.EnclosingMark:
+                        cache[i] = CharCategory.OtherLetter;
+                        break;
+
+                    // Unicode Ll/Lu/Lt/Lm + digit: latin, greek, cyrillic and their diacritic variants
+                    // → group consecutive chars into one chunk (words in space-delimited languages)
+                    // everything else (whitespace, control, punctuation, symbols) → Other (default)
+                    default:
+                        if (char.IsLetterOrDigit(ch))
+                            cache[i] = CharCategory.Letter;
+                        break;
+                }
+            }
+
+            return cache;
+        }
+
+        // O(1) table lookup of the category of a single UTF-16 code unit.
+        private static CharCategory GetCategory(char ch) => s_charCategoryCache[ch];
+
+        private static readonly CharCategory[] s_charCategoryCache = BuildCategoryCache();
     }
 }