From 1047f06de60147b0890d68afb6911d83971fe0f5 Mon Sep 17 00:00:00 2001
From: Shun-ichi Goto <shunichi.goto@gmail.com>
Date: Thu, 2 Apr 2026 16:31:13 +0900
Subject: [PATCH] feature: better word division for highlighting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each line is divided into several chunks to highlight the changes.

The previous implementation splits text at a fixed set of delimiter
characters (spaces, tabs, and common ASCII symbols such as `+-*/=!,;`).
Non-delimiter characters — including CJK ideographs, Hiragana, and
Katakana — are never treated as boundaries, so they tend to form large,
coarse chunks in languages like Japanese or Chinese that do not use
spaces to separate words. A small change within such text causes the
entire surrounding phrase to be highlighted.

This new implementation classifies each character into one of three
categories and groups consecutive characters of the same category into
one chunk, except for the Other category which is always split
character by character:

- Letter (Unicode Ll/Lu/Lt/Lm + digits): ASCII letters, digits, and
  letters with diacritics such as é, ü, ß, ñ, ё. Consecutive Letter
  characters form one chunk, keeping European words intact.
- OtherLetter (Unicode Lo): CJK, Hiragana, Katakana, Hangul, Thai,
  Arabic, Hebrew, etc. Consecutive OtherLetter characters form one
  chunk. CJK punctuation (。、「」…) falls into the Other category
  and therefore acts as a natural boundary between chunks.
- Other (default): whitespace, control characters, punctuation, and
  symbols. This category is a superset of the delimiter characters of
  the previous implementation: it additionally covers characters such
  as `_`, `$`, `^`, and `~`, which were not delimiters before. Each
  character is always its own chunk, preserving the same per-character
  precision as before for operators, spaces, and punctuation.

Category values for all 65,536 char values are pre-computed into a
static read-only array when the class is initialized, giving lock-free
O(1) lookup.
---
 src/Models/TextInlineChange.cs | 55 +++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 8 deletions(-)
diff --git a/src/Models/TextInlineChange.cs b/src/Models/TextInlineChange.cs
index bc1873e25..e49a6d032 100644
--- a/src/Models/TextInlineChange.cs
+++ b/src/Models/TextInlineChange.cs
@@ -1,4 +1,5 @@
 ﻿using System.Collections.Generic;
+using System.Globalization;
 
 namespace SourceGit.Models
 {
@@ -26,6 +27,13 @@ private enum Edit
             AddedLeft,
         }
 
+        private enum CharCategory : byte
+        {
+            Other,       // default (0): whitespace, control, punctuation, symbols, etc.; every char is its own chunk
+            Letter,      // Ll/Lu/Lt/Lm + digit: ASCII and European letters (Latin, Greek, Cyrillic, etc.); runs form one chunk
+            OtherLetter, // Lo: CJK, hiragana, katakana, hangul, Thai, Arabic, etc.; runs form one chunk
+        }
+
         private class EditResult
         {
             public Edit State;
@@ -100,22 +108,25 @@ private static List<Chunk> MakeChunks(Dictionary<string, int> hashes, string tex
             var start = 0;
             var size = text.Length;
             var chunks = new List<Chunk>();
-            var delims = new HashSet<char>(" \t+-*/=!,:;.'\"/?|&#@%`<>()[]{}\\".ToCharArray());
+            if (size == 0)
+                return chunks;
 
-            for (int i = 0; i < size; i++)
+            var prev = GetCategory(text[0]);
+
+            for (var i = 1; i < size; i++)
             {
                 var ch = text[i];
-                if (delims.Contains(ch))
+                var category = GetCategory(ch);
+                if (prev != category || category == CharCategory.Other)
                 {
-                    if (start != i)
-                        AddChunk(chunks, hashes, text.Substring(start, i - start), start);
-                    AddChunk(chunks, hashes, text.Substring(i, 1), i);
-                    start = i + 1;
+                    AddChunk(chunks, hashes, text[start..i], start);
+                    start = i;
                 }
+                prev = category;
             }
 
             if (start < size)
-                AddChunk(chunks, hashes, text.Substring(start), start);
+                AddChunk(chunks, hashes, text[start..], start);
             return chunks;
         }
 
@@ -302,5 +313,33 @@ private static void AddChunk(List<Chunk> chunks, Dictionary<string, int> hashes,
             }
             chunks.Add(new Chunk(hash, start, data.Length));
         }
+
+        // Pre-computes the chunking category of every UTF-16 code unit; entries default to Other (0).
+        private static CharCategory[] BuildCategoryCache()
+        {
+            var cache = new CharCategory[char.MaxValue + 1];
+            for (var i = 0; i < cache.Length; i++)
+            {
+                var ch = (char)i;
+                var unicode = char.GetUnicodeCategory(ch);
+
+                // Unicode Lo: CJK, hiragana, katakana, hangul, Thai, Arabic, Hebrew, etc.
+                // → group consecutive chars into one chunk (no space delimiter in these languages).
+                // Surrogate halves (Cs) are grouped the same way so that a chunk boundary never
+                // falls inside a surrogate pair (emoji, supplementary-plane CJK ideographs).
+                if (unicode == UnicodeCategory.OtherLetter || unicode == UnicodeCategory.Surrogate)
+                    cache[i] = CharCategory.OtherLetter;
+                // Unicode Ll/Lu/Lt/Lm + digit: latin, greek, cyrillic and their diacritic variants
+                // → group consecutive chars into one chunk (words in space-delimited languages)
+                else if (char.IsLetterOrDigit(ch))
+                    cache[i] = CharCategory.Letter;
+            }
+
+            return cache;
+        }
+
+        private static CharCategory GetCategory(char ch) => s_charCategoryCache[ch]; // plain array lookup indexed by the UTF-16 code unit
+
+        private static readonly CharCategory[] s_charCategoryCache = BuildCategoryCache(); // filled once by this static field initializer
     }
 }