Work on table based lookup for grapheme segmentation

2026-05-29 11:18:50 +02:00 · 2025-03-28 15:06:48 +05:30
parent 56b49c49fb
commit 01cdfcd002
1 changed files with 197 additions and 35 deletions
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

+# Imports {{{
 import json
 import os
 import re
@@ -16,11 +17,13 @@ from math import ceil, log
 from typing import (
    Callable,
    DefaultDict,
+    Iterator,
    Literal,
    NamedTuple,
    Optional,
    Protocol,
    Sequence,
+    TypedDict,
    Union,
 )
 from urllib.request import urlopen
@@ -29,8 +32,9 @@ if __name__ == '__main__' and not __package__:
    import __main__
    __main__.__package__ = 'gen'
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# }}}

-
+# Fetching data {{{
 non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
 non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
 non_characters |= frozenset(range(0xfdd0, 0xfdf0))
@@ -63,8 +67,9 @@ def unicode_version() -> tuple[int, int, int]:
        if m is not None:
            return int(m.group(1)), int(m.group(2)), int(m.group(3))
    raise ValueError('Could not find Unicode Version')
+# }}}

-
+# Parsing Unicode databases {{{
 # Map of class names to set of codepoints in class
 class_maps: dict[str, set[int]] = {}
 all_symbols: set[int] = set()
@@ -269,9 +274,13 @@ def parse_eaw() -> None:

 def parse_grapheme_segmentation() -> None:
    global extended_pictographic
+    grapheme_segmentation_maps['AtStart']  # this is used by the segmentation algorithm, no character has it
+    grapheme_segmentation_maps['None']  # this is used by the segmentation algorithm, no character has it
    for line in get_data('ucd/auxiliary/GraphemeBreakProperty.txt'):
        chars, category = split_two(line)
        grapheme_segmentation_maps[category] |= chars
+    grapheme_segmentation_maps['Private_Expecting_RI']  # this is used by the segmentation algorithm, no character has it
+    incb_map['None']   # used by segmentation algorithm no character has it
    for line in get_data('ucd/DerivedCoreProperties.txt'):
        spec, rest = line.split(';', 1)
        category = rest.strip().split(' ', 1)[0].strip().rstrip(';')
@@ -287,6 +296,36 @@ def parse_grapheme_segmentation() -> None:
            extended_pictographic |= chars


+class GraphemeSegmentationTest(TypedDict):
+    data: tuple[str, ...]
+    comment: str
+
+
+grapheme_segmentation_tests: list[GraphemeSegmentationTest] = []
+
+
+def parse_test_data() -> None:
+    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
+        t, comment = line.split('#')
+        t = t.lstrip('÷').strip().rstrip('÷').strip()
+        chars: list[list[str]] = [[]]
+        for x in re.split(r'([÷×])', t):
+            x = x.strip()
+            match x:
+                case '÷':
+                    chars.append([])
+                case '×':
+                    pass
+                case '':
+                    pass
+                case _:
+                    ch = chr(int(x, 16))
+                    chars[-1].append(ch)
+        c = tuple(''.join(c) for c in chars)
+        grapheme_segmentation_tests.append({'data': c, 'comment': comment.strip()})
+# }}}
+
+
 def write_case(spec: Union[tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
    if isinstance(spec, tuple):
        if for_go:
@@ -386,27 +425,8 @@ def gen_rowcolumn_diacritics() -> None:


 def gen_test_data() -> None:
-    tests = []
-    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
-        t, comment = line.split('#')
-        t = t.lstrip('÷').strip().rstrip('÷').strip()
-        chars: list[list[str]] = [[]]
-        for x in re.split(r'([÷×])', t):
-            x = x.strip()
-            match x:
-                case '÷':
-                    chars.append([])
-                case '×':
-                    pass
-                case '':
-                    pass
-                case _:
-                    ch = chr(int(x, 16))
-                    chars[-1].append(ch)
-        c = [''.join(c) for c in chars]
-        tests.append({'data': c, 'comment': comment.strip()})
    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
-        f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())
+        f.write(json.dumps(grapheme_segmentation_tests, indent=2, ensure_ascii=False).encode())


 def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
@@ -519,6 +539,148 @@ def bitsize(maxval: int) -> int:  # number of bits needed to store maxval
    return ceil(log(maxval, 2))


+def clamped_bitsize(val: int) -> int:
+    if val <= 8:
+        return 8
+    if val <= 16:
+        return 16
+    if val <= 32:
+        return 32
+    if val <= 64:
+        return 64
+    raise ValueError('Too many fields')
+
+
+class GraphemeSegmentationProps(NamedTuple):
+
+    grapheme_break: str = ''  # set at runtime
+    indic_conjunct_break: str = ''  # set at runtime
+    is_extended_pictographic: bool = True
+
+    @classmethod
+    def bitsize(cls) -> int:
+        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
+        return clamped_bitsize(ans)
+
+
+control_grapheme_breaks = 'CR', 'LF', 'Control'
+linker_or_extend = 'Linker', 'Extend'
+
+
+class GraphemeSegmentationState(NamedTuple):
+    grapheme_break: str = ''  # set at runtime
+    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}*
+    incb_consonant_extended: bool = True
+    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker
+    incb_consonant_extended_linker: bool = True
+    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker {extend|linker}*
+    incb_consonant_extended_linker_extended: bool = True
+    # True if the last character ends an emoji modifier sequence \p{Extended_Pictographic} Extend*
+    emoji_modifier_sequence: bool = True
+    # True if the last character was immediately preceded by an emoji modifier sequence   \p{Extended_Pictographic} Extend*
+    emoji_modifier_sequence_before_last_char: bool = True
+
+    @classmethod
+    def make(cls) -> 'GraphemeSegmentationState':
+        return GraphemeSegmentationState('AtStart', False, False, False, False, False)
+
+    @classmethod
+    def bitsize(cls) -> int:
+        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
+        return clamped_bitsize(ans)
+
+    def add_to_current_cell(self, p: GraphemeSegmentationProps) -> 'GraphemeSegmentationResult':
+        prev = self.grapheme_break
+        prop = p.grapheme_break
+        incb = p.indic_conjunct_break
+        add_to_cell = False
+        if self.grapheme_break == 'AtStart':
+            add_to_cell = True
+            if prop == 'Regional_Indicator':
+                prop = 'Private_Expecting_RI'
+        else:
+            # No break between CR and LF (GB3).
+            if prev == 'CR' and prop == 'LF':
+                add_to_cell = True
+            # Break before and after controls (GB4, GB5).
+            elif prev in control_grapheme_breaks or prop in control_grapheme_breaks:
+                pass
+            # No break between Hangul syllable sequences (GB6, GB7, GB8).
+            elif (
+                (prev == 'L' and prop in ('L', 'V', 'LV', 'LVT')) or
+                (prev in ('LV', 'V') and prop in ('V', 'T')) or
+                (prev in ('LVT', 'T') and prop == 'T')
+            ):
+                add_to_cell = True
+            # No break before: extending characters or ZWJ (GB9), SpacingMarks (GB9a), Prepend characters (GB9b).
+            elif prop in ('Extend', 'ZWJ', 'SpacingMark') or prev in 'Prepend':
+                add_to_cell = True
+            # No break within certain combinations of Indic_Conjunct_Break values
+            # Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).
+            elif self.incb_consonant_extended_linker_extended and incb == 'Consonant':
+                add_to_cell = True
+            # No break within emoji modifier sequences or emoji zwj sequences (GB11).
+            elif prev == 'ZWJ' and self.emoji_modifier_sequence_before_last_char and p.is_extended_pictographic:
+                add_to_cell = True
+            # No break between RI if there is an odd number of RI characters before (GB12, GB13).
+            elif prop == 'Regional_Indicator':
+                if prev == 'Private_Expecting_RI':
+                    add_to_cell = True
+                else:
+                    prop = 'Private_Expecting_RI'
+            # Break everywhere else GB999
+
+        incb_consonant_extended_linker = self.incb_consonant_extended and incb == 'Linker'
+        incb_consonant_extended_linker_extended = incb_consonant_extended_linker or (
+                self.incb_consonant_extended_linker_extended and incb in linker_or_extend)
+        incb_consonant_extended = incb == 'Consonant' or (
+            self.incb_consonant_extended and incb in linker_or_extend)
+        emoji_modifier_sequence_before_last_char = self.emoji_modifier_sequence
+        emoji_modifier_sequence = (self.emoji_modifier_sequence and prop == 'Extend') or p.is_extended_pictographic
+
+        return GraphemeSegmentationResult(GraphemeSegmentationState(
+            grapheme_break=prop, incb_consonant_extended=incb_consonant_extended,
+            incb_consonant_extended_linker=incb_consonant_extended_linker,
+            incb_consonant_extended_linker_extended=incb_consonant_extended_linker_extended,
+            emoji_modifier_sequence=emoji_modifier_sequence, emoji_modifier_sequence_before_last_char=emoji_modifier_sequence_before_last_char
+        ), add_to_cell)
+
+
+def split_into_graphemes(text: str, props: Sequence[GraphemeSegmentationProps]) -> Iterator[str]:
+    s = GraphemeSegmentationState.make()
+    pos = 0
+    for i, ch in enumerate(text):
+        p = props[ord(ch)]
+        s, add_to_cell = s.add_to_current_cell(p)
+        if not add_to_cell:
+            yield text[pos:i]
+            pos = i
+    if pos < len(text):
+        yield text[pos:]
+
+
+def test_grapheme_segmentation(props: Sequence[GraphemeSegmentationProps]) -> None:
+    for test in grapheme_segmentation_tests:
+        expected = test['data']
+        actual = tuple(split_into_graphemes(''.join(test['data']), props))
+        if expected != actual:
+            def as_codepoints(text: str) -> str:
+                return ' '.join(hex(ord(x))[2:] for x in text)
+            qe = tuple(map(as_codepoints, expected))
+            qa = tuple(map(as_codepoints, actual))
+            raise SystemExit(f'Failed to split graphemes for: {test["comment"]}\n{expected!r} {qe} != {actual!r} {qa}')
+
+
+class GraphemeSegmentationKey(NamedTuple):
+    state: GraphemeSegmentationState
+    next_char: GraphemeSegmentationProps
+
+
+class GraphemeSegmentationResult(NamedTuple):
+    new_state: GraphemeSegmentationState
+    add_to_current_cell: bool
+
+
 class CharProps(NamedTuple):

    width: int = 3
@@ -540,15 +702,7 @@ class CharProps(NamedTuple):
    @classmethod
    def bitsize(cls) -> int:
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
-        if ans <= 8:
-            return 8
-        if ans <= 16:
-            return 16
-        if ans <= 32:
-            return 32
-        if ans <= 64:
-            return 64
-        raise ValueError('Too many fields')
+        return clamped_bitsize(ans)

    @property
    def go_fields(self) -> Iterable[str]:
@@ -650,9 +804,12 @@ def top_level_category(q: str) -> set[int]:


 def gen_char_props() -> None:
-    CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps) + 2))
-    CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map) + 1))
+    CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps)))
+    CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map)))
    CharProps._field_defaults['category'] = str(bitsize(len(class_maps) + 1))
+    GraphemeSegmentationProps._field_defaults['grapheme_break'] = CharProps._field_defaults['grapheme_break']
+    GraphemeSegmentationProps._field_defaults['indic_conjunct_break'] = CharProps._field_defaults['indic_conjunct_break']
+    GraphemeSegmentationState._field_defaults['grapheme_break'] = GraphemeSegmentationProps._field_defaults['grapheme_break']
    invalid = class_maps['Cc'] | class_maps['Cs']
    non_printing = invalid | class_maps['Cf']
    is_word_char = top_level_category('LN')
@@ -691,6 +848,10 @@ def gen_char_props() -> None:
            is_combining_char=ch in marks, category=cat_map.get(ch, 'Cn'), is_word_char=ch in is_word_char,
            is_punctuation=ch in is_punctuation,
        ) for ch in range(sys.maxunicode + 1))
+    gsprops = tuple(GraphemeSegmentationProps(
+        grapheme_break=x.grapheme_break, indic_conjunct_break=x.indic_conjunct_break,
+        is_extended_pictographic=x.is_extended_pictographic) for x in prop_array)
+    test_grapheme_segmentation(gsprops)
    t1, t2, t3, shift, mask, bytesz = splitbins(prop_array, CharProps.bitsize() // 8)
    print(f'Size of character properties table: {bytesz/1024:.1f}KB')

@@ -700,8 +861,8 @@ def gen_char_props() -> None:
    with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
        gp = partial(print, file=gof)
        gp('package wcswidth')
-        generate_enum(c, gp, 'GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
-        generate_enum(c, gp, 'IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
+        generate_enum(c, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_')
+        generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_')
        cen('// UCBDeclaration')
        generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_')
        cen('// EndUCBDeclaration')
@@ -730,6 +891,7 @@ def main(args: list[str]=sys.argv) -> None:
    parse_emoji()
    parse_eaw()
    parse_grapheme_segmentation()
+    parse_test_data()
    gen_names()
    gen_rowcolumn_diacritics()
    gen_test_data()