edo

Experimental text editor.
Log | Files | Refs | LICENSE

commit 1abe3bee69f88fe171dc46eb86f9c81049b69632
parent 420caa07a66b197ee51017a96eba8ae98f099c11
Author: Claudio Alessi <smoppy@gmail.com>
Date:   Sat, 17 Jan 2026 19:00:53 +0100

Improve handling of non-spacing characters.

Non-spacing code points are rendered as their hex value in angle
brackets (e.g. <200d>), with the exception of combining code points,
which are identified using the utf8proc library, now a dependency.

Diffstat:
M config.mk | 2 +-
M tui.c | 62 ++++++++++++++++++++++++++++++++++++++------------------------
M utf8.c | 19 ++++++++++++++-----
M utf8.h | 1 +
4 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/config.mk b/config.mk @@ -8,7 +8,7 @@ MANPREFIX = ${PREFIX}/share/man # flags CPPFLAGS = -D_DEFAULT_SOURCE -DVERSION=\"${VERSION}\" CFLAGS = -std=c99 -g -pedantic -Wall -O0 ${CPPFLAGS} -LDFLAGS = -lgrapheme +LDFLAGS = -lgrapheme -lutf8proc #CFLAGS = -std=c99 -pedantic -Wall -Wno-deprecated-declarations -Os ${CPPFLAGS} # compiler and linker diff --git a/tui.c b/tui.c @@ -124,6 +124,18 @@ tui_frame_flush(void) { ab_flush(&frame); } +/* XXX move to utils? */ +int +hexlen(unsigned int n) { + int len = 0; + + do { + len++; + n >>= 4; + } while(n > 0); + return len; +} + int tui_text_width(char *s, int len, int x) { int tabstop = 8; @@ -140,22 +152,24 @@ tui_text_width(char *s, int len, int x) { wc = -1; if(compat_mode) { - if(cp == 0x200D) wc = 6; - else if(IS_RIS(cp)) wc = 2; + if(IS_RIS(cp)) wc = 2; } else { /* force 2 cells width for emoji followed by VS16 */ int nxi = i + step; if(nxi < len) { unsigned int nxcp; utf8_decode(s + nxi, len - nxi, &nxcp); - if(nxcp == 0xFE0F && ((cp >= 0x203C && cp <= 0x3299) || cp >= 0x1F000)) wc = 2; } } - if(wc == -1) wc = wcwidth(cp); + if(wc < 0) wc = wcwidth(cp); + assert(wc != -1); + if(wc > 0) w += wc; + else if(!utf8_is_combining(cp)) w += hexlen(cp) + 2; /* 2 for < and > */ + //else w += hexlen(cp) + 2; /* 2 for < and > */ } return w; } @@ -200,35 +214,39 @@ tui_draw_line_compat(UI *ui, int x, int y, Cell *cells, int count) { int step = utf8_decode(txt + o, cells[i].len - o, &cp); int cw = cells[i].width; int neederase = cells[i].len > 1 && (cells[i].width == 1 || IS_RIS(cp)); - char tag[] = "<200d>"; - int tagw = 6; - int offset = 0; switch(cp) { - case 0x200D: - /* TODO: */ - if(cells[i].flags & CELL_TRUNC_L) { - offset = tagw - cells[i].width; - if(offset < 0) offset = 0; - } - int len_to_print = cells[i].width; - assert(offset + len_to_print <= tagw); - - if(len_to_print > 0) ab_write(&frame, tag + offset, len_to_print); - break; case '\t': for(int t = 0; t < cells[i].width; t++) ab_write(&frame, " ", 1); 
break; default: - if(cells[i].flags & CELL_TRUNC_L) { + cw = wcwidth(cp); + if(cw < 0) break; + + if(!cw) { if(!cells[i].width) break; + + char tag[16]; + snprintf(tag, sizeof tag, "<%0x>", cp); + + if(cells[i].flags & CELL_TRUNC_L) { + cw = tui_text_width(txt + o, cells[i].len, 0); + o = cw - cells[i].width; + if(o < 0) o = 0; + } + { const char t[] = ESC"[48;5;233m"; ab_write(&frame, t, sizeof t - 1); } + cw = ab_printf(&frame, "%.*s", cells[i].width, tag + o); + { const char t[] = ESC"[0m"; ab_write(&frame, t, sizeof t - 1); } + break; + } + + if(cells[i].flags & CELL_TRUNC_L) { ab_write(&frame, "<", 1); cw = 1; break; } if(cells[i].flags & CELL_TRUNC_R) { - if(!cells[i].width) break; ab_write(&frame, ">", 1); cw = 1; break; @@ -239,10 +257,6 @@ tui_draw_line_compat(UI *ui, int x, int y, Cell *cells, int count) { ab_write(&frame, t, sizeof t - 1); } - /* don't expect negative values here */ - cw = wcwidth(cp); - if(cw < 0) cw = 0; - ab_write(&frame, txt + o, step); /* to preserve coherence between terminals always split RIS diff --git a/utf8.c b/utf8.c @@ -1,10 +1,9 @@ #define _XOPEN_SOURCE #include <wchar.h> #include <grapheme.h> +#include <utf8proc.h> -int utf8_len(char *buf, int len); -int utf8_decode(char *buf, int len, unsigned int *cp); -size_t utf8_len_compat(char *buf, int len); +#include "utf8.h" size_t utf8_len_compat(char *buf, int len) { @@ -12,10 +11,12 @@ utf8_len_compat(char *buf, int len) { int step, i; i = step = utf8_decode(buf, len, &cp); - if(wcwidth(cp) < 0) return step; + //if(wcwidth(cp) >= 0) return step; + if(!utf8_is_combining(cp) && wcwidth(cp) < 0) return step; while(i < len) { step = utf8_decode(buf + i, len - i, &cp); - if(cp == 0x200D || wcwidth(cp)) break; + //if(wcwidth(cp) >= 0) break; + if(!utf8_is_combining(cp) || wcwidth(cp)) break; i += step; } return i; @@ -30,3 +31,11 @@ int utf8_decode(char *buf, int len, unsigned int *cp) { return grapheme_decode_utf8(buf, len, cp); } + +int +utf8_is_combining(unsigned int cp) { + const 
utf8proc_property_t *prop = utf8proc_get_property(cp); + + return (prop->category == UTF8PROC_CATEGORY_MN + || prop->category == UTF8PROC_CATEGORY_ME); +} diff --git a/utf8.h b/utf8.h @@ -9,3 +9,4 @@ int utf8_len(char *buf, int len); size_t utf8_len_compat(char *buf, int len); int utf8_decode(char *buf, int len, unsigned int *cp); +int utf8_is_combining(unsigned int cp);