commit 1abe3bee69f88fe171dc46eb86f9c81049b69632
parent 420caa07a66b197ee51017a96eba8ae98f099c11
Author: Claudio Alessi <smoppy@gmail.com>
Date: Sat, 17 Jan 2026 19:00:53 +0100
Improve handling of non-spacing characters.
Zero-width UTF-8 code points are rendered using their hex values, with the
exception of combining code points, which are identified by the utf8proc
library, now a dependency.
Diffstat:
| M | config.mk | | | 2 | +- |
| M | tui.c | | | 62 | ++++++++++++++++++++++++++++++++++++++------------------------ |
| M | utf8.c | | | 19 | ++++++++++++++----- |
| M | utf8.h | | | 1 | + |
4 files changed, 54 insertions(+), 30 deletions(-)
diff --git a/config.mk b/config.mk
@@ -8,7 +8,7 @@ MANPREFIX = ${PREFIX}/share/man
# flags
CPPFLAGS = -D_DEFAULT_SOURCE -DVERSION=\"${VERSION}\"
CFLAGS = -std=c99 -g -pedantic -Wall -O0 ${CPPFLAGS}
-LDFLAGS = -lgrapheme
+LDFLAGS = -lgrapheme -lutf8proc
#CFLAGS = -std=c99 -pedantic -Wall -Wno-deprecated-declarations -Os ${CPPFLAGS}
# compiler and linker
diff --git a/tui.c b/tui.c
@@ -124,6 +124,18 @@ tui_frame_flush(void) {
ab_flush(&frame);
}
+/* Return the number of hexadecimal digits needed to print n.
+ * A value of 0 still counts as one digit, so the result is always >= 1.
+ * XXX move to utils? */
+int
+hexlen(unsigned int n) {
+ int len = 0;
+
+ do { /* do-while: the body runs once even when n == 0 */
+ len++;
+ n >>= 4; /* discard the lowest hex digit (4 bits) */
+ } while(n > 0);
+ return len;
+}
+
int
tui_text_width(char *s, int len, int x) {
int tabstop = 8;
@@ -140,22 +152,24 @@ tui_text_width(char *s, int len, int x) {
wc = -1;
if(compat_mode) {
- if(cp == 0x200D) wc = 6;
- else if(IS_RIS(cp)) wc = 2;
+ if(IS_RIS(cp)) wc = 2;
} else {
/* force 2 cells width for emoji followed by VS16 */
int nxi = i + step;
if(nxi < len) {
unsigned int nxcp;
utf8_decode(s + nxi, len - nxi, &nxcp);
-
if(nxcp == 0xFE0F && ((cp >= 0x203C && cp <= 0x3299) || cp >= 0x1F000))
wc = 2;
}
}
- if(wc == -1) wc = wcwidth(cp);
+ if(wc < 0) wc = wcwidth(cp);
+ assert(wc != -1);
+
if(wc > 0) w += wc;
+ else if(!utf8_is_combining(cp)) w += hexlen(cp) + 2; /* 2 for < and > */
+ //else w += hexlen(cp) + 2; /* 2 for < and > */
}
return w;
}
@@ -200,35 +214,39 @@ tui_draw_line_compat(UI *ui, int x, int y, Cell *cells, int count) {
int step = utf8_decode(txt + o, cells[i].len - o, &cp);
int cw = cells[i].width;
int neederase = cells[i].len > 1 && (cells[i].width == 1 || IS_RIS(cp));
- char tag[] = "<200d>";
- int tagw = 6;
- int offset = 0;
switch(cp) {
- case 0x200D:
- /* TODO: */
- if(cells[i].flags & CELL_TRUNC_L) {
- offset = tagw - cells[i].width;
- if(offset < 0) offset = 0;
- }
- int len_to_print = cells[i].width;
- assert(offset + len_to_print <= tagw);
-
- if(len_to_print > 0) ab_write(&frame, tag + offset, len_to_print);
- break;
case '\t':
for(int t = 0; t < cells[i].width; t++)
ab_write(&frame, " ", 1);
break;
default:
- if(cells[i].flags & CELL_TRUNC_L) {
+ cw = wcwidth(cp);
+ if(cw < 0) break;
+
+ if(!cw) {
if(!cells[i].width) break;
+
+ char tag[16];
+ snprintf(tag, sizeof tag, "<%0x>", cp);
+
+ if(cells[i].flags & CELL_TRUNC_L) {
+ cw = tui_text_width(txt + o, cells[i].len, 0);
+ o = cw - cells[i].width;
+ if(o < 0) o = 0;
+ }
+ { const char t[] = ESC"[48;5;233m"; ab_write(&frame, t, sizeof t - 1); }
+ cw = ab_printf(&frame, "%.*s", cells[i].width, tag + o);
+ { const char t[] = ESC"[0m"; ab_write(&frame, t, sizeof t - 1); }
+ break;
+ }
+
+ if(cells[i].flags & CELL_TRUNC_L) {
ab_write(&frame, "<", 1);
cw = 1;
break;
}
if(cells[i].flags & CELL_TRUNC_R) {
- if(!cells[i].width) break;
ab_write(&frame, ">", 1);
cw = 1;
break;
@@ -239,10 +257,6 @@ tui_draw_line_compat(UI *ui, int x, int y, Cell *cells, int count) {
ab_write(&frame, t, sizeof t - 1);
}
- /* don't expect negative values here */
- cw = wcwidth(cp);
- if(cw < 0) cw = 0;
-
ab_write(&frame, txt + o, step);
/* to preserve coherence between terminals always split RIS
diff --git a/utf8.c b/utf8.c
@@ -1,10 +1,9 @@
#define _XOPEN_SOURCE
#include <wchar.h>
#include <grapheme.h>
+#include <utf8proc.h>
-int utf8_len(char *buf, int len);
-int utf8_decode(char *buf, int len, unsigned int *cp);
-size_t utf8_len_compat(char *buf, int len);
+#include "utf8.h"
size_t
utf8_len_compat(char *buf, int len) {
@@ -12,10 +11,12 @@ utf8_len_compat(char *buf, int len) {
int step, i;
i = step = utf8_decode(buf, len, &cp);
- if(wcwidth(cp) < 0) return step;
+ //if(wcwidth(cp) >= 0) return step;
+ if(!utf8_is_combining(cp) && wcwidth(cp) < 0) return step;
while(i < len) {
step = utf8_decode(buf + i, len - i, &cp);
- if(cp == 0x200D || wcwidth(cp)) break;
+ //if(wcwidth(cp) >= 0) break;
+ if(!utf8_is_combining(cp) || wcwidth(cp)) break;
i += step;
}
return i;
@@ -30,3 +31,11 @@ int
utf8_decode(char *buf, int len, unsigned int *cp) {
return grapheme_decode_utf8(buf, len, cp);
}
+/* Return non-zero when utf8proc reports the general category of cp as
+ * UTF8PROC_CATEGORY_MN or UTF8PROC_CATEGORY_ME, and 0 otherwise.
+ * No other category is tested here. */
+int
+utf8_is_combining(unsigned int cp) {
+ const utf8proc_property_t *prop = utf8proc_get_property(cp); /* NOTE(review): dereferenced below without a NULL check; presumably utf8proc always returns a valid record - confirm */
+
+ return (prop->category == UTF8PROC_CATEGORY_MN
+ || prop->category == UTF8PROC_CATEGORY_ME);
+}
diff --git a/utf8.h b/utf8.h
@@ -9,3 +9,4 @@
int utf8_len(char *buf, int len);
size_t utf8_len_compat(char *buf, int len);
int utf8_decode(char *buf, int len, unsigned int *cp);
+int utf8_is_combining(unsigned int cp);