commit c5781e38a6882a6e45858d9296e776a5771ccad5
parent 9687b8002c9f51a29fdc403668f63721ff1c9a56
Author: Claudio Alessi <smoppy@gmail.com>
Date: Mon, 5 Jan 2026 00:33:09 +0100
Improve UTF-8 handling and fix terminal inconsistencies.
Improve Regional Indicator handling and add partial support for
ambiguous-width characters. A visual indicator is now shown when such a
character is at the end of the line and followed by a trailing space.
Cleanups are needed and likely a bit of refactoring.
Diffstat:
| M | edo.c | | | 5 | +++++ |
| M | tui.c | | | 141 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------- |
2 files changed, 108 insertions(+), 38 deletions(-)
diff --git a/edo.c b/edo.c
@@ -496,6 +496,11 @@ run(void) {
Line *l = vcur->buf->lines[vcur->line_idx];
fprintf(stderr, "debug current line (%d):\n", vcur->line_idx);
fprintf(stderr, "=== START LINE ===\n");
+
+ unsigned int cp;
+ utf8_decode(l->buf, l->len, &cp);
+
+ fprintf(stderr, "cp=%d\n", cp);
for(int i = 0; i < l->len; i++) {
if(!(i % 10)) fprintf(stderr, "\n");
fprintf(stderr, " 0x%0x", l->buf[i]);
diff --git a/tui.c b/tui.c
@@ -20,6 +20,10 @@
#define CLEARRIGHT "\33[0K"
#define CURHIDE "\33[?25l"
#define CURSHOW "\33[?25h"
+#define ERASECHAR "\33[1X"
+
+#define IS_RIS(c) ((c) >= 0x1F1E6 && (c) <= 0x1F1FF)
+#define IS_AMBI(c) ((c) >= 0x2100 && (c) <= 0x26FF)
typedef struct {
char *buf;
@@ -135,21 +139,27 @@ tui_text_width(char *s, int len, int x) {
w += tabstop - x % tabstop;
continue;
}
- if(compat_mode && cp == 0x200D) {
- w += 6; // <200d>
- continue;
- }
- wc = wcwidth(cp);
- if(!compat_mode) {
+
+ wc = -1;
+ if(compat_mode) {
+ if(cp == 0x200D) {
+ w += 6;
+ continue;
+ }
+ if(IS_RIS(cp)) wc = 2;
+ } else {
/* force 2 cells width for emoji followed by VS16 */
int nxi = i + step;
if(nxi < len) {
unsigned int nxcp;
utf8_decode(s + nxi, len - nxi, &nxcp);
+
if(nxcp == 0xFE0F && ((cp >= 0x203C && cp <= 0x3299) || cp >= 0x1F000))
wc = 2;
}
}
+
+ if(wc == -1) wc = wcwidth(cp);
if(wc > 0) w += wc;
}
return w;
@@ -179,11 +189,11 @@ tui_move_cursor(int c, int r) {
}
void
-tui_draw_line(UI *ui, int x, int y, Cell *cells, int count) {
- assert(x < ws.ws_col && y < ws.ws_row);
+tui_draw_line_compat(UI *ui, int x, int y, Cell *cells, int count) {
char *txt;
unsigned int cp = 0;
int was_emoji = 0;
+ int was_ambi = 0;
int i;
tui_move_cursor(x, y);
@@ -191,47 +201,102 @@ tui_draw_line(UI *ui, int x, int y, Cell *cells, int count) {
x += cells[i].width;
txt = cell_get_text(cells + i, ui->pool.data);
+ int cw = cells[i].width;
+
/* TODO: temp code for testing, we'll se how to deal with this later */
- if(txt[0] == '\t')
+ if(txt[0] == '\t') {
ab_printf(&frame, "%*s", cells[i].width, " ");
- else {
- if(compat_mode) {
- int o = 0;
- while(o < cells[i].len) {
- int step = utf8_decode(txt + o, cells[i].len - o, &cp);
-
- if(cp == 0x200D) {
- ab_write(&frame, "<200d>", cells[i].width);
- }
- else {
- if(was_emoji) {
- const char t[] = "\x1b[0;48;5;232m";
- ab_write(&frame, t, sizeof t);
- }
- ab_write(&frame, txt + o, step);
- if(was_emoji) {
- const char t[] = "\x1b[0m";
- ab_write(&frame, t, sizeof t);
+ } else {
+ int o = 0;
+
+ while(o < cells[i].len) {
+ int step = utf8_decode(txt + o, cells[i].len - o, &cp);
+
+ if(cp == 0x200D) {
+ ab_write(&frame, "<200d>", cells[i].width);
+ } else {
+ int w = wcwidth(cp);
+ if(w > 0) cw = w;
+
+ if(was_emoji) {
+ if(was_ambi && i == count - 1) {
+ const char t2[] = "\x1b[48;5;1m";
+ ab_write(&frame, t2, sizeof t2 - 1);
+ } else {
+ const char t[] = "\x1b[48;5;232m";
+ ab_write(&frame, t, sizeof t - 1);
+
+ /* clear eventual garbage state */
+ ab_write(&frame, ERASECHAR, strlen(ERASECHAR));
}
}
- o += step;
+ ab_write(&frame, txt + o, step);
+
+ if(was_emoji) {
+ const char t[] = "\x1b[0m";
+ ab_write(&frame, t, sizeof t - 1);
+ }
}
+ o += step;
}
- else
- ab_write(&frame, txt, cells[i].len);
}
- /* Instead of check for 0xFE0F (or detect other problematic emojis) just reposition
- * the cursor for any multi-byte grapheme cluster. This may be further optimized in
- * the future but for now it's a robust way to solve any eventual incongruences.
- * It's important to skip this for "glue" char to avoid breaking sequences which
- * must always be adiacent (e.g. regional indicator symbols). */
- int is_glue = (cp >= 0x1F1E6 && cp <= 0x1F1FF);
- if(cells[i].len > 1 && !is_glue) tui_move_cursor(x, y);
+ /* pad if needed */
+ if(cw < cells[i].width) {
+ tui_move_cursor(x - cells[i].width + cw, y);
+
+ const char t[] = "\x1b[48;5;233m";
+ ab_write(&frame, t, sizeof t - 1);
+
+ while(cw++ < cells[i].width) ab_write(&frame, " ", 1);
+
+ const char t2[] = "\x1b[0m";
+ ab_write(&frame, t2, sizeof t2 - 1);
+ }
+
+ was_emoji = cells[i].len > 1 || IS_RIS(cp);
+ if(was_emoji) tui_move_cursor(x, y);
+ was_ambi = IS_AMBI(cp);
+ }
+ ab_write(&frame, CLEARRIGHT, strlen(CLEARRIGHT));
+}
+
+
+void
+tui_draw_line(UI *ui, int x, int y, Cell *cells, int count) {
+ assert(x < ws.ws_col && y < ws.ws_row);
+
+ if(compat_mode) {
+ tui_draw_line_compat(ui, x, y, cells, count);
+ return;
+ }
+
+ char *txt;
+ unsigned int cp;
+ int was_ambi = 0, i;
+
+ tui_move_cursor(x, y);
+ for(i = 0; i < count; i++) {
+ x += cells[i].width;
+ txt = cell_get_text(cells + i, ui->pool.data);
+
+ /* TODO: temp code for testing, we'll se how to deal with this later */
+ if(txt[0] == '\t')
+ ab_printf(&frame, "%*s", cells[i].width, " ");
+ else {
+ utf8_decode(txt, cells[i].len, &cp);
+ if(was_ambi && i == count - 1) {
+ const char t[] = "\x1b[48;5;1m";
+ ab_write(&frame, t, sizeof t - 1);
+ }
- if(compat_mode) was_emoji = cells[i].len > 1 && !is_glue;
+ ab_write(&frame, txt, cells[i].len);
+ const char t[] = "\x1b[0m";
+ ab_write(&frame, t, sizeof t - 1);
+ }
+ was_ambi = IS_AMBI(cp);
}
ab_write(&frame, CLEARRIGHT, strlen(CLEARRIGHT));
}