edo

Experimental text editor.
Log | Files | Refs | LICENSE

commit a081b6203f1dc35ecacfa17d337ef05ccf3ad60f
parent e21d98ab8a425faa5b86ca2aef7fe2a2eda5983f
Author: Claudio Alessi <smoppy@gmail.com>
Date:   Sat,  3 Jan 2026 20:54:24 +0100

Add UTF-8 support with SSO strategy.

Use an arena allocator for clusters above a given threshold (currently 8 bytes).
This works with 99% of text while transparently allowing virtually unlimited
UTF-8 sequences.

Code points are decoded using libgrapheme, which is now a dependency.

This change addresses inconsistencies between simple terminals and modern
ones. In general, simple terminals blindly trust libc's wcwidth(), which is
notoriously broken, while modern terminals use internal rendering logic. To
support both worlds, several strategies have been implemented regarding wide
glyphs, emoji and VS16 sequences.

For example, the cursor is explicitly reset to the logical grid position after
printing multi-byte characters, except after "glue characters" (e.g. regional
indicators), where the reset is skipped to preserve byte streams, which is
crucial for features like flag compositions.

Another example is visual clipping, which enforces explicit background colors
for cells following wide glyphs. This forces a redraw even on terminals with
aggressive optimizations (like st) where the text partially overlaps with the
previous glyph.

These and other hacks are toggled via a compat flag in the TUI backend. As a
consequence, the core does not calculate cluster width directly but queries the
backend, which handles complex clusters (e.g. splitting "lego blocks") similarly
to other TUI editors.

Compat mode will be auto-detected at startup (not yet implemented).

Currently only view_cursor_right() is UTF-8 aware, as it was the function used
during development.

Diffstat:
M Makefile | 2 +-
M config.mk | 1 +
M edo.c | 149 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M tui.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M ui.h | 1 +
A utf8.c | 35 +++++++++++++++++++++++++++++++++++
A utf8.h | 5 +++++
7 files changed, 213 insertions(+), 77 deletions(-)

diff --git a/Makefile b/Makefile @@ -3,7 +3,7 @@ include config.mk APPNAME=edo -SRC = ${APPNAME}.c tui.c +SRC = ${APPNAME}.c tui.c utf8.c OBJ = ${SRC:.c=.o} all: options ${APPNAME} diff --git a/config.mk b/config.mk @@ -8,6 +8,7 @@ MANPREFIX = ${PREFIX}/share/man # flags CPPFLAGS = -D_DEFAULT_SOURCE -DVERSION=\"${VERSION}\" CFLAGS = -std=c99 -g -pedantic -Wall -O0 ${CPPFLAGS} +LDFLAGS = -lgrapheme #CFLAGS = -std=c99 -pedantic -Wall -Wno-deprecated-declarations -Os ${CPPFLAGS} # compiler and linker diff --git a/edo.c b/edo.c @@ -1,7 +1,3 @@ -/* osaentuhaosenuhaoesnuthaoesnutha oesnthaoesuntha snethu asoenhu saoenhtuaoesn uthaoesunthaoesuntaoeh usaoneth asoenth aoesnth aoesnthaoseuthaoseuthaoesunthaoeusnh asoentuh */ - -/* 👩‍❤️‍💋‍👩 */ - #include <assert.h> #include <stdarg.h> #include <stdint.h> @@ -10,6 +6,7 @@ #include <string.h> #include <unistd.h> +#include "utf8.h" #include "ui.h" typedef struct { @@ -29,10 +26,10 @@ typedef struct { typedef struct { Buffer *buf; - int line_num; - int col_num; - int row_offset; - int col_offset; + int line_idx; + int col_idx; + int row_off; + int col_off; int screen_rows; int screen_cols; //int pref_col; @@ -260,10 +257,10 @@ View * view_create(Buffer *b) { View *v = ecalloc(1, sizeof(View)); - v->line_num = 0; - v->col_num = 0; - v->row_offset = 0; - v->col_offset = 0; + v->line_idx = 0; + v->col_idx = 0; + v->row_off = 0; + v->col_off = 0; v->buf = b; ui->get_window_size(&v->screen_rows, &v->screen_cols); @@ -285,73 +282,85 @@ view_cursor_fix(View *v) { /* actual invariant for the cursor */ void view_cursor_hfix(View *v) { - Line *l = v->buf->lines[v->line_num]; + Line *l = v->buf->lines[v->line_idx]; - if (v->col_num < 0) v->col_num = 0; - if (v->col_num > l->len) v->col_num = l->len; + if (v->col_idx < 0) v->col_idx = 0; + if (v->col_idx > l->len) v->col_idx = l->len; } void view_cursor_vfix(View *v) { - if (v->line_num >= v->buf->lines_tot) - v->line_num = v->buf->lines_tot - 1; - if (v->line_num < 0) - v->line_num = 0; + 
if (v->line_idx >= v->buf->lines_tot) + v->line_idx = v->buf->lines_tot - 1; + if (v->line_idx < 0) + v->line_idx = 0; } void view_cursor_left(View *v) { - if(v->col_num) - --v->col_num; + /* TODO: decode UTF-8 */ + if(v->col_idx) + --v->col_idx; } void view_cursor_right(View *v) { - Line *l = v->buf->lines[v->line_num]; + Line *l = v->buf->lines[v->line_idx]; - if(v->col_num < l->len) - ++v->col_num; + if(v->col_idx < l->len) { + int len = ui->text_len(l->buf + v->col_idx, l->len - v->col_idx); + v->col_idx += len; + } } void view_cursor_up(View *v) { - if(v->line_num) { - --v->line_num; + if(v->line_idx) { + --v->line_idx; view_cursor_hfix(v); } } void view_cursor_down(View *v) { - if(v->line_num < v->buf->lines_tot - 1) { - ++v->line_num; + if(v->line_idx < v->buf->lines_tot - 1) { + ++v->line_idx; view_cursor_hfix(v); } } int -view_idx2col(View *v, Line *line, int idx) { - (void)v; - if(!line->len) return 0; - return measure_span(line->buf, idx, 0); +view_idx2col(View *v, Line *line, int target_idx) { + int x = 0; + int i = 0; + size_t len; + + if (target_idx > line->len) target_idx = line->len; + + while (i < target_idx) { + len = ui->text_len(line->buf + i, line->len - i); + x += ui->text_width(line->buf + i, len, x); + i += len; + } + return x; } void view_scroll_fix(View *v) { /* vertical */ - if (v->line_num < v->row_offset) - v->row_offset = v->line_num; - if (v->line_num >= v->row_offset + v->screen_rows) - v->row_offset = v->line_num - v->screen_rows + 1; + if (v->line_idx < v->row_off) + v->row_off = v->line_idx; + if (v->line_idx >= v->row_off + v->screen_rows) + v->row_off = v->line_idx - v->screen_rows + 1; /* horizontal */ - Line *l = buffer_get_line(v->buf, v->line_num); - int vx = view_idx2col(v, l, v->col_num); + Line *l = buffer_get_line(v->buf, v->line_idx); + int vx = view_idx2col(v, l, v->col_idx); - if(vx < v->col_offset) - v->col_offset = vx; - if(vx >= v->col_offset + v->screen_cols) - v->col_offset = vx - v->screen_cols + 1; + if(vx < 
v->col_off) + v->col_off = vx; + if(vx >= v->col_off + v->screen_cols) + v->col_off = vx - v->screen_cols + 1; } int @@ -372,7 +381,7 @@ render(Cell *cells, char *buf, int buflen, int xoff, int cols) { int w, len, x; while(i < buflen) { - len = 1; /* TODO: decode UTF8 */ + len = ui->text_len(buf + i, buflen - i); w = ui->text_width(buf + i, len, vx); if(vx + w <= xoff) goto next; /* horizontal scroll */ @@ -381,16 +390,15 @@ render(Cell *cells, char *buf, int buflen, int xoff, int cols) { if(x >= cols) break; /* screen has been filled */ if(x + w > cols) break; /* truncated character (TODO: draw a symbol?) */ - if(len > CELL_POOL_THRESHOLD) { - /* TODO: manage pool */ - die("Arena pool to be implemented.\n"); - } - else { + if(len > CELL_POOL_THRESHOLD) + cells[nc].data.pool_idx = textpool_insert(&ui->pool, buf + i, len); + else memcpy(cells[nc].data.text, buf + i, len); - } cells[nc].len = len; cells[nc].width = w; + + /* TODO: handle truncated multi-column characters */ if(vx < xoff) cells[nc].width -= xoff - vx; /* partial rendering */ ++nc; @@ -408,12 +416,12 @@ view_place_cursor(View *v) { Line *l; int x, y; - x = v->col_offset; - l = buffer_get_line(v->buf, v->line_num); + x = v->col_off; + l = buffer_get_line(v->buf, v->line_idx); if(l) { - x = view_idx2col(v, l, v->col_num); - x -= v->col_offset; - y = v->line_num - v->row_offset; + x = view_idx2col(v, l, v->col_idx); + x -= v->col_off; + y = v->line_idx - v->row_off; } else { x = y = 0; } @@ -425,19 +433,20 @@ draw_view(View *v) { Line *l; int row, y, nc; + ui->pool.len = 0; ui->frame_start(); view_scroll_fix(v); Cell *cells = ecalloc(1, sizeof(Cell) * v->screen_cols); for(y = 0; y < v->screen_rows; y++) { - row = v->row_offset + y; + row = v->row_off + y; l = buffer_get_line(v->buf, row); if(!l) { ui->draw_symbol(0, y, SYM_EMPTYLINE); continue; } - nc = render(cells, l->buf, l->len, v->col_offset, v->screen_cols); + nc = render(cells, l->buf, l->len, v->col_off, v->screen_cols); ui->draw_line(ui, 0, y, 
cells, nc); } @@ -459,12 +468,12 @@ textpool_ensure_cap(TextPool *pool, int len) { int textpool_insert(TextPool *pool, char *s, int len) { - int olen = len; + int idx = pool->len; textpool_ensure_cap(pool, len); memcpy(pool->data + pool->len, s, len); pool->len += len; - return olen; + return idx; } char * @@ -483,30 +492,40 @@ run(void) { switch(ev.type) { case EV_KEY: if(ev.key == 'k') view_cursor_up(vcur); + else if(ev.key == 'p') { + Line *l = vcur->buf->lines[vcur->line_idx]; + fprintf(stderr, "debug current line (%d):\n", vcur->line_idx); + fprintf(stderr, "=== START LINE ===\n"); + for(int i = 0; i < l->len; i++) { + if(!(i % 10)) fprintf(stderr, "\n"); + fprintf(stderr, " 0x%0x", l->buf[i]); + } + fprintf(stderr, "\n=== END LINE ==="); + } else if(ev.key == 'j') view_cursor_down(vcur); else if(ev.key == 'h') view_cursor_left(vcur); else if(ev.key == 'l') view_cursor_right(vcur); else if(ev.key == 'q') running = 0; else if(ev.key == 'D') { - buffer_delete_line(vcur->buf, vcur->line_num, 1); + buffer_delete_line(vcur->buf, vcur->line_idx, 1); view_cursor_fix(vcur); } else if(ev.key == 'K') { Line *l = line_create(NULL); - buffer_insert_line(vcur->buf, vcur->line_num, l); + buffer_insert_line(vcur->buf, vcur->line_idx, l); /* we should call view_cursor_hfix() here since we're moving into - * another line (the new one). Since only col_num may be wrong we + * another line (the new one). Since only col_idx may be wrong we * can avoid a function call by setting it manually. */ - vcur->col_num = 0; + vcur->col_idx = 0; } else if(ev.key == 'J' || ev.key == '\n') { Line *l = line_create(NULL); - buffer_insert_line(vcur->buf, vcur->line_num + 1, l); + buffer_insert_line(vcur->buf, vcur->line_idx + 1, l); view_cursor_down(vcur); } else { /* TODO: view_insert_text()? 
*/ - line_insert_text(vcur->buf->lines[vcur->line_num], vcur->col_num, (char *)&ev.key, 1); - vcur->col_num += 1; + line_insert_text(vcur->buf->lines[vcur->line_idx], vcur->col_idx, (char *)&ev.key, 1); + vcur->col_idx += 1; } break; case EV_UKN: diff --git a/tui.c b/tui.c @@ -1,4 +1,6 @@ -#include "ui.h" +#define _XOPEN_SOURCE +#define _BSD_SOURCE +#include <wchar.h> #include <assert.h> #include <stdarg.h> @@ -10,6 +12,9 @@ #include <termios.h> #include <unistd.h> +#include "utf8.h" +#include "ui.h" + #define CURPOS "\33[%d;%dH" //#define CLEARLEFT "\33[1K" #define CLEARRIGHT "\33[0K" @@ -22,10 +27,13 @@ typedef struct { int cap; } Abuf; +/* globals */ struct termios origti; struct winsize ws; Abuf frame; +int compat_mode; +/* TODO: edo.h? */ extern void *ecalloc(size_t nmemb, size_t size); extern void *erealloc(void *p, size_t size); extern void die(const char *fmt, ...); @@ -40,6 +48,7 @@ void ab_flush(Abuf *ab); void tui_frame_start(void); void tui_frame_flush(void); int tui_text_width(char *s, int len, int x); +int tui_text_len(char *s, int len); void tui_get_window_size(int *rows, int *cols); void tui_exit(void); void tui_move_cursor(int x, int y); @@ -117,17 +126,40 @@ int tui_text_width(char *s, int len, int x) { int tabstop = 8; int w = 0, i; + int step, wc; + unsigned int cp; - for(i = 0; i < len; i++) { - if(s[i] == '\t') + for(i = 0; i < len; i += step) { + step = utf8_decode(s + i, len - i, &cp); + if(cp == '\t') { w += tabstop - x % tabstop; - else - ++w; - x += w; + continue; + } + if(compat_mode && cp == 0x200D) { + w += 6; // <200d> + continue; + } + wc = wcwidth(cp); + if(!compat_mode) { + /* force 2 cells width for emoji followed by VS16 */ + int nxi = i + step; + if(nxi < len) { + unsigned int nxcp; + utf8_decode(s + nxi, len - nxi, &nxcp); + if(nxcp == 0xFE0F && ((cp >= 0x203C && cp <= 0x3299) || cp >= 0x1F000)) + wc = 2; + } + } + if(wc > 0) w += wc; } return w; } +int +tui_text_len(char *s, int len) { + return compat_mode ? 
utf8_len_compat(s, len) : utf8_len(s, len); +} + void tui_get_window_size(int *rows, int *cols) { *rows = ws.ws_row; @@ -149,19 +181,57 @@ tui_move_cursor(int c, int r) { void tui_draw_line(UI *ui, int x, int y, Cell *cells, int count) { assert(x < ws.ws_col && y < ws.ws_row); - int w = 0, i; char *txt; + unsigned int cp = 0; + int was_emoji = 0; + int i; tui_move_cursor(x, y); for(i = 0; i < count; i++) { - w += cells[i].width; - txt = cell_get_text(&cells[i], ui->pool.data); + x += cells[i].width; + txt = cell_get_text(cells + i, ui->pool.data); /* TODO: temp code for testing, we'll se how to deal with this later */ if(txt[0] == '\t') ab_printf(&frame, "%*s", cells[i].width, " "); - else - ab_write(&frame, txt, cells[i].len); + else { + if(compat_mode) { + int o = 0; + while(o < cells[i].len) { + int step = utf8_decode(txt + o, cells[i].len - o, &cp); + + if(cp == 0x200D) { + ab_write(&frame, "<200d>", cells[i].width); + } + else { + if(was_emoji) { + const char t[] = "\x1b[0;48;5;232m"; + ab_write(&frame, t, sizeof t); + } + ab_write(&frame, txt + o, step); + if(was_emoji) { + const char t[] = "\x1b[0m"; + ab_write(&frame, t, sizeof t); + } + } + o += step; + + } + } + else + ab_write(&frame, txt, cells[i].len); + } + + /* Instead of check for 0xFE0F (or detect other problematic emojis) just reposition + * the cursor for any multi-byte grapheme cluster. This may be further optimized in + * the future but for now it's a robust way to solve any eventual incongruences. + * It's important to skip this for "glue" char to avoid breaking sequences which + * must always be adiacent (e.g. regional indicator symbols). 
*/ + int is_glue = (cp >= 0x1F1E6 && cp <= 0x1F1FF); + if(cells[i].len > 1 && !is_glue) tui_move_cursor(x, y); + + if(compat_mode) was_emoji = cells[i].len > 1 && !is_glue; + } ab_write(&frame, CLEARRIGHT, strlen(CLEARRIGHT)); } @@ -192,6 +262,10 @@ tui_init(void) { tcsetattr(0, TCSAFLUSH, &ti); setbuf(stdout, NULL); ioctl(0, TIOCGWINSZ, &ws); + + /* check for compat mode */ + /* TODO: auto-detect */ + compat_mode = 0; } int @@ -225,6 +299,7 @@ UI ui_tui = { .frame_start = tui_frame_start, .frame_flush = tui_frame_flush, .text_width = tui_text_width, + .text_len = tui_text_len, .move_cursor = tui_move_cursor, .draw_line = tui_draw_line, .draw_symbol = tui_draw_symbol, diff --git a/ui.h b/ui.h @@ -46,6 +46,7 @@ struct UI { void (*frame_start)(void); void (*frame_flush)(void); int (*text_width)(char *s, int len, int x); + int (*text_len)(char *s, int len); void (*move_cursor)(int x, int y); void (*draw_line)(UI *ui, int x, int y, Cell *cells, int count); void (*draw_symbol)(int r, int c, Symbol sym); diff --git a/utf8.c b/utf8.c @@ -0,0 +1,35 @@ +#define _XOPEN_SOURCE +#include <wchar.h> +#include <grapheme.h> + +int utf8_len(char *buf, int len); +int utf8_decode(char *buf, int len, unsigned int *cp); +size_t utf8_len_compat(char *buf, int len); + +size_t +utf8_len_compat(char *buf, int len) { + int i = 0, step, next; + uint_least32_t cp; + + step = utf8_decode(buf, len, &cp); + i += step; + + if(cp == 0x200D) return i; + while(i < len) { + next = utf8_decode(buf + i, len - i, &cp); + if(cp == 0x200D) break; + if(wcwidth((wchar_t)cp) > 0) break; + i += next; + } + return i; +} + +int +utf8_len(char *buf, int len) { + return grapheme_next_character_break_utf8(buf, len); +} + +int +utf8_decode(char *buf, int len, unsigned int *cp) { + return grapheme_decode_utf8(buf, len, (uint_least32_t *)cp); +} diff --git a/utf8.h b/utf8.h @@ -0,0 +1,5 @@ +#include <stdlib.h> + +int utf8_len(char *buf, int len); +size_t utf8_len_compat(char *buf, int len); +int utf8_decode(char 
*buf, int len, unsigned int *cp);