From 039a17f454c84ac5a8b733937cf8717ccc45276b Mon Sep 17 00:00:00 2001 From: DTB Date: Mon, 4 Sep 2023 09:24:37 -0400 Subject: [PATCH] more work --- unicode/libunicode.c | 2 ++ unicode/libunicode.h | 2 ++ unicode/utf8.c | 61 ++++++++++++++++++++------------------------ 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/unicode/libunicode.c b/unicode/libunicode.c index bed1533..6b78b4f 100644 --- a/unicode/libunicode.c +++ b/unicode/libunicode.c @@ -4,6 +4,8 @@ utf8_t utf8(utf32_t c){ unsigned char n; utf8_t r; + if(c > UTF8_MAX) + c = 0; r = 0; switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){ case 0: r = 0xF0 + ((c >> 18) & 0x07); /* 11110xxx */ diff --git a/unicode/libunicode.h b/unicode/libunicode.h index ea9679f..3d1fb48 100644 --- a/unicode/libunicode.h +++ b/unicode/libunicode.h @@ -10,5 +10,7 @@ * strings; >=32b value */ #define utf8_t unicode_codepoint_t +#define UTF8_MAX 0x10FFFF + /* encode UTF-32 value into UTF-8 */ utf8_t utf8(utf32_t c); diff --git a/unicode/utf8.c b/unicode/utf8.c index 8632b40..9114c6b 100644 --- a/unicode/utf8.c +++ b/unicode/utf8.c @@ -16,44 +16,39 @@ int main(int argc, char *argv[]){ utf32_t codepoint; utf8_t encoded; - c = '\0'; + l = 0; + +init: codepoint = 0; i = 0; - l = 1; - while(c != EOF){ - for( - i = 0, n = NULL, codepoint = 0; - (c = getc(stdin)) != '\n' - && c != EOF - && i < 10; - ++i - ){ - if( - (i == 0 && c != 'U') - || (i == 1 && c != '+') - || i > 10 - || (i > 1 && (n = strchr(hex, c)) - == NULL) - ){ + ++l; + n = NULL; + while((c = getc(stdin)) != EOF){ + if(c == '\n'){ + if(i < 2 && i > 0) /* empty lines are fine */ fprintf(stderr, "%s: %s: Syntax error.\n", argv[0], l); - SKIPLINE; - i = -1; - break; + else if(i >= 2){ + encoded = utf8(codepoint); + for(i = 3; i >= 0; --i) + if((encoded >> 8 * i) > 0 || i == 0) + putc(encoded >> 8 * i, stdout); } - if(n != NULL) - codepoint = (codepoint << 4) + (n - hex) % 16; + goto init; } - if(i < 3){ - if(c != '\n' && c != EOF) - SKIPLINE; - if(c == EOF) - return 0; - continue; + if( + (i == 0 && c != 'U') + || (i == 1 && c != '+') + || i > 8 /* strlen("U+10FFFF") */ + || (i > 1 && ((n = strchr(hex, c)) == NULL)) + ){ + fprintf(stderr, "%s: %s: Syntax error.\n", + argv[0], l); + while((c = getc(stdin)) != '\n' && c != EOF); + ++l; + continue; } - encoded = utf8(codepoint); - for(i = 3; i >= 0; --i) - if((encoded >> 8 * i) > 0 || i == 0) - putc(encoded >> 8 * i, stdout); - ++l; + if(n != NULL) + codepoint = (codepoint << 4) + (n - hex) % 16; + ++i; } }