From 1689f7822599a011434c0f0645fac96f4d4cb906 Mon Sep 17 00:00:00 2001 From: DTB Date: Sun, 3 Sep 2023 02:23:32 -0400 Subject: [PATCH] might work now --- unicode/utf8.c | 91 ++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/unicode/utf8.c b/unicode/utf8.c index bfe780f..24f6c7b 100644 --- a/unicode/utf8.c +++ b/unicode/utf8.c @@ -2,41 +2,29 @@ #include /* memset(3) */ #include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER, * ASCII_HEXADECIMAL_DIGITS_UPPER */ +#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF) static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER ASCII_HEXADECIMAL_DIGITS_LOWER; -void print_hexascii(unsigned char *hexes, int n){ - if(n % 2 != 0) - return; - while(n --> 0){ - putc((char)(((hex - strchr(hex, hexes[0])) << 4) - + (hex - strchr(hex, hexes[1]))), stdout); - ++hexes; - } -} - int main(int argc, char *argv[]){ int c; int i; int l; /* line counter */ char *n; - unsigned char utf32_hex[8]; /* nybbles */ long int utf32_lit; - unsigned char utf8_hex[8]; /* nybbles */ - long int utf8_lit; + unsigned char utf8_bytes[8]; c = '\0'; i = 0; l = 1; while(c != EOF){ - memset(utf32_hex, 0, sizeof utf32_hex); - memset(utf8_hex, '0', sizeof utf8_hex); + memset(utf8_bytes, '0', sizeof utf8_bytes); for( - i = 0, n = NULL, utf32_lit = 0, utf8_lit = 0; + i = 0, n = NULL, utf32_lit = 0; (c = getc(stdin)) != '\n' && c != EOF - && i < (sizeof utf32_hex) / (sizeof *utf32_hex); + && i < 10; ++i ){ if( @@ -48,58 +36,37 @@ int main(int argc, char *argv[]){ ){ fprintf(stderr, "%s: %s: Syntax error.\n", argv[0], l); - while((c = getc(stdin)) != '\n' && c != EOF); + SKIPLINE; i = -1; break; } - if(n != NULL){ - n -= 16; - utf32_hex[i - 2] = *(n -= 16 * (n - hex > 16)); - }else if(i >= 2) - utf32_hex[i - 2] = c; + if(n != NULL) + utf32_lit = (utf32_lit << 4) + (n - hex) % 16; } - if(i == -1 || i < 3) + if(i < 3){ + if(c != '\n' && c != EOF) + SKIPLINE; + if(c == EOF) + return 0; continue; - while(utf32_hex[7] == '\0'){ /* slow but easy */ - for(i = 0; i < 7; ++i) - utf32_hex[i + 1] = utf32_hex[i]; - utf32_hex[0] = '0'; } - /* this code is embarrassing */ - for(i = 0; i < 8; ++i) - utf32_lit = (utf32_lit << 4) - + strchr(hex, utf32_hex[i]) - hex; - if(utf32_lit < 128){ - utf8_hex[7] = utf32_hex[7]; - utf8_hex[6] = utf32_hex[6]; - i = 6; - goto done; - }else{ - utf8_hex[7] = hex[utf32_lit & 15]; - utf8_hex[6] = hex[((utf32_lit >> 4) & 3) + 8]; + switch(i = (utf32_lit < 65536) + + (utf32_lit < 2048) + + (utf32_lit < 128)){ + case 0: utf8_bytes[0] = + ((utf32_lit >> 18) & 7) + 240; /* 11110xxx */ + case 1: utf8_bytes[1] = i == 1 + ? ((utf32_lit >> 12) & 15) + 224 /* 1110xxxx */ + : ((utf32_lit >> 12) & 63) + 80; /* 10xxxxxx */ + case 2: utf8_bytes[2] = i == 2 + ? ((utf32_lit >> 6) & 31) + 192 /* 110xxxxx */ + : ((utf32_lit >> 6) & 63) + 80; /* 10xxxxxx */ + case 3: utf8_bytes[3] = i == 3 + ? utf8_bytes[3] = utf32_lit & 127 /* 0xxxxxxx */ + : (utf32_lit & 63) + 80; /* 10xxxxxx */ } - if(utf32_lit < 2048){ - utf8_hex[5] = hex[(utf32_lit >> 6) & 15]; - utf8_hex[4] = hex[((utf32_lit >> 10) & 1) + 12]; - i = 4; - goto done; - }else{ - utf8_hex[5] = hex[(utf32_lit >> 6) & 15]; - utf8_hex[4] = hex[((utf32_lit >> 10) & 3) + 8]; - } - if(utf32_lit < 65536){ - utf8_hex[3] = hex[(utf32_lit >> 12) & 15]; - utf8_hex[2] = 14; - i = 2; - goto done; - }else{ - utf8_hex[3] = hex[(utf32_lit >> 12) & 15]; - utf8_hex[2] = hex[((utf32_lit >> 16) & 3) + 8]; - utf8_hex[1] = hex[(utf32_lit >> 21) & 3]; - utf8_hex[0] = hex[15]; - i = 0; - } -done: print_hexascii(utf8_hex + i, 8 - i); + for( ; i < 4; ++i) + putc(utf8_bytes[i], stdout); ++l; } }