diff --git a/unicode/Makefile b/unicode/Makefile
index facdada..fc0b6c0 100644
--- a/unicode/Makefile
+++ b/unicode/Makefile
@@ -1,2 +1,12 @@
-utf8: utf8.c
-	$(CC) -I../ascii -g -o utf8 utf8.c
+utf8: utf8.o libunicode.o
+	$(CC) -g -o utf8 libunicode.o utf8.o
+
+libunicode.o: libunicode.c libunicode.h
+
+utf8.o: libunicode.h utf8.c
+	$(CC) -I../ascii -c -g -o utf8.o utf8.c
+
+clean:
+	rm -f *.o utf8
+
+.PHONY: clean
diff --git a/unicode/libunicode.c b/unicode/libunicode.c
new file mode 100644
index 0000000..bed1533
--- /dev/null
+++ b/unicode/libunicode.c
@@ -0,0 +1,33 @@
+#include "libunicode.h"
+
+/* Encode the code point c as UTF-8 and return the bytes packed into a single
+ * integer, lead byte most significant: U+20AC yields 0xE282AC.
+ *
+ * n counts how many of the three length thresholds c lies below, so the
+ * sequence is 4 - n bytes long.  The switch enters at the lead byte for that
+ * length and deliberately falls through to append each continuation byte.
+ *
+ * c is not validated: surrogates are encoded as-is and only the low 21 bits
+ * of values above 0x1FFFFF are used. */
+utf8_t utf8(utf32_t c){
+	unsigned char n;
+	utf8_t r;
+
+	r = 0;
+	switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){
+	case 0: r = 0xF0 + ((c >> 18) & 0x07); /* 11110xxx */
+		/* fallthrough */
+	case 1: r = (r << 8) + (n == 1
+			? 0xE0 + ((c >> 12) & 0x0F) /* 1110xxxx */
+			: 0x80 + ((c >> 12) & 0x3F)); /* 10xxxxxx */
+		/* fallthrough */
+	case 2: r = (r << 8) + (n == 2
+			? 0xC0 + ((c >> 6) & 0x1F) /* 110xxxxx */
+			: 0x80 + ((c >> 6) & 0x3F)); /* 10xxxxxx */
+		/* fallthrough */
+	case 3: r = (r << 8) + (n == 3
+			? c & 0x7F /* 0xxxxxxx */
+			: 0x80 + (c & 0x3F)); /* 10xxxxxx */
+	}
+	return r;
+}
diff --git a/unicode/libunicode.h b/unicode/libunicode.h
new file mode 100644
index 0000000..ea9679f
--- /dev/null
+++ b/unicode/libunicode.h
@@ -0,0 +1,24 @@
+#ifndef LIBUNICODE_H
+#define LIBUNICODE_H
+
+/* NOTE(review): which branch is taken depends on whether <stdint.h> was
+ * included before this header, so every translation unit sharing utf8() must
+ * agree on that, or unicode_codepoint_t differs in width between them wherever
+ * unsigned long is wider than 32b. */
+#if defined UINT32_MAX /* indicator <stdint.h> is included */
+# define unicode_codepoint_t uint32_t
+#else
+	/* C99 draft 5.2.4.2.1 "Sizes of integer types <limits.h>" says unsigned
+	 * long must be able to hold 32b */
+# define unicode_codepoint_t unsigned long
+#endif
+#define utf32_t unicode_codepoint_t
+/* for holding the literal numeric value of a utf8 rune, not for assembling
+ * strings; >=32b value */
+#define utf8_t unicode_codepoint_t
+
+/* encode UTF-32 value into UTF-8; the encoded bytes are returned packed into
+ * one integer, lead byte most significant */
+utf8_t utf8(utf32_t c);
+
+#endif /* LIBUNICODE_H */
diff --git a/unicode/utf8.c b/unicode/utf8.c
index 1bed9bf..8632b40 100644
--- a/unicode/utf8.c
+++ b/unicode/utf8.c
@@ -1,7 +1,8 @@
-#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */
-#include <string.h> /* memset(3) */
+#include <stdio.h> /* fprintf(3), getc(3), putc(3) stderr, stdin, EOF */
+#include <string.h> /* strchr(3) */
 #include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,
                     * ASCII_HEXADECIMAL_DIGITS_UPPER */
+#include "libunicode.h" /* utf8(3) */
 
 #define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)
 static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER
@@ -12,16 +13,15 @@ int main(int argc, char *argv[]){
 	int i;
 	int l; /* line counter */
 	char *n;
-	long unsigned int utf32_lit;
-	unsigned char utf8_bytes[8];
+	utf32_t codepoint;
+	utf8_t encoded;
 
 	c = '\0';
 	i = 0;
 	l = 1;
 	while(c != EOF){
-		memset(utf8_bytes, '0', sizeof utf8_bytes);
 		for(
-			i = 0, n = NULL, utf32_lit = 0;
+			i = 0, n = NULL, codepoint = 0;
 			(c = getc(stdin)) != '\n'
 				&& c != EOF
 				&& i < 10;
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]){
 				break;
 			}
 			if(n != NULL)
-				utf32_lit = (utf32_lit << 4) + (n - hex) % 16;
+				codepoint = (codepoint << 4) + (n - hex) % 16;
 		}
 		if(i < 3){
 			if(c != '\n' && c != EOF)
@@ -50,24 +50,10 @@ int main(int argc, char *argv[]){
 				return 0;
 			continue;
 		}
-		/* something with the bit math is broken */
-		switch(i = (utf32_lit < 0x10000)
-				+ (utf32_lit < 0x0800)
-				+ (utf32_lit < 0x0080)){
-		case 0: utf8_bytes[0] =
-			0xF0 + ((utf32_lit >> 18) & 0x07); /* 11110xxx */
-		case 1: utf8_bytes[1] = i == 1
-			? 0xE0 + ((utf32_lit >> 12) & 0x0F) /* 1110xxxx */
-			: 0x50 + ((utf32_lit >> 12) & 0x3F); /* 10xxxxxx */
-		case 2: utf8_bytes[2] = i == 2
-			? 0xC0 + ((utf32_lit >> 6) & 0x1F) /* 110xxxxx */
-			: 0x50 + ((utf32_lit >> 6) & 0x3F); /* 10xxxxxx */
-		case 3: utf8_bytes[3] = i == 3
-			? utf8_bytes[3] = utf32_lit & 0x7F /* 0xxxxxxx */
-			: 0x50 + (utf32_lit & 0x3F); /* 10xxxxxx */
-		}
-		for( ; i < 4; ++i)
-			putc(utf8_bytes[i], stdout);
+		encoded = utf8(codepoint);
+		for(i = 3; i >= 0; --i)
+			if((encoded >> 8 * i) > 0 || i == 0)
+				putc(encoded >> 8 * i, stdout);
 		++l;
 	}
 }