diff --git a/toki/toki_sitelen b/toki/toki_sitelen index 717a47a..7a576d8 100755 --- a/toki/toki_sitelen +++ b/toki/toki_sitelen @@ -1,89 +1,5 @@ #!/bin/sh -toki_ucsur "$@" | while read -r codepoint; do - # normalize to U+000000 - codepoint="$(printf '%s\n' "$codepoint" | sed 's/^U+//')" - codepoint="U+$( \ - dd if=/dev/zero bs=1 count=$( \ - printf '%s\n' "$codepoint" \ - | wc -c \ - | xargs printf '7 - %s\n' \ - | bc \ - ) 2>/dev/null | tr '\0' 0)$codepoint" +set -e - codepoint_bin="$(printf '%s\n' "$codepoint" \ - | sed \ - -e 's/^U+//' -e 's/0/0000/g' -e 's/1/0001/g' \ - -e 's/2/0010/g' -e 's/3/0011/g' -e 's/4/0100/g' \ - -e 's/5/0101/g' -e 's/6/0110/g' -e 's/7/0111/g' \ - -e 's/8/1000/g' -e 's/9/1001/g' -e 's/A/1010/g' \ - -e 's/B/1011/g' -e 's/C/1100/g' -e 's/D/1101/g' \ - -e 's/E/1110/g' -e 's/F/1111/g')" - - printf '%s\n' "$codepoint_bin" \ - | dd bs=17 count=1 2>/dev/null \ - | grep 1 2>/dev/null 1>&2 \ - || bytes=1 - printf '%s\n' "$codepoint_bin" \ - | dd bs=13 count=1 2>/dev/null \ - | grep 1 2>/dev/null 1>&2\ - || bytes=2 - printf '%s\n' "$codepoint_bin" \ - | dd bs=8 count=1 2>/dev/null \ - | grep 1 2>/dev/null 1>&2 \ - && bytes=4 \ - || bytes=3 - - # TODO: How to bring bin,oct, or hex to actual binary in POSIX? - - utf8_bin="$(case $bytes in \ - 1) printf '0%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 8)" ;; \ - 2) printf '110%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 12 | head -c 5)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 7)" ;; \ - 3) printf '1110%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 17 | head -c 4)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 13 | head -c 6)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 7)" ;; \ - 4) printf '11110%s10%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 22 | head -c 3)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 19 | head -c 6)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 13 | head -c 6)" \ - "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 7)" ;; \ - esac)" - - utf8_oct="$(dd if=/dev/zero bs=1 count=$( \ - printf '%s\n' "$utf8_bin" \ - | wc -c \ - | xargs printf '34 - %s\n' \ - | bc \ - ) 2>/dev/null \ - | tr '\0' 0 \ - | xargs printf "%s$utf8_bin\n" \ - | sed 's/.../& /g' \ - | sed \ - -e 's/000/0/g' -e 's/001/1/g' -e 's/010/2/g' \ - -e 's/011/3/g' -e 's/100/4/g' -e 's/101/5/g' \ - -e 's/110/6/g' -e 's/111/7/g' \ - | tr -d ' ')" - - # a little fucky - utf8_hex="$(printf '%s\n' "$utf8_bin" \ - | sed \ - -e 's/0000/0/g' -e 's/0001/1/g' -e 's/0010/2/g' \ - -e 's/0011/3/g' -e 's/0100/4/g' -e 's/0101/5/g' \ - -e 's/0110/6/g' -e 's/0111/7/g' -e 's/1000/8/g' \ - -e 's/1001/9/g' -e 's/1010/A/g' -e 's/1011/B/g' \ - -e 's/1100/C/g' -e 's/1101/D/g' -e 's/1110/E/g' \ - -e 's/1111/F/g')" - - printf '%s\n' "$utf8_bin" - shift -done +toki_ucsur "$@" | utf8 diff --git a/unicode/Makefile b/unicode/Makefile new file mode 100644 index 0000000..facdada --- /dev/null +++ b/unicode/Makefile @@ -0,0 +1,2 @@ +utf8: utf8.c + $(CC) -I../ascii -g -o utf8 utf8.c diff --git a/unicode/utf8.c b/unicode/utf8.c new file mode 100644 index 0000000..bfe780f --- /dev/null +++ b/unicode/utf8.c @@ -0,0 +1,105 @@ +#include /* fprintf(3), getc(3), stderr, stdin, EOF */ +#include /* memset(3) */ +#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER, + * ASCII_HEXADECIMAL_DIGITS_UPPER */ + +static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER + ASCII_HEXADECIMAL_DIGITS_LOWER; + +void print_hexascii(unsigned char *hexes, int n){ + if(n % 2 != 0) + return; + while(n --> 0){ + putc((char)(((hex - strchr(hex, hexes[0])) << 4) + + (hex - strchr(hex, hexes[1]))), stdout); + ++hexes; + } +} + +int main(int argc, char *argv[]){ + int c; + int i; + int l; /* line counter */ + char *n; + unsigned char utf32_hex[8]; /* nybbles */ + long int utf32_lit; + unsigned char utf8_hex[8]; /* nybbles */ + long int utf8_lit; + + c = '\0'; + i = 0; + l = 1; + while(c != EOF){ + memset(utf32_hex, 0, sizeof utf32_hex); + memset(utf8_hex, '0', sizeof utf8_hex); + for( + i = 0, n = NULL, utf32_lit = 0, utf8_lit = 0; + (c = getc(stdin)) != '\n' + && c != EOF + && i < (sizeof utf32_hex) / (sizeof *utf32_hex); + ++i + ){ + if( + (i == 0 && c != 'U') + || (i == 1 && c != '+') + || i > 10 + || (i > 1 && (n = strchr(hex, c)) + == NULL) + ){ + fprintf(stderr, "%s: %s: Syntax error.\n", + argv[0], l); + while((c = getc(stdin)) != '\n' && c != EOF); + i = -1; + break; + } + if(n != NULL){ + n -= 16; + utf32_hex[i - 2] = *(n -= 16 * (n - hex > 16)); + }else if(i >= 2) + utf32_hex[i - 2] = c; + } + if(i == -1 || i < 3) + continue; + while(utf32_hex[7] == '\0'){ /* slow but easy */ + for(i = 0; i < 7; ++i) + utf32_hex[i + 1] = utf32_hex[i]; + utf32_hex[0] = '0'; + } + /* this code is embarrassing */ + for(i = 0; i < 8; ++i) + utf32_lit = (utf32_lit << 4) + + strchr(hex, utf32_hex[i]) - hex; + if(utf32_lit < 128){ + utf8_hex[7] = utf32_hex[7]; + utf8_hex[6] = utf32_hex[6]; + i = 6; + goto done; + }else{ + utf8_hex[7] = hex[utf32_lit & 15]; + utf8_hex[6] = hex[((utf32_lit >> 4) & 3) + 8]; + } + if(utf32_lit < 2048){ + utf8_hex[5] = hex[(utf32_lit >> 6) & 15]; + utf8_hex[4] = hex[((utf32_lit >> 10) & 1) + 12]; + i = 4; + goto done; + }else{ + utf8_hex[5] = hex[(utf32_lit >> 6) & 15]; + utf8_hex[4] = hex[((utf32_lit >> 10) & 3) + 8]; + } + if(utf32_lit < 65536){ + utf8_hex[3] = hex[(utf32_lit >> 12) & 15]; + utf8_hex[2] = 14; + i = 2; + goto done; + }else{ + utf8_hex[3] = hex[(utf32_lit >> 12) & 15]; + utf8_hex[2] = hex[((utf32_lit >> 16) & 3) + 8]; + utf8_hex[1] = hex[(utf32_lit >> 21) & 3]; + utf8_hex[0] = hex[15]; + i = 0; + } +done: print_hexascii(utf8_hex + i, 8 - i); + ++l; + } +}