still doesn't work

This commit is contained in:
dtb 2023-09-03 00:24:59 -04:00
parent 9e7b296852
commit 8920624eb6
3 changed files with 109 additions and 86 deletions

View File

@ -1,89 +1,5 @@
# Process the output of toki_ucsur one line at a time; each line is read
# into $codepoint (presumably U+ followed by hex digits - TODO confirm
# against toki_ucsur).
toki_ucsur "$@" | while read -r codepoint; do
# Normalize to the fixed-width form U+000000.
# First drop a leading U+ if there is one.
codepoint="$(printf '%s\n' "$codepoint" | sed 's/^U+//')"
# Then left-pad with zeros. wc -c also counts the newline that printf
# appends, so 7 minus that count equals 6 minus the number of digits.
# dd emits that many NUL bytes and tr turns each of them into a 0.
codepoint="U+$( \
dd if=/dev/zero bs=1 count=$( \
printf '%s\n' "$codepoint" \
| wc -c \
| xargs printf '7 - %s\n' \
| bc \
) 2>/dev/null | tr '\0' 0)$codepoint"
# Enable errexit for the commands that follow.
set -e
# Expand every hex digit of $codepoint into its four-bit pattern, after
# removing the U+ prefix. sed applies the expressions in order, so the
# rules for 0 and 1 run before the later rules introduce new 0s and 1s.
# Only uppercase A-F are translated.
codepoint_bin="$(printf '%s\n' "$codepoint" \
| sed \
-e 's/^U+//' -e 's/0/0000/g' -e 's/1/0001/g' \
-e 's/2/0010/g' -e 's/3/0011/g' -e 's/4/0100/g' \
-e 's/5/0101/g' -e 's/6/0110/g' -e 's/7/0111/g' \
-e 's/8/1000/g' -e 's/9/1001/g' -e 's/A/1010/g' \
-e 's/B/1011/g' -e 's/C/1100/g' -e 's/D/1101/g' \
-e 's/E/1110/g' -e 's/F/1111/g')"
# Pick the UTF-8 sequence length from how many leading bits of the 24-bit
# $codepoint_bin are zero: 17 clear bits leave 7 significant bits (1 byte),
# 13 leave 11 (2 bytes), 8 leave 16 (3 bytes); anything wider needs 4.
# The checks run narrowest first and stop at the first hit. Run as
# independent commands, the last check always overwrote the earlier
# results, so $bytes could only ever end up as 3 or 4.
if ! printf '%s\n' "$codepoint_bin" \
	| dd bs=17 count=1 2>/dev/null \
	| grep 1 >/dev/null 2>&1
then
	bytes=1
elif ! printf '%s\n' "$codepoint_bin" \
	| dd bs=13 count=1 2>/dev/null \
	| grep 1 >/dev/null 2>&1
then
	bytes=2
elif ! printf '%s\n' "$codepoint_bin" \
	| dd bs=8 count=1 2>/dev/null \
	| grep 1 >/dev/null 2>&1
then
	bytes=3
else
	bytes=4
fi
# TODO: How to bring bin, oct, or hex to actual binary in POSIX?
# Build the UTF-8 bit string for a sequence of $bytes bytes: a length
# marker (0, 110, 1110 or 11110) followed by payload bits, with every
# further byte prefixed by 10. Payload bits are cut from the end of
# $codepoint_bin. printf appends a newline, so tail -c N keeps the last
# N-1 bits plus that newline, and head -c then takes the leading bits of
# that slice.
# NOTE(review): the esac and the closing of this command substitution are
# not visible in this excerpt - confirm against the full script.
utf8_bin="$(case $bytes in \
1) printf '0%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 8)" ;; \
2) printf '110%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 12 | head -c 5)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 7)" ;; \
3) printf '1110%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 17 | head -c 4)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 13 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 7)" ;; \
4) printf '11110%s10%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 22 | head -c 3)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 19 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 13 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \
| tail -c 7)" ;; \
# Octal rendering of the UTF-8 bits: left-pad $utf8_bin with zeros to 33
# bits (wc -c counts the trailing newline, hence the 34), split the result
# into three-bit groups separated by spaces, map each group to its octal
# digit, then remove the spaces.
utf8_oct="$(dd if=/dev/zero bs=1 count=$( \
printf '%s\n' "$utf8_bin" \
| wc -c \
| xargs printf '34 - %s\n' \
| bc \
) 2>/dev/null \
| tr '\0' 0 \
| xargs printf "%s$utf8_bin\n" \
| sed 's/.../& /g' \
| sed \
-e 's/000/0/g' -e 's/001/1/g' -e 's/010/2/g' \
-e 's/011/3/g' -e 's/100/4/g' -e 's/101/5/g' \
-e 's/110/6/g' -e 's/111/7/g' \
| tr -d ' ')"
# Hex rendering of the UTF-8 bits. The bits are first split into
# space-separated groups of four so that every pattern can only match a
# whole nybble. Without the separators a pattern such as 0000 could match
# across a nybble boundary (00001000 came out as 40 instead of 08). The
# separators are removed again at the end. The bit strings built by the
# case above are always a whole number of bytes, so no padding is needed.
utf8_hex="$(printf '%s\n' "$utf8_bin" \
	| sed 's/..../& /g' \
	| sed \
		-e 's/0000/0/g' -e 's/0001/1/g' -e 's/0010/2/g' \
		-e 's/0011/3/g' -e 's/0100/4/g' -e 's/0101/5/g' \
		-e 's/0110/6/g' -e 's/0111/7/g' -e 's/1000/8/g' \
		-e 's/1001/9/g' -e 's/1010/A/g' -e 's/1011/B/g' \
		-e 's/1100/C/g' -e 's/1101/D/g' -e 's/1110/E/g' \
		-e 's/1111/F/g' \
	| tr -d ' ')"
# Print the UTF-8 bit string. utf8_oct and utf8_hex are computed above but
# are not used in the visible lines.
printf '%s\n' "$utf8_bin"
# Pipe the output of toki_ucsur into the utf8 program.
# NOTE(review): presumably the binary built from unicode/utf8.c - confirm.
toki_ucsur "$@" | utf8

unicode/Makefile Normal file
View File

@ -0,0 +1,2 @@
# Build the utf8 program from utf8.c, with ../ascii on the include path
# and debugging information enabled. The recipe line must begin with a
# tab character.
utf8: utf8.c
	$(CC) -I../ascii -g -o $@ utf8.c

unicode/utf8.c Normal file
View File

@ -0,0 +1,105 @@
#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */
#include <string.h> /* memset(3) */
/*
 * Writes bytes to stdout, each built from a pair of characters in hexes:
 * the first character of the pair supplies the high nybble and the second
 * the low nybble. n is tested for being even and then used as a countdown.
 *
 * NOTE(review): hex is not declared in the visible lines; presumably it is
 * a string of hex digit characters declared elsewhere - confirm. Some lines
 * of this function (the body of the parity test, any step that advances
 * hexes, the closing braces) are also not visible in this excerpt.
 */
void print_hexascii(unsigned char *hexes, int n){
if(n % 2 != 0)
while(n --> 0){
/* NOTE(review): hex - strchr(hex, x) is the negated offset of x within
 * hex, whereas main computes the offset as strchr(hex, x) - hex. Confirm
 * which one is intended. */
putc((char)(((hex - strchr(hex, hexes[0])) << 4)
+ (hex - strchr(hex, hexes[1]))), stdout);
/*
 * Reads lines from standard input, checks that each starts with U+ and
 * continues with characters found in hex, and hands the nybbles of a UTF-8
 * encoding of that value to print_hexascii. Lines that fail the check are
 * reported on standard error.
 *
 * NOTE(review): several lines of this function (loop and if headers,
 * closing braces, the declaration of hex) are not visible in this excerpt,
 * so the comments below describe only what the visible lines show.
 */
int main(int argc, char *argv[]){
int c; /* most recent character returned by getc */
int i; /* position within the line; reused later as a nybble index */
int l; /* line counter */
char *n; /* result of looking c up in hex */
unsigned char utf32_hex[8]; /* nybbles */
long int utf32_lit; /* numeric value assembled from utf32_hex */
unsigned char utf8_hex[8]; /* nybbles */
long int utf8_lit; /* reset each pass; not otherwise used in the visible lines */
c = '\0';
i = 0;
l = 1;
while(c != EOF){
/* Start each pass with utf32_hex zero-filled and utf8_hex filled with
 * the character 0. */
memset(utf32_hex, 0, sizeof utf32_hex);
memset(utf8_hex, '0', sizeof utf8_hex);
i = 0, n = NULL, utf32_lit = 0, utf8_lit = 0;
/* Keep reading characters until a newline, EOF, or i reaching the
 * number of elements in utf32_hex (8). */
(c = getc(stdin)) != '\n'
&& c != EOF
&& i < (sizeof utf32_hex) / (sizeof *utf32_hex);
/* Syntax check: position 0 must be U, position 1 must be +, positions
 * above 10 are rejected, and characters after the prefix must occur in
 * hex. */
(i == 0 && c != 'U')
|| (i == 1 && c != '+')
|| i > 10
|| (i > 1 && (n = strchr(hex, c))
== NULL)
/* NOTE(review): l is declared int but is passed for a %s conversion. */
fprintf(stderr, "%s: %s: Syntax error.\n",
argv[0], l);
/* Discard the remainder of the offending line and mark the failure. */
while((c = getc(stdin)) != '\n' && c != EOF);
i = -1;
/* A character found in hex is stored through the adjusted pointer n;
 * any other character at position 2 or later is stored unchanged.
 * NOTE(review): n is moved back by 16 unconditionally and then by a
 * further 16 when it is still more than 16 past hex - presumably some
 * form of case folding; confirm against the layout of hex. */
if(n != NULL){
n -= 16;
utf32_hex[i - 2] = *(n -= 16 * (n - hex > 16));
}else if(i >= 2)
utf32_hex[i - 2] = c;
/* i is -1 after a syntax error. The body of this test is not visible
 * here. */
if(i == -1 || i < 3)
/* Shift the digits toward the end of utf32_hex until the last slot is
 * occupied, putting the character 0 in the first slot each time.
 * NOTE(review): as written, the ascending copy overwrites each slot
 * before that slot has itself been moved, so the value of utf32_hex[0]
 * is propagated through the whole array. */
while(utf32_hex[7] == '\0'){ /* slow but easy */
for(i = 0; i < 7; ++i)
utf32_hex[i + 1] = utf32_hex[i];
utf32_hex[0] = '0';
/* this code is embarrassing */
/* Fold the eight nybbles into utf32_lit, using the offset of each
 * character within hex as its value. */
for(i = 0; i < 8; ++i)
utf32_lit = (utf32_lit << 4)
+ strchr(hex, utf32_hex[i]) - hex;
/* Below 128: the two low nybbles are copied unchanged and output starts
 * at index 6. */
if(utf32_lit < 128){
utf8_hex[7] = utf32_hex[7];
utf8_hex[6] = utf32_hex[6];
i = 6;
goto done;
/* Last byte: low nybble is bits 0-3, high nybble is 8 plus bits 4-5. */
utf8_hex[7] = hex[utf32_lit & 15];
utf8_hex[6] = hex[((utf32_lit >> 4) & 3) + 8];
/* Below 2048: the leading byte is bits 6-9 under 12 plus bit 10, and
 * output starts at index 4. */
if(utf32_lit < 2048){
utf8_hex[5] = hex[(utf32_lit >> 6) & 15];
utf8_hex[4] = hex[((utf32_lit >> 10) & 1) + 12];
i = 4;
goto done;
/* Otherwise this byte is bits 6-9 under 8 plus bits 10-11. */
utf8_hex[5] = hex[(utf32_lit >> 6) & 15];
utf8_hex[4] = hex[((utf32_lit >> 10) & 3) + 8];
/* Below 65536: the leading byte carries bits 12-15, and output starts
 * at index 2.
 * NOTE(review): utf8_hex[2] receives the integer 14 rather than a
 * character taken from hex, unlike every neighbouring assignment. */
if(utf32_lit < 65536){
utf8_hex[3] = hex[(utf32_lit >> 12) & 15];
utf8_hex[2] = 14;
i = 2;
goto done;
/* Remaining case: this byte is bits 12-15 under 8 plus bits 16-17, and
 * the leading byte has hex[15] as its high nybble.
 * NOTE(review): the low nybble of the leading byte takes bits 21-22,
 * which leaves bits 18-20 unused by any assignment - confirm the shift
 * and mask. */
utf8_hex[3] = hex[(utf32_lit >> 12) & 15];
utf8_hex[2] = hex[((utf32_lit >> 16) & 3) + 8];
utf8_hex[1] = hex[(utf32_lit >> 21) & 3];
utf8_hex[0] = hex[15];
i = 0;
/* Emit the nybbles from index i to the end of utf8_hex. */
done: print_hexascii(utf8_hex + i, 8 - i);