1
0

still broken

This commit is contained in:
dtb 2023-09-02 13:35:40 -04:00
parent a0278729b6
commit 9e7b296852
2 changed files with 16 additions and 12 deletions

3
toki/README.txt Normal file
View File

@ -0,0 +1,3 @@
- Move the executables to a directory in your $PATH.
- Run `$ toki_update` to fetch the dictionary.
- Run `$ toki_sitelen` to tokiponize.

View File

@ -1,7 +1,6 @@
#!/bin/sh #!/bin/sh
# codepoint -> utf8 toki_ucsur "$@" | while read -r codepoint; do
while read -r codepoint; do
# normalize to U+000000 # normalize to U+000000
codepoint="$(printf '%s\n' "$codepoint" | sed 's/^U+//')" codepoint="$(printf '%s\n' "$codepoint" | sed 's/^U+//')"
codepoint="U+$( \ codepoint="U+$( \
@ -39,25 +38,25 @@ while read -r codepoint; do
utf8_bin="$(case $bytes in \ utf8_bin="$(case $bytes in \
1) printf '0%s\n' "$(printf '%s\n' "$codepoint_bin" \ 1) printf '0%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 7)" ;; \ | tail -c 8)" ;; \
2) printf '110%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ 2) printf '110%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 11 | head -c 5)" \ | tail -c 12 | head -c 5)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 6)" ;; \ | tail -c 7)" ;; \
3) printf '1110%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ 3) printf '1110%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 16 | head -c 4)" \ | tail -c 17 | head -c 4)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 12 | head -c 6)" \ | tail -c 13 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 6)" ;; \ | tail -c 7)" ;; \
4) printf '11110%s10%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ 4) printf '11110%s10%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \
| tail -c 21 | head -c 3)" \ | tail -c 22 | head -c 3)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 18 | head -c 6)" \ | tail -c 19 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 12 | head -c 6)" \ | tail -c 13 | head -c 6)" \
"$(printf '%s\n' "$codepoint_bin" \ "$(printf '%s\n' "$codepoint_bin" \
| tail -c 6)" ;; \ | tail -c 7)" ;; \
esac)" esac)"
utf8_oct="$(dd if=/dev/zero bs=1 count=$( \ utf8_oct="$(dd if=/dev/zero bs=1 count=$( \
@ -75,6 +74,7 @@ while read -r codepoint; do
-e 's/110/6/g' -e 's/111/7/g' \ -e 's/110/6/g' -e 's/111/7/g' \
| tr -d ' ')" | tr -d ' ')"
# a little fucky
utf8_hex="$(printf '%s\n' "$utf8_bin" \ utf8_hex="$(printf '%s\n' "$utf8_bin" \
| sed \ | sed \
-e 's/0000/0/g' -e 's/0001/1/g' -e 's/0010/2/g' \ -e 's/0000/0/g' -e 's/0001/1/g' -e 's/0010/2/g' \
@ -84,5 +84,6 @@ while read -r codepoint; do
-e 's/1100/C/g' -e 's/1101/D/g' -e 's/1110/E/g' \ -e 's/1100/C/g' -e 's/1101/D/g' -e 's/1110/E/g' \
-e 's/1111/F/g')" -e 's/1111/F/g')"
printf '%s\n' "$utf8_bin"
shift shift
done done