diff --git a/toki/README.txt b/toki/README.txt new file mode 100644 index 0000000..b3f8d84 --- /dev/null +++ b/toki/README.txt @@ -0,0 +1,3 @@ +- move the executables somewhere in your $PATH +- `$ toki_update` to fetch the dictionary +- `$ toki_sitelen` to tokiponize diff --git a/toki/toki_sitelen b/toki/toki_sitelen index b9f8fd7..717a47a 100755 --- a/toki/toki_sitelen +++ b/toki/toki_sitelen @@ -1,7 +1,6 @@ #!/bin/sh -# codepoint -> utf8 -while read -r codepoint; do +toki_ucsur "$@" | while read -r codepoint; do # normalize to U+000000 codepoint="$(printf '%s\n' "$codepoint" | sed 's/^U+//')" codepoint="U+$( \ @@ -39,25 +38,25 @@ while read -r codepoint; do utf8_bin="$(case $bytes in \ 1) printf '0%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 7)" ;; \ + | tail -c 8)" ;; \ 2) printf '110%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 11 | head -c 5)" \ + | tail -c 12 | head -c 5)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 6)" ;; \ + | tail -c 7)" ;; \ 3) printf '1110%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 16 | head -c 4)" \ + | tail -c 17 | head -c 4)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 12 | head -c 6)" \ + | tail -c 13 | head -c 6)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 6)" ;; \ + | tail -c 7)" ;; \ 4) printf '11110%s10%s10%s10%s\n' "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 21 | head -c 3)" \ + | tail -c 22 | head -c 3)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 18 | head -c 6)" \ + | tail -c 19 | head -c 6)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 12 | head -c 6)" \ + | tail -c 13 | head -c 6)" \ "$(printf '%s\n' "$codepoint_bin" \ - | tail -c 6)" ;; \ + | tail -c 7)" ;; \ esac)" utf8_oct="$(dd if=/dev/zero bs=1 count=$( \ @@ -75,6 +74,7 @@ while read -r codepoint; do -e 's/110/6/g' -e 's/111/7/g' \ | tr -d ' ')" + # FIXME(review): these sed patterns look unanchored to 4-bit boundaries, so hex digits can come out wrong utf8_hex="$(printf '%s\n' "$utf8_bin" \ | sed \ -e 's/0000/0/g' -e 's/0001/1/g' -e 's/0010/2/g' \ @@ -84,5 +84,6 @@ while read -r codepoint; do -e
's/1100/C/g' -e 's/1101/D/g' -e 's/1110/E/g' \ -e 's/1111/F/g')" + printf '%s\n' "$utf8_bin" shift done