/*
 * libutf: UTF conversion functionality.
 *
 * Recovered from a git format-patch (commit 2479ab63d016, author DTB,
 * Mon, 27 May 2024) that created src/libenc/libutf.h and
 * src/libenc/libutf.c.  The header part is listed first, in place of
 * `#include "libutf.h"`, so that this listing is self-contained.
 */

/* ---- src/libenc/libutf.h ------------------------------------------------ */

/* rune_t holds one character as a 32-bit quantity: either a UTF-32 code unit
 * or the bytes of a single UTF-8 sequence packed into one integer. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 type definitions */
#	include <stdint.h>
typedef uint32_t rune_t;
#else
/* Must hold at least 32b; see the C89 draft 2.2.4.2 */
typedef unsigned long int rune_t;
#endif

rune_t swab32(rune_t c);
rune_t utf8_to_utf32be(rune_t c);
rune_t utf32be_to_utf32le(rune_t c);
rune_t utf32le_to_utf32be(rune_t c);

/* ---- src/libenc/libutf.c ------------------------------------------------ */

#include <stddef.h> /* size_t */

/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint,
 * including 11 bits of padding. The following functions convert from and to
 * UTF-32. */

/* UTF-8 is encoded as follows (see RFC 3629, section 3). The codepoint
 * 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0 and 1;
 * letters denote the nybbles of the codepoint, U being the least
 * significant. In the comments below, bits are numbered from 1 at the
 * least-significant end of the 32-bit value. A UTF-8 sequence shorter than
 * four bytes is right-aligned, i.e. padded with zero bytes on the left. */
/* utf-8 bits
 * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
 * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
 * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
/* utf-32be bits
 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */

/* Decodes one UTF-8 sequence, packed into c as shown above, and returns the
 * codepoint it encodes. The marker bits are simply masked away; c is not
 * checked for being a well-formed UTF-8 sequence. */
rune_t
utf8_to_utf32be(rune_t c)
{
	return
		  ((c & 0x07000000) >> 6)    /* UTF-8 b27-25 -> UTF-32 b21-19 */
		| (((c & 0x003F0000)         /* UTF-8 b22-17 -> UTF-32 b18-13 */
		   /* If UTF-8 b23 is set this byte is the 1110 XXXX lead of a
		    * three-byte sequence, whose b22 is a marker bit rather than
		    * data; clear it. A constant is used instead of (1 << 22) so
		    * the expression stays defined where int is 16 bits wide. */
		   ^ ((c & 0x00400000) >> 1)) >> 4)
		| ((c & 0x00003F00) >> 2)    /* UTF-8 b14- 9 -> UTF-32 b12- 7 */
		|  (c & 0x0000007F);         /*                 UTF-32 b 7- 1 */
}

/* Byte-order conversion between the two UTF-32 encoding schemes. */
/* utf-32be bits
 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
/* utf-32le bits
 * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */

/* Reorders the bytes of a UTF-32BE value into UTF-32LE order. Only the 21
 * bits a codepoint can occupy are carried over; anything above them in c is
 * dropped. */
rune_t
utf32be_to_utf32le(rune_t c)
{
	return
		  ((c & 0x000000FF) << 24)
		| ((c & 0x0000FF00) <<  8)
		| ((c & 0x001F0000) >>  8);
}

/* Reorders the bytes of a UTF-32LE value into UTF-32BE order; the inverse of
 * utf32be_to_utf32le() for any value that function can produce. */
rune_t
utf32le_to_utf32be(rune_t c)
{
	return
		  ((c & 0xFF000000) >> 24)
		| ((c & 0x00FF0000) >>  8)
		| ((c & 0x00001F00) <<  8);
}

/* Reverses the order of the four low bytes of c. This operation is
 * symmetrical; swab32(swab32(c)) will always return c for a 32-bit c. It's
 * (very slightly) slower than the specific UTF-32 conversion functions but
 * may be useful. */
rune_t
swab32(rune_t c)
{
	return
		  ((c & 0xFF000000) >> 24)
		| ((c & 0x00FF0000) >>  8)
		| ((c & 0x0000FF00) <<  8)
		| ((c & 0x000000FF) << 24);
}