From 2479ab63d016e93b7ebaa5f05d875250b5210687 Mon Sep 17 00:00:00 2001 From: DTB Date: Mon, 27 May 2024 17:38:35 -0600 Subject: [PATCH 1/8] libutf: UTF conversion functionality --- src/libenc/libutf.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ src/libenc/libutf.h | 14 ++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 src/libenc/libutf.c create mode 100644 src/libenc/libutf.h diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c new file mode 100644 index 0000000..ef5cf36 --- /dev/null +++ b/src/libenc/libutf.c @@ -0,0 +1,55 @@ +#include /* size_t */ +#include "libutf.h" + +/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint, + * including 11 bits of padding. The following functions convert from and to + * UTF-32. */ + +/* From , UTF-8 is encoded as follows: + * The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0 + * and 1; letters denote nybbles here, little-endian, and certain bits are + * labeled, also little-endian. */ +/* utf-8 bits |32 |27 |22 |14 |7 + * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * + * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * + * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * + * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ +/* utf-32be bits |32 |21 |18 |12 |6 + * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ +codepoint_t +utf8_to_utf32be(rune_t c){ return + ((c & 0x07000000) >> 6) /* UTF-8 b27-25 -> UTF-32 b21-19 */ + | (((c & 0x003F0000) /* UTF-8 b22-17 -> UTF-32 b18-13 */ + ^ ((c & (1 << 22)) >> 1)) >> 4) /* (if UTF-8 b23, zero b22) */ + | ((c & 0x00003F00) >> 2) /* UTF-8 b14- 9 -> UTF-32 b12- 7 */ + | (c & 0x0000007F); /* UTF-32 b 7- 1 */ +} + +/* is a good + * explanation of this. 
*/ +/* utf-32be bits + * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ +/* utf-32le bits + * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */ +codepoint_t +utf32be_to_utf32le(codepoint_t c){ return + ((c & 0x000000FF) << 24) + | ((c & 0x0000FF00) << 8) + | ((c & 0x001F0000) >> 8); +} +codepoint_t +utf32le_to_utf32be(codepoint_t c){ return + ((c & 0xFF000000) >> 24) + | ((c & 0x00FF0000) >> 8) + | ((c & 0x00001F00) << 8); +} +/* This operation is symmetrical; swab32(swab32(c)) will always return c. It's + * (very slightly) slower than the specific UTF-32 conversion functions but may + * be useful. */ +codepoint_t +swab32(codepoint_t c){ return + ((c & 0xFF000000) >> 24) + | ((c & 0x00FF0000) >> 8) + | ((c & 0x0000FF00) << 8) + | ((c & 0x000000FF) << 24); +} diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h new file mode 100644 index 0000000..6b004f3 --- /dev/null +++ b/src/libenc/libutf.h @@ -0,0 +1,14 @@ +#if __STDC_VERSION__ >= 199901L +/* C99 type definitions */ +# include +typedef uint32_t rune_t; +#else +/* Must hold at least 32b; see the C89 draft 2.2.4.2 + * */ +typedef unsigned long int rune_t; +#endif + +rune_t swab32(rune_t c); +rune_t utf8_to_utf32be(rune_t c); +rune_t utf32be_to_utf32le(rune_t c); +rune_t utf32le_to_utf32be(rune_t c); -- 2.46.1 From 7ed5c95e0f4bee427c48150dab92ae3229bc8c68 Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 16:32:31 -0600 Subject: [PATCH 2/8] libutf: utf32be_to_utf8 --- src/libenc/libutf.c | 84 +++++++++++++++++++++++++++++++++++---------- src/libenc/libutf.h | 2 ++ 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c index ef5cf36..8812577 100644 --- a/src/libenc/libutf.c +++ b/src/libenc/libutf.c @@ -1,14 +1,37 @@ #include /* size_t */ #include "libutf.h" -/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint, - * including 11 bits of padding. 
The following functions convert from and to - * UTF-32. */ +/* This is functionally equivalent to the UTF-32-specific conversion functions + * but very slightly slower than each. */ +/* This operation is symmetrical; swab32(swab32(c)) will always return c. */ +/* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */ +/* little-endian | TTTT SSSS | VVVV UUUU | */ +rune_t +swab32(rune_t c){ return + ((c & 0xFF000000) >> 24) + | ((c & 0x00FF0000) >> 8) + | ((c & 0x0000FF00) << 8) + | ((c & 0x000000FF) << 24); +} /* From , UTF-8 is encoded as follows: * The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0 * and 1; letters denote nybbles here, little-endian, and certain bits are * labeled, also little-endian. */ + +/* utf-8 bits |32 |24 |16 |8 + * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * + * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * + * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * + * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ +size_t +utf8_size(rune_t c){ return + 1 + ((c & 0x80000000) >> 31) /* 4B? */ + + ((c & 0x00800000) >> 23) /* 3B? */ + + ((c & 0x00008000) >> 15) /* 2B? */ + + ((c & 0x00000080) >> 7); /* 1B? 
*/ +} + /* utf-8 bits |32 |27 |22 |14 |7 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * @@ -16,7 +39,7 @@ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ /* utf-32be bits |32 |21 |18 |12 |6 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ -codepoint_t +rune_t utf8_to_utf32be(rune_t c){ return ((c & 0x07000000) >> 6) /* UTF-8 b27-25 -> UTF-32 b21-19 */ | (((c & 0x003F0000) /* UTF-8 b22-17 -> UTF-32 b18-13 */ @@ -24,6 +47,41 @@ utf8_to_utf32be(rune_t c){ return | ((c & 0x00003F00) >> 2) /* UTF-8 b14- 9 -> UTF-32 b12- 7 */ | (c & 0x0000007F); /* UTF-32 b 7- 1 */ } +/* utf-32be bits |32 |21 |18 |12 |6 + * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ +/* utf-8 bits |32 |27 |22 |14 |7 + * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * + * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * + * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * + * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ +/* m is the minimum amount of bytes into which to encode the codepoint c. If m + * is greater than 0, this function may return overlong-encoded UTF-8. */ +rune_t +utf32be_to_utf8(rune_t c, size_t m){ + rune_t r; + { size_t n; + if((n = utf8_size(c)) > m) + m = n; } /* This avoids calculating the size twice. */ + switch(m){ /* "Trin's device" if this is a novel use of a switch. 
*/ + case 4: r = 0xF0000000 /* UTF-8 b32-29 */ + | ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */ + | 0x00800000 /* UTF-8 b24-23 */ + | ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */ + if(m == 3) + case 3: r = (0xE << 4) /* UTF-8 b24-21 */ + | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */ + r |= 0x00008000 /* UTF-8 b16-15 */ + | ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */ + if(m == 2) + case 2: r = (0xE << 3) /* UTF-8 b16-14 */ + | ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */ + r |= 0x00000080 /* UTF-8 b 8- 7 */ + | (c & 0x00003F); /* UTF-8 b 6- 1 */ + if(m == 1) + case 1: r = c & 0x00007F; /* UTF-8 b 7- 1 */ + } + return r; +} /* is a good * explanation of this. */ @@ -31,25 +89,15 @@ utf8_to_utf32be(rune_t c){ return * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ /* utf-32le bits * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */ -codepoint_t -utf32be_to_utf32le(codepoint_t c){ return +rune_t +utf32be_to_utf32le(rune_t c){ return ((c & 0x000000FF) << 24) | ((c & 0x0000FF00) << 8) | ((c & 0x001F0000) >> 8); } -codepoint_t -utf32le_to_utf32be(codepoint_t c){ return +rune_t +utf32le_to_utf32be(rune_t c){ return ((c & 0xFF000000) >> 24) | ((c & 0x00FF0000) >> 8) | ((c & 0x00001F00) << 8); } -/* This operation is symmetrical; swab32(swab32(c)) will always return c. It's - * (very slightly) slower than the specific UTF-32 conversion functions but may - * be useful. 
*/ -codepoint_t -swab32(codepoint_t c){ return - ((c & 0xFF000000) >> 24) - | ((c & 0x00FF0000) >> 8) - | ((c & 0x0000FF00) << 8) - | ((c & 0x000000FF) << 24); -} diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h index 6b004f3..feafde6 100644 --- a/src/libenc/libutf.h +++ b/src/libenc/libutf.h @@ -7,8 +7,10 @@ typedef uint32_t rune_t; * */ typedef unsigned long int rune_t; #endif +#include /* size_t */ rune_t swab32(rune_t c); rune_t utf8_to_utf32be(rune_t c); +rune_t utf32be_to_utf8(rune_t c, size_t m); rune_t utf32be_to_utf32le(rune_t c); rune_t utf32le_to_utf32be(rune_t c); -- 2.46.1 From be630e656e752d67ddd915e3dac71b946ede5bda Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 16:45:31 -0600 Subject: [PATCH 3/8] libutf: clean up comments --- src/libenc/libutf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c index 8812577..f6276b3 100644 --- a/src/libenc/libutf.c +++ b/src/libenc/libutf.c @@ -5,7 +5,7 @@ * but very slightly slower than each. */ /* This operation is symmetrical; swab32(swab32(c)) will always return c. 
*/ /* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */ -/* little-endian | TTTT SSSS | VVVV UUUU | */ +/* little-endian | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */ rune_t swab32(rune_t c){ return ((c & 0xFF000000) >> 24) @@ -47,6 +47,7 @@ utf8_to_utf32be(rune_t c){ return | ((c & 0x00003F00) >> 2) /* UTF-8 b14- 9 -> UTF-32 b12- 7 */ | (c & 0x0000007F); /* UTF-32 b 7- 1 */ } + /* utf-32be bits |32 |21 |18 |12 |6 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ /* utf-8 bits |32 |27 |22 |14 |7 -- 2.46.1 From 7d60b9cac69009420b1df974477418e3306ee6d2 Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 19:12:47 -0600 Subject: [PATCH 4/8] libutf: utf8_to_chars --- src/libenc/libutf.c | 42 ++++++++++++++++++++++++++++++++++++++++-- src/libenc/libutf.h | 1 + 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c index f6276b3..56785d3 100644 --- a/src/libenc/libutf.c +++ b/src/libenc/libutf.c @@ -1,6 +1,23 @@ -#include /* size_t */ +#include /* size_t */ #include "libutf.h" +/* np is the pointer to the pointer to the next byte in a sequence. rp is the + * location to which the read UTF-8 rune will be stored. If np doesn't point to + * a valid UTF-8 rune, np and rp will be untouched and NULL will be returned. + */ +/* utf-8 bits |32 |24 |16 |8 + * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * + * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * + * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * + * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ +rune_t * +chars_to_utf8(char **np, rune_t *rp){ + size_t b; /* expected byte size of the rune at *np */ + rune_t r; +// for(); + return rp; +} + /* This is functionally equivalent to the UTF-32-specific conversion functions * but very slightly slower than each. */ /* This operation is symmetrical; swab32(swab32(c)) will always return c. 
*/ @@ -32,6 +49,27 @@ utf8_size(rune_t c){ return + ((c & 0x00000080) >> 7); /* 1B? */ } +/* s should point to a big enough memory span of chars in which to store c, a + * (possibly invalid) UTF-8 rune. Returns a pointer to the memory location + * after the last written byte. Returns NULL if n is not 0 and n is less than + * the number of bytes that will be written. */ +char * +utf8_to_chars(rune_t c, char *s, size_t n){ + size_t i; + for(i = 0; (c & 0xFF000000) == 0 && i < 4; ++i) + c <<= 8; /* remove leading zero bytes */ + i = 4 - i; /* bytes in this rune */ + if(n != 0 && i > n) + return NULL; + switch(4 - i){ + case 4: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; + case 3: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; + case 2: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; + case 1: *s++ = ((c & 0xFF000000) >> 24); + } + return s; +} + /* utf-8 bits |32 |27 |22 |14 |7 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * @@ -48,7 +86,7 @@ utf8_to_utf32be(rune_t c){ return | (c & 0x0000007F); /* UTF-32 b 7- 1 */ } -/* utf-32be bits |32 |21 |18 |12 |6 +/* utf-32be bits |32 |21 |18 |16 |12 |6 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ /* utf-8 bits |32 |27 |22 |14 |7 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h index feafde6..8ca082d 100644 --- a/src/libenc/libutf.h +++ b/src/libenc/libutf.h @@ -11,6 +11,7 @@ typedef unsigned long int rune_t; rune_t swab32(rune_t c); rune_t utf8_to_utf32be(rune_t c); +char *utf8_to_chars(rune_t c, char *s, size_t n); rune_t utf32be_to_utf8(rune_t c, size_t m); rune_t utf32be_to_utf32le(rune_t c); rune_t utf32le_to_utf32be(rune_t c); -- 2.46.1 From 146ea609b695c337c206429a312dd94c5b2d6bc3 Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 19:13:33 -0600 Subject: [PATCH 5/8] libutf: move to proper paths --- {src/libenc => 
include}/libutf.h | 0 src/{libenc => }/libutf.c | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {src/libenc => include}/libutf.h (100%) rename src/{libenc => }/libutf.c (100%) diff --git a/src/libenc/libutf.h b/include/libutf.h similarity index 100% rename from src/libenc/libutf.h rename to include/libutf.h diff --git a/src/libenc/libutf.c b/src/libutf.c similarity index 100% rename from src/libenc/libutf.c rename to src/libutf.c -- 2.46.1 From 55fdca9123330de58e9fa1d4f5dd8fa131bd93e5 Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 19:56:41 -0600 Subject: [PATCH 6/8] libutf: clean up comments --- include/libutf.h | 20 ++++++++++++++++++++ src/libutf.c | 19 ++++--------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/include/libutf.h b/include/libutf.h index 8ca082d..f29bc96 100644 --- a/include/libutf.h +++ b/include/libutf.h @@ -9,9 +9,29 @@ typedef unsigned long int rune_t; #endif #include /* size_t */ +/* Reverses the order of the bytes in the 32-bit value c. */ rune_t swab32(rune_t c); + +/* Returns the byte length of a valid UTF-8 rune. */ +size_t utf8_size(rune_t c); + +/* Returns the UTF-32BE codepoint of the UTF-8 rune c. */ rune_t utf8_to_utf32be(rune_t c); + +/* Stores the UTF-8 rune c as bytes to the memory span s. s should point to a + * big enough memory span of chars in which to store c, a (possibly invalid) + * UTF-8 rune. Returns a pointer to the memory location after the last written + * byte. Returns NULL if n is not 0 and n is less than the number of bytes that + * will be written. */ char *utf8_to_chars(rune_t c, char *s, size_t n); + +/* Returns the UTF-8 encoding of the UTF-32BE codepoint c. m is the minimum + * amount of bytes into which to encode the codepoint c. If m is greater than + * 0, this function may return overlong-encoded UTF-8. */ rune_t utf32be_to_utf8(rune_t c, size_t m); + +/* Returns the UTF-32LE codepoint of the UTF-32BE codepoint c. 
 */ rune_t utf32be_to_utf32le(rune_t c); + +/* Returns the UTF-32BE codepoint of the UTF-32LE codepoint c. */ rune_t utf32le_to_utf32be(rune_t c); diff --git a/src/libutf.c b/src/libutf.c index 56785d3..cc67a27 100644 --- a/src/libutf.c +++ b/src/libutf.c @@ -18,9 +18,6 @@ chars_to_utf8(char **np, rune_t *rp){ return rp; } -/* This is functionally equivalent to the UTF-32-specific conversion functions - * but very slightly slower than each. */ -/* This operation is symmetrical; swab32(swab32(c)) will always return c. */ /* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */ /* little-endian | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */ rune_t @@ -49,10 +46,6 @@ utf8_size(rune_t c){ return + ((c & 0x00000080) >> 7); /* 1B? */ } -/* s should point to a big enough memory span of chars in which to store c, a - * (possibly invalid) UTF-8 rune. Returns a pointer to the memory location - * after the last written byte. Returns NULL if n is not 0 and n is less than - * the number of bytes that will be written. */ char * utf8_to_chars(rune_t c, char *s, size_t n){ size_t i; @@ -93,8 +86,6 @@ utf8_to_utf32be(rune_t c){ return * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ -/* m is the minimum amount of bytes into which to encode the codepoint c. If m - * is greater than 0, this function may return overlong-encoded UTF-8. */ rune_t utf32be_to_utf8(rune_t c, size_t m){ rune_t r; @@ -122,18 +113,16 @@ utf32be_to_utf8(rune_t c, size_t m){ return r; } -/* is a good - * explanation of this. 
*/ -/* utf-32be bits - * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ -/* utf-32le bits - * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */ +/* */ +/* utf-32be bits | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ +/* utf-32le bits | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */ rune_t utf32be_to_utf32le(rune_t c){ return ((c & 0x000000FF) << 24) | ((c & 0x0000FF00) << 8) | ((c & 0x001F0000) >> 8); } + rune_t utf32le_to_utf32be(rune_t c){ return ((c & 0xFF000000) >> 24) -- 2.46.1 From dc4091b43f98b65eecbf08167289966acf3e4b1a Mon Sep 17 00:00:00 2001 From: DTB Date: Wed, 29 May 2024 20:16:59 -0600 Subject: [PATCH 7/8] libutf: utf32be_to_utf8: fix constants --- src/libutf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libutf.c b/src/libutf.c index cc67a27..ab2bbb6 100644 --- a/src/libutf.c +++ b/src/libutf.c @@ -98,17 +98,17 @@ utf32be_to_utf8(rune_t c, size_t m){ | 0x00800000 /* UTF-8 b24-23 */ | ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */ if(m == 3) - case 3: r = (0xE << 4) /* UTF-8 b24-21 */ + case 3: r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */ | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */ r |= 0x00008000 /* UTF-8 b16-15 */ | ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */ if(m == 2) - case 2: r = (0xE << 3) /* UTF-8 b16-14 */ + case 2: r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */ | ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */ r |= 0x00000080 /* UTF-8 b 8- 7 */ | (c & 0x00003F); /* UTF-8 b 6- 1 */ if(m == 1) - case 1: r = c & 0x00007F; /* UTF-8 b 7- 1 */ + case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */ } return r; } -- 2.46.1 From dc35c0142b12fde4e660131ba45237d9be131bf7 Mon Sep 17 00:00:00 2001 From: DTB Date: Fri, 31 May 2024 09:39:26 -0600 Subject: [PATCH 8/8] libutf: utf8_size: fix off-by-one for >1 retvals, utf32be_to_utf8: tweak to make more readable --- src/libutf.c | 64 
++++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/src/libutf.c b/src/libutf.c index ab2bbb6..eec8f69 100644 --- a/src/libutf.c +++ b/src/libutf.c @@ -33,17 +33,17 @@ swab32(rune_t c){ return * and 1; letters denote nybbles here, little-endian, and certain bits are * labeled, also little-endian. */ -/* utf-8 bits |32 |24 |16 |8 +/* utf-8 bits |32 |24 |16 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ size_t utf8_size(rune_t c){ return - 1 + ((c & 0x80000000) >> 31) /* 4B? */ - + ((c & 0x00800000) >> 23) /* 3B? */ - + ((c & 0x00008000) >> 15) /* 2B? */ - + ((c & 0x00000080) >> 7); /* 1B? */ + ((c & 0x80000000) >> 31) /* 4B? */ + + ((c & 0x00800000) >> 23) /* 3B? */ + + ((c & 0x00008000) >> 15) /* 2B? */ + + 1; } char * @@ -88,27 +88,41 @@ utf8_to_utf32be(rune_t c){ return * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ rune_t utf32be_to_utf8(rune_t c, size_t m){ + size_t n; rune_t r; - { size_t n; - if((n = utf8_size(c)) > m) - m = n; } /* This avoids calculating the size twice. */ - switch(m){ /* "Trin's device" if this is a novel use of a switch. 
 */ - case 4: r = 0xF0000000 /* UTF-8 b32-29 */ - | ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */ - | 0x00800000 /* UTF-8 b24-23 */ - | ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */ - if(m == 3) - case 3: r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */ - | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */ - r |= 0x00008000 /* UTF-8 b16-15 */ - | ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */ - if(m == 2) - case 2: r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */ - | ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */ - r |= 0x00000080 /* UTF-8 b 8- 7 */ - | (c & 0x00003F); /* UTF-8 b 6- 1 */ - if(m == 1) - case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */ + + if((n = utf8_size(c)) > m) + m = n; + + /* In tested compilers this generates roughly the same assembly as the + * naive (no fallthroughs) approach. */ + switch(m){ + case 4: + r = 0xF0000000 /* UTF-8 b32-29 */ + | ((c & 0x1C0000) << 6); /* UTF-32 b21-19 -> UTF-8 b27-25 */ + + r |= 0x00800000 /* UTF-8 b24-23 */ + | ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */ + + if(0) /* if(m == 3) */ + case 3: + r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */ + | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */ + + r |= 0x00008000 /* UTF-8 b16-15 */ + | ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */ + + if(0) /* if(m == 2) */ + case 2: + r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */ + | ((c & 0x0007C0) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */ + + r |= 0x00000080 /* UTF-8 b 8- 7 */ + | (c & 0x00003F); /* UTF-8 b 6- 1 */ + + break; /* if(m == 1) */ + case 1: + r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */ } return r; } -- 2.46.1