From 2479ab63d016e93b7ebaa5f05d875250b5210687 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Mon, 27 May 2024 17:38:35 -0600
Subject: [PATCH 1/8] libutf: UTF conversion functionality

---
 src/libenc/libutf.c | 55 +++++++++++++++++++++++++++++++++++++++++++++
 src/libenc/libutf.h | 14 ++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 src/libenc/libutf.c
 create mode 100644 src/libenc/libutf.h

diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c
new file mode 100644
index 0000000..ef5cf36
--- /dev/null
+++ b/src/libenc/libutf.c
@@ -0,0 +1,55 @@
+#include <stdlib.h> /* size_t */
+#include "libutf.h"
+
+/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint,
+ * including 11 bits of padding. The following functions convert from and to
+ * UTF-32. */
+
+/* From <https://en.wikipedia.org/wiki/UTF-8>, UTF-8 is encoded as follows:
+ * The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0
+ * and 1; letters denote nybbles here, little-endian, and certain bits are
+ * labeled, also little-endian. */
+/* utf-8 bits             |32   |27     |22         |14        |7
+ * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
+ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
+/* utf-32be bits          |32            |21 |18       |12      |6
+ * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
+codepoint_t
+utf8_to_utf32be(rune_t c){ return
+	   ((c & 0x07000000) >> 6)     /* UTF-8 b27-25 -> UTF-32 b21-19 */
+	| (((c & 0x003F0000)           /* UTF-8 b22-17 -> UTF-32 b18-13 */
+		^ ((c & (1 << 22)) >> 1)) >> 4) /* (if UTF-8 b23, zero b22) */
+	|  ((c & 0x00003F00) >> 2)     /* UTF-8 b14- 9 -> UTF-32 b12- 7 */
+	|   (c & 0x0000007F);                          /* UTF-32 b 7- 1 */
+}
+
+/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> is a good
+ * explanation of this. */
+/* utf-32be bits
+ * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
+/* utf-32le bits
+ * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
+codepoint_t
+utf32be_to_utf32le(codepoint_t c){ return
+	  ((c & 0x000000FF) << 24)
+	| ((c & 0x0000FF00) << 8)
+	| ((c & 0x001F0000) >> 8);
+}
+codepoint_t
+utf32le_to_utf32be(codepoint_t c){ return
+	  ((c & 0xFF000000) >> 24)
+	| ((c & 0x00FF0000) >> 8)
+	| ((c & 0x00001F00) << 8);
+}
+/* This operation is symmetrical; swab32(swab32(c)) will always return c. It's
+ * (very slightly) slower than the specific UTF-32 conversion functions but may
+ * be useful. */
+codepoint_t
+swab32(codepoint_t c){ return
+	  ((c & 0xFF000000) >> 24)
+	| ((c & 0x00FF0000) >> 8)
+	| ((c & 0x0000FF00) << 8)
+	| ((c & 0x000000FF) << 24);
+}
diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h
new file mode 100644
index 0000000..6b004f3
--- /dev/null
+++ b/src/libenc/libutf.h
@@ -0,0 +1,14 @@
+#if __STDC_VERSION__ >= 199901L
+/* C99 type definitions */
+#	include <stdint.h>
+typedef uint32_t rune_t;
+#else
+/* Must hold at least 32b; see the C89 draft 2.2.4.2
+ * <http://jfxpt.com/library/c89-draft.html#2.2.4.2> */
+typedef unsigned long int rune_t;
+#endif
+
+rune_t swab32(rune_t c);
+rune_t utf8_to_utf32be(rune_t c);
+rune_t utf32be_to_utf32le(rune_t c);
+rune_t utf32le_to_utf32be(rune_t c);
-- 
2.46.1


From 7ed5c95e0f4bee427c48150dab92ae3229bc8c68 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 16:32:31 -0600
Subject: [PATCH 2/8] libutf: utf32be_to_utf8

---
 src/libenc/libutf.c | 84 +++++++++++++++++++++++++++++++++++----------
 src/libenc/libutf.h |  2 ++
 2 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c
index ef5cf36..8812577 100644
--- a/src/libenc/libutf.c
+++ b/src/libenc/libutf.c
@@ -1,14 +1,37 @@
 #include <stdlib.h> /* size_t */
 #include "libutf.h"
 
-/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint,
- * including 11 bits of padding. The following functions convert from and to
- * UTF-32. */
+/* This is functionally equivalent to the UTF-32-specific conversion functions
+ * but very slightly slower than each. */
+/* This operation is symmetrical; swab32(swab32(c)) will always return c. */
+/* big-endian           | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */
+/* little-endian        | TTTT SSSS | VVVV UUUU | */
+rune_t
+swab32(rune_t c){ return
+	  ((c & 0xFF000000) >> 24)
+	| ((c & 0x00FF0000) >> 8)
+	| ((c & 0x0000FF00) << 8)
+	| ((c & 0x000000FF) << 24);
+}
 
 /* From <https://en.wikipedia.org/wiki/UTF-8>, UTF-8 is encoded as follows:
  * The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0
  * and 1; letters denote nybbles here, little-endian, and certain bits are
  * labeled, also little-endian. */
+
+/* utf-8 bits             |32         |24         |16         |8
+ * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
+ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
+size_t
+utf8_size(rune_t c){ return
+	1 + ((c & 0x80000000) >> 31) /* 4B? */
+	  + ((c & 0x00800000) >> 23) /* 3B? */
+	  + ((c & 0x00008000) >> 15) /* 2B? */
+	  + ((c & 0x00000080) >> 7); /* 1B? */
+}
+
 /* utf-8 bits             |32   |27     |22         |14        |7
  * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
  * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
@@ -16,7 +39,7 @@
  * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
 /* utf-32be bits          |32            |21 |18       |12      |6
  * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
-codepoint_t
+rune_t
 utf8_to_utf32be(rune_t c){ return
 	   ((c & 0x07000000) >> 6)     /* UTF-8 b27-25 -> UTF-32 b21-19 */
 	| (((c & 0x003F0000)           /* UTF-8 b22-17 -> UTF-32 b18-13 */
@@ -24,6 +47,41 @@ utf8_to_utf32be(rune_t c){ return
 	|  ((c & 0x00003F00) >> 2)     /* UTF-8 b14- 9 -> UTF-32 b12- 7 */
 	|   (c & 0x0000007F);                          /* UTF-32 b 7- 1 */
 }
+/* utf-32be bits          |32            |21 |18       |12      |6
+ * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
+/* utf-8 bits             |32   |27     |22         |14        |7
+ * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
+ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
+/* m is the minimum amount of bytes into which to encode the codepoint c. If m
+ * is greater than 0, this function may return overlong-encoded UTF-8. */
+rune_t
+utf32be_to_utf8(rune_t c, size_t m){
+	rune_t r;
+	{	size_t n;
+		if((n = utf8_size(c)) > m)
+			m = n; } /* This avoids calculating the size twice. */
+	switch(m){ /* "Trin's device" if this is a novel use of a switch. */
+	case 4: r =   0xF0000000                              /* UTF-8 b32-29 */
+			    | ((c & 0x1C0000) << 6)  /* UTF-32 b21-19 -> UTF-8 b27-25 */
+				| 0x00800000                              /* UTF-8 b24-23 */
+				| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
+	if(m == 3)
+	case 3:	r =   (0xE << 4)                              /* UTF-8 b24-21 */
+				| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
+			r |=  0x00008000                              /* UTF-8 b16-15 */
+				| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
+	if(m == 2)
+	case 2: r =   (0xE << 3)                              /* UTF-8 b16-14 */
+				| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
+			r |=  0x00000080                              /* UTF-8 b 8- 7 */
+				| (c & 0x00003F);                         /* UTF-8 b 6- 1 */
+	if(m == 1)
+	case 1: r = c & 0x00007F;                             /* UTF-8 b 7- 1 */
+	}
+	return r;
+}
 
 /* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> is a good
  * explanation of this. */
@@ -31,25 +89,15 @@ utf8_to_utf32be(rune_t c){ return
  * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
 /* utf-32le bits
  * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
-codepoint_t
-utf32be_to_utf32le(codepoint_t c){ return
+rune_t
+utf32be_to_utf32le(rune_t c){ return
 	  ((c & 0x000000FF) << 24)
 	| ((c & 0x0000FF00) << 8)
 	| ((c & 0x001F0000) >> 8);
 }
-codepoint_t
-utf32le_to_utf32be(codepoint_t c){ return
+rune_t
+utf32le_to_utf32be(rune_t c){ return
 	  ((c & 0xFF000000) >> 24)
 	| ((c & 0x00FF0000) >> 8)
 	| ((c & 0x00001F00) << 8);
 }
-/* This operation is symmetrical; swab32(swab32(c)) will always return c. It's
- * (very slightly) slower than the specific UTF-32 conversion functions but may
- * be useful. */
-codepoint_t
-swab32(codepoint_t c){ return
-	  ((c & 0xFF000000) >> 24)
-	| ((c & 0x00FF0000) >> 8)
-	| ((c & 0x0000FF00) << 8)
-	| ((c & 0x000000FF) << 24);
-}
diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h
index 6b004f3..feafde6 100644
--- a/src/libenc/libutf.h
+++ b/src/libenc/libutf.h
@@ -7,8 +7,10 @@ typedef uint32_t rune_t;
  * <http://jfxpt.com/library/c89-draft.html#2.2.4.2> */
 typedef unsigned long int rune_t;
 #endif
+#include <stddef.h> /* size_t */
 
 rune_t swab32(rune_t c);
 rune_t utf8_to_utf32be(rune_t c);
+rune_t utf32be_to_utf8(rune_t c, size_t m);
 rune_t utf32be_to_utf32le(rune_t c);
 rune_t utf32le_to_utf32be(rune_t c);
-- 
2.46.1


From be630e656e752d67ddd915e3dac71b946ede5bda Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 16:45:31 -0600
Subject: [PATCH 3/8] libutf: clean up comments

---
 src/libenc/libutf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c
index 8812577..f6276b3 100644
--- a/src/libenc/libutf.c
+++ b/src/libenc/libutf.c
@@ -5,7 +5,7 @@
  * but very slightly slower than each. */
 /* This operation is symmetrical; swab32(swab32(c)) will always return c. */
 /* big-endian           | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */
-/* little-endian        | TTTT SSSS | VVVV UUUU | */
+/* little-endian        | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */
 rune_t
 swab32(rune_t c){ return
 	  ((c & 0xFF000000) >> 24)
@@ -47,6 +47,7 @@ utf8_to_utf32be(rune_t c){ return
 	|  ((c & 0x00003F00) >> 2)     /* UTF-8 b14- 9 -> UTF-32 b12- 7 */
 	|   (c & 0x0000007F);                          /* UTF-32 b 7- 1 */
 }
+
 /* utf-32be bits          |32            |21 |18       |12      |6
  * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
 /* utf-8 bits             |32   |27     |22         |14        |7
-- 
2.46.1


From 7d60b9cac69009420b1df974477418e3306ee6d2 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 19:12:47 -0600
Subject: [PATCH 4/8] libutf: utf8_to_chars

---
 src/libenc/libutf.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 src/libenc/libutf.h |  1 +
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/src/libenc/libutf.c b/src/libenc/libutf.c
index f6276b3..56785d3 100644
--- a/src/libenc/libutf.c
+++ b/src/libenc/libutf.c
@@ -1,6 +1,23 @@
-#include <stdlib.h> /* size_t */
+#include <stddef.h> /* size_t */
 #include "libutf.h"
 
+/* np is the pointer to the pointer to the next byte in a sequence. rp is the
+ * location to which the read UTF-8 rune will be stored. If np doesn't point to
+ * a valid UTF-8 rune, np and rp will be untouched and NULL will be returned.
+ */
+/* utf-8 bits             |32         |24         |16         |8
+ * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
+ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
+rune_t *
+chars_to_utf8(char **np, rune_t *rp){
+	size_t b; /* expected byte size of the rune at *np */
+	rune_t r;
+	/* TODO: decoding is not implemented; *np is never read, b and r are unused, and rp is returned as-is. */
+	return rp;
+}
+
 /* This is functionally equivalent to the UTF-32-specific conversion functions
  * but very slightly slower than each. */
 /* This operation is symmetrical; swab32(swab32(c)) will always return c. */
@@ -32,6 +49,27 @@ utf8_size(rune_t c){ return
 	  + ((c & 0x00000080) >> 7); /* 1B? */
 }
 
+/* s should point to a big enough memory span of chars in which to store c, a
+ * (possibly invalid) UTF-8 rune. Returns a pointer to the memory location
+ * after the last written byte. Returns NULL if n is not 0 and n is less than
+ * the number of bytes that will be written. */
+char *
+utf8_to_chars(rune_t c, char *s, size_t n){
+	size_t i;
+	for(i = 0; (c & 0xFF000000) == 0 && i < 3; ++i)
+		c <<= 8; /* left-align the rune; stop at 3 so U+0000 keeps one byte */
+	i = 4 - i; /* bytes in this rune, 1 to 4 */
+	if(n != 0 && i > n)
+		return NULL; /* caller's span is too small */
+	switch(i){ /* enter at the byte count and fall through to the last byte */
+	case 4: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; /* fallthrough */
+	case 3: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; /* fallthrough */
+	case 2: *s++ = ((c & 0xFF000000) >> 24); c <<= 8; /* fallthrough */
+	case 1: *s++ = ((c & 0xFF000000) >> 24);
+	}
+	return s;
+}
+
 /* utf-8 bits             |32   |27     |22         |14        |7
  * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
  * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
@@ -48,7 +86,7 @@ utf8_to_utf32be(rune_t c){ return
 	|   (c & 0x0000007F);                          /* UTF-32 b 7- 1 */
 }
 
-/* utf-32be bits          |32            |21 |18       |12      |6
+/* utf-32be bits          |32            |21 |18  |16  |12      |6
  * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
 /* utf-8 bits             |32   |27     |22         |14        |7
  * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
diff --git a/src/libenc/libutf.h b/src/libenc/libutf.h
index feafde6..8ca082d 100644
--- a/src/libenc/libutf.h
+++ b/src/libenc/libutf.h
@@ -11,6 +11,7 @@ typedef unsigned long int rune_t;
 
 rune_t swab32(rune_t c);
 rune_t utf8_to_utf32be(rune_t c);
+char *utf8_to_chars(rune_t c, char *s, size_t n);
 rune_t utf32be_to_utf8(rune_t c, size_t m);
 rune_t utf32be_to_utf32le(rune_t c);
 rune_t utf32le_to_utf32be(rune_t c);
-- 
2.46.1


From 146ea609b695c337c206429a312dd94c5b2d6bc3 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 19:13:33 -0600
Subject: [PATCH 5/8] libutf: move to proper paths

---
 {src/libenc => include}/libutf.h | 0
 src/{libenc => }/libutf.c        | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename {src/libenc => include}/libutf.h (100%)
 rename src/{libenc => }/libutf.c (100%)

diff --git a/src/libenc/libutf.h b/include/libutf.h
similarity index 100%
rename from src/libenc/libutf.h
rename to include/libutf.h
diff --git a/src/libenc/libutf.c b/src/libutf.c
similarity index 100%
rename from src/libenc/libutf.c
rename to src/libutf.c
-- 
2.46.1


From 55fdca9123330de58e9fa1d4f5dd8fa131bd93e5 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 19:56:41 -0600
Subject: [PATCH 6/8] libutf: clean up comments

---
 include/libutf.h | 20 ++++++++++++++++++++
 src/libutf.c     | 19 ++++---------------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/include/libutf.h b/include/libutf.h
index 8ca082d..f29bc96 100644
--- a/include/libutf.h
+++ b/include/libutf.h
@@ -9,9 +9,29 @@ typedef unsigned long int rune_t;
 #endif
 #include <stddef.h> /* size_t */
 
+/* Reverses the order of the bytes in the 32-bit value c. */
 rune_t swab32(rune_t c);
+
+/* Returns the byte length of a valid UTF-8 rune. */
+size_t utf8_size(rune_t c);
+
+/* Returns the UTF-32BE codepoint of the UTF-8 rune c. */
 rune_t utf8_to_utf32be(rune_t c);
+
+/* Stores the UTF-8 rune c as bytes to the memory span s. s should point to a
+ * big enough memory span of chars in which to store c, a (possibly invalid)
+ * UTF-8 rune. Returns a pointer to the memory location after the last written
+ * byte. Returns NULL if n is not 0 and n is less than the number of bytes that
+ * will be written. */
 char *utf8_to_chars(rune_t c, char *s, size_t n);
+
+/* Returns the UTF-8 encoding of the UTF-32BE codepoint c. m is the minimum
+ * number of bytes into which to encode c. If m is greater than the length of
+ * the shortest encoding of c, the result is overlong-encoded UTF-8. */
 rune_t utf32be_to_utf8(rune_t c, size_t m);
+
+/* Returns the UTF-32LE encoding of the UTF-32BE codepoint c. */
 rune_t utf32be_to_utf32le(rune_t c);
+
+/* Returns the UTF-32BE encoding of the UTF-32LE codepoint c. */
 rune_t utf32le_to_utf32be(rune_t c);
diff --git a/src/libutf.c b/src/libutf.c
index 56785d3..cc67a27 100644
--- a/src/libutf.c
+++ b/src/libutf.c
@@ -18,9 +18,6 @@ chars_to_utf8(char **np, rune_t *rp){
 	return rp;
 }
 
-/* This is functionally equivalent to the UTF-32-specific conversion functions
- * but very slightly slower than each. */
-/* This operation is symmetrical; swab32(swab32(c)) will always return c. */
 /* big-endian           | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */
 /* little-endian        | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */
 rune_t
@@ -49,10 +46,6 @@ utf8_size(rune_t c){ return
 	  + ((c & 0x00000080) >> 7); /* 1B? */
 }
 
-/* s should point to a big enough memory span of chars in which to store c, a
- * (possibly invalid) UTF-8 rune. Returns a pointer to the memory location
- * after the last written byte. Returns NULL if n is not 0 and n is less than
- * the number of bytes that will be written. */
 char *
 utf8_to_chars(rune_t c, char *s, size_t n){
 	size_t i;
@@ -93,8 +86,6 @@ utf8_to_utf32be(rune_t c){ return
  * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
  * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
  * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
-/* m is the minimum amount of bytes into which to encode the codepoint c. If m
- * is greater than 0, this function may return overlong-encoded UTF-8. */
 rune_t
 utf32be_to_utf8(rune_t c, size_t m){
 	rune_t r;
@@ -122,18 +113,16 @@ utf32be_to_utf8(rune_t c, size_t m){
 	return r;
 }
 
-/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> is a good
- * explanation of this. */
-/* utf-32be bits
- * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
-/* utf-32le bits
- * U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
+/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> */
+/* utf-32be bits        | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
+/* utf-32le bits        | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
 rune_t
 utf32be_to_utf32le(rune_t c){ return
 	  ((c & 0x000000FF) << 24)
 	| ((c & 0x0000FF00) << 8)
 	| ((c & 0x001F0000) >> 8);
 }
+
 rune_t
 utf32le_to_utf32be(rune_t c){ return
 	  ((c & 0xFF000000) >> 24)
-- 
2.46.1


From dc4091b43f98b65eecbf08167289966acf3e4b1a Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Wed, 29 May 2024 20:16:59 -0600
Subject: [PATCH 7/8] libutf: utf32be_to_utf8: fix constants

---
 src/libutf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libutf.c b/src/libutf.c
index cc67a27..ab2bbb6 100644
--- a/src/libutf.c
+++ b/src/libutf.c
@@ -98,17 +98,17 @@ utf32be_to_utf8(rune_t c, size_t m){
 				| 0x00800000                              /* UTF-8 b24-23 */
 				| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
 	if(m == 3)
-	case 3:	r =   (0xE << 4)                              /* UTF-8 b24-21 */
+	case 3:	r =   0x00E00000  /* 0xE == 0b1110 */         /* UTF-8 b24-21 */
 				| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
 			r |=  0x00008000                              /* UTF-8 b16-15 */
 				| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
 	if(m == 2)
-	case 2: r =   (0xE << 3)                              /* UTF-8 b16-14 */
+	case 2: r =   0x0000C000  /* 0xC == 0b1100 */         /* UTF-8 b16-14 */
 				| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
 			r |=  0x00000080                              /* UTF-8 b 8- 7 */
 				| (c & 0x00003F);                         /* UTF-8 b 6- 1 */
 	if(m == 1)
-	case 1: r = c & 0x00007F;                             /* UTF-8 b 7- 1 */
+	case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */         /* UTF-8 b 7- 1 */
 	}
 	return r;
 }
-- 
2.46.1


From dc35c0142b12fde4e660131ba45237d9be131bf7 Mon Sep 17 00:00:00 2001
From: DTB <trinity@trinity.moe>
Date: Fri, 31 May 2024 09:39:26 -0600
Subject: [PATCH 8/8] libutf: utf8_size: fix off-by-one for >1 retvals,
 utf32be_to_utf8: tweak to make more readable

---
 src/libutf.c | 64 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/src/libutf.c b/src/libutf.c
index ab2bbb6..eec8f69 100644
--- a/src/libutf.c
+++ b/src/libutf.c
@@ -33,17 +33,17 @@ swab32(rune_t c){ return
  * and 1; letters denote nybbles here, little-endian, and certain bits are
  * labeled, also little-endian. */
 
-/* utf-8 bits             |32         |24         |16         |8
+/* utf-8 bits             |32         |24         |16
  * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
  * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
  * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
  * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
 size_t
 utf8_size(rune_t c){ return
-	1 + ((c & 0x80000000) >> 31) /* 4B? */
-	  + ((c & 0x00800000) >> 23) /* 3B? */
-	  + ((c & 0x00008000) >> 15) /* 2B? */
-	  + ((c & 0x00000080) >> 7); /* 1B? */
+	  1                /* a rune always occupies at least one byte */
+	+ ((c >> 15) & 1)  /* top bit of byte 2 set: one more byte */
+	+ ((c >> 23) & 1)  /* top bit of byte 3 set: one more byte */
+	+ ((c >> 31) & 1); /* top bit of byte 4 set: one more byte */
 }
 
 char *
@@ -88,27 +88,49 @@ utf8_to_utf32be(rune_t c){ return
  * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
 rune_t
 utf32be_to_utf8(rune_t c, size_t m){
+	size_t n; /* length in bytes of the shortest encoding of c */
 	rune_t r;
-	{	size_t n;
-		if((n = utf8_size(c)) > m)
-			m = n; } /* This avoids calculating the size twice. */
-	switch(m){ /* "Trin's device" if this is a novel use of a switch. */
-	case 4: r =   0xF0000000                              /* UTF-8 b32-29 */
-			    | ((c & 0x1C0000) << 6)  /* UTF-32 b21-19 -> UTF-8 b27-25 */
-				| 0x00800000                              /* UTF-8 b24-23 */
-				| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
-	if(m == 3)
-	case 3:	r =   0x00E00000  /* 0xE == 0b1110 */         /* UTF-8 b24-21 */
-				| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
-			r |=  0x00008000                              /* UTF-8 b16-15 */
-				| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
-	if(m == 2)
-	case 2: r =   0x0000C000  /* 0xC == 0b1100 */         /* UTF-8 b16-14 */
-				| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
-			r |=  0x00000080                              /* UTF-8 b 8- 7 */
-				| (c & 0x00003F);                         /* UTF-8 b 6- 1 */
-	if(m == 1)
-	case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */         /* UTF-8 b 7- 1 */
+
+	/* utf8_size() inspects the bytes of an already-encoded UTF-8 rune, so it
+	 * cannot size the codepoint c; the shortest encoding follows from the
+	 * codepoint ranges instead. */
+	n = (c < 0x80) ? 1 : (c < 0x800) ? 2 : (c < 0x10000) ? 3 : 4;
+	if(n > m)
+		m = n;
+
+	/* In tested compilers this generates roughly the same assembly as the
+	 * naive (no fallthroughs) approach. Each if(0) guards a lead-byte
+	 * assignment that is only reached by jumping to its case label; longer
+	 * encodings fall past it and just OR in the remaining continuation
+	 * bytes. */
+	switch(m){
+	default: /* m > 4 cannot fit in a rune_t; emit the longest form */
+	case 4:
+		r =   0xF0000000                              /* UTF-8 b32-29 */
+		    | ((c & 0x1C0000) << 6); /* UTF-32 b21-19 -> UTF-8 b27-25 */
+
+	r |=  0x00800000                                  /* UTF-8 b24-23 */
+	    | ((c & 0x03F000) << 4);     /* UTF-32 b18-13 -> UTF-8 b22-17 */
+
+	if(0)  /* if(m == 3) */
+	case 3:
+		r =   0x00E00000  /* 0xE == 0b1110 */         /* UTF-8 b24-21 */
+		    | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
+
+	r |=  0x00008000                                  /* UTF-8 b16-15 */
+	    | ((c & 0x000FC0) << 2);     /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
+
+	if(0)  /* if(m == 2) */
+	case 2:
+		r =   0x0000C000  /* 0xC == 0b1100 */         /* UTF-8 b16-14 */
+		    | ((c & 0x0007C0) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
+
+	r |=  0x00000080                                  /* UTF-8 b 8- 7 */
+	    | (c & 0x00003F);                             /* UTF-8 b 6- 1 */
+
+	break; /* if(m == 1) */
+	case 1:
+		r = c & 0x00007F; /* 0x7 == 0b0111 */         /* UTF-8 b 7- 1 */
 	}
 	return r;
 }
-- 
2.46.1