libutf: utf8_to_chars

2024-05-29 19:12:47 -06:00 · 2024-05-29 19:12:47 -06:00 · 7d60b9cac6
commit 7d60b9cac6
parent be630e656e
2 changed files with 41 additions and 2 deletions
--- a/src/libenc/libutf.c
+++ b/src/libenc/libutf.c
@ -1,6 +1,23 @@
-#include <stdlib.h> /* size_t */
+#include <stddef.h> /* size_t */
 #include "libutf.h"

+/* Reads one UTF-8 rune from the byte sequence at *np. np is the pointer to
+ * the pointer to the next byte in the sequence; rp is the location to which
+ * the read rune is stored, packed as in the table below (the first byte of
+ * the sequence ends up most significant). On success *np is advanced past the
+ * rune and rp is returned. If np, *np, or rp is NULL, or *np doesn't point to
+ * a structurally valid UTF-8 rune (bad lead byte or missing continuation
+ * byte), *np and *rp are untouched and NULL is returned.
+ * Only the byte structure is checked: overlong forms, surrogates, and values
+ * above U+10FFFF are not rejected. Bytes are examined one at a time and
+ * reading stops at the first one that isn't a continuation byte, so a
+ * NUL-terminated string is never read past its terminator. */
+/* utf-8 bits             |32         |24         |16         |8
+ * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
+ * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
+ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
+rune_t *
+chars_to_utf8(char **np, rune_t *rp){
+	const unsigned char *p; /* unsigned view, so bytes >= 0x80 compare sanely */
+	size_t b; /* expected byte size of the rune at *np */
+	size_t i;
+	rune_t r;
+	if(np == NULL || *np == NULL || rp == NULL)
+		return NULL;
+	p = (const unsigned char *)*np;
+	/* The lead byte alone determines the length of the sequence. */
+	if(p[0] < 0x80)
+		b = 1;
+	else if((p[0] & 0xE0) == 0xC0)
+		b = 2;
+	else if((p[0] & 0xF0) == 0xE0)
+		b = 3;
+	else if((p[0] & 0xF8) == 0xF0)
+		b = 4;
+	else
+		return NULL; /* stray continuation byte or invalid lead byte */
+	r = p[0];
+	for(i = 1; i < b; ++i){
+		if((p[i] & 0xC0) != 0x80)
+			return NULL; /* sequence ends early; nothing modified yet */
+		r = (r << 8) | p[i];
+	}
+	/* Commit only once the whole rune has been validated. */
+	*rp = r;
+	*np += b;
+	return rp;
+}
+
 /* This is functionally equivalent to the UTF-32-specific conversion functions
 * but very slightly slower than each. */
 /* This operation is symmetrical; swab32(swab32(c)) will always return c. */
@ -32,6 +49,27 @@ utf8_size(rune_t c){ return
 	  + ((c & 0x00000080) >> 7); /* 1B? */
 }

+/* Writes the bytes of c, a (possibly invalid) UTF-8 rune packed with its
+ * first byte most significant, to s in sequence order. Leading zero bytes of
+ * c are skipped, except that c == 0 (U+0000) is written as one zero byte, so
+ * between 1 and 4 bytes are written. s should point to a big enough memory
+ * span of chars in which to store them. n is the size of that span, or 0 to
+ * skip the size check. Returns a pointer to the memory location after the
+ * last written byte. Returns NULL, writing nothing, if n is not 0 and n is
+ * less than the number of bytes that would be written. */
+char *
+utf8_to_chars(rune_t c, char *s, size_t n){
+	size_t i;
+	/* Shift leading zero bytes out so the rune's first byte sits in the top
+	 * byte of the low 32 bits. At most three shifts, so one byte remains. */
+	for(i = 0; (c & 0xFF000000) == 0 && i < 3; ++i)
+		c <<= 8;
+	i = 4 - i; /* bytes in this rune, 1 to 4 */
+	if(n != 0 && i > n)
+		return NULL;
+	for(; i > 0; --i){
+		*s++ = (char)((c & 0xFF000000) >> 24);
+		c <<= 8;
+	}
+	return s;
+}
+
 /* utf-8 bits             |32   |27     |22         |14        |7
 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
@ -48,7 +86,7 @@ utf8_to_utf32be(rune_t c){ return
 	|   (c & 0x0000007F);                          /* UTF-32 b 7- 1 */
 }

-/* utf-32be bits          |32            |21 |18       |12      |6
+/* utf-32be bits          |32            |21 |18  |16  |12      |6
 * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
 /* utf-8 bits             |32   |27     |22         |14        |7
 * U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
--- a/src/libenc/libutf.h
+++ b/src/libenc/libutf.h
@ -11,6 +11,7 @@ typedef unsigned long int rune_t;

 rune_t swab32(rune_t c);
 rune_t utf8_to_utf32be(rune_t c);
+char *utf8_to_chars(rune_t c, char *s, size_t n);
 rune_t utf32be_to_utf8(rune_t c, size_t m);
 rune_t utf32be_to_utf32le(rune_t c);
 rune_t utf32le_to_utf32be(rune_t c);