#include <stddef.h> /* size_t, NULL */

/* rune_t is the project's rune type, declared in libutf.h. */

/* Layout of a UTF-8 rune when held in a rune_t: the encoded bytes are packed
 * right-justified, first byte most significant.
 *
 * utf-8 bits           |32         |24         |16         |8
 * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
 * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
 * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */

/* np is the pointer to the pointer to the next byte in a sequence. rp is the
 * location to which the read UTF-8 rune is meant to be stored.
 *
 * NOTE: this is an unimplemented stub. It currently reads nothing from *np,
 * stores nothing to *rp, leaves both untouched, and always returns rp.
 * TODO: decode the rune at *np into *rp and return NULL when *np does not
 * point at a valid UTF-8 rune. */
rune_t *
chars_to_utf8(char **np, rune_t *rp){
	(void)np; /* not consulted until decoding is implemented */
	return rp;
}

/* Writes the bytes of c, a (possibly invalid) UTF-8 rune packed as described
 * above, to s in stream order (most significant used byte first).
 *
 * c: the rune to serialize. Leading zero bytes are not written, but at least
 *    one byte always is, so a rune of 0 is written as a single NUL byte.
 * s: destination; must be non-NULL and have room for the rune (1 to 4 bytes).
 * n: the number of bytes available at s, or 0 to skip the bounds check.
 *
 * Returns a pointer to the memory location after the last written byte.
 * Returns NULL, writing nothing, if n is not 0 and n is less than the number
 * of bytes that would be written. */
char *
utf8_to_chars(rune_t c, char *s, size_t n){
	size_t len; /* bytes occupied by the rune */
	size_t k;

	/* Left-justify the rune within its low 32 bits, counting down from the
	 * maximum length for every leading zero byte dropped. Stopping at 1 keeps
	 * a final byte even when c is 0. */
	for(len = 4; len > 1 && (c & 0xFF000000) == 0; --len)
		c <<= 8;

	if(n != 0 && len > n)
		return NULL;

	/* The next byte to emit is always bits 31-24; rune_t may be wider than 32
	 * bits, so mask rather than rely on the shift discarding sent bytes. */
	for(k = 0; k < len; ++k){
		*s++ = (char)(unsigned char)((c >> 24) & 0xFF);
		c <<= 8;
	}

	return s;
}