libutf: utf8_to_chars

This commit is contained in:
dtb 2024-05-29 19:12:47 -06:00
parent be630e656e
commit 7d60b9cac6
Signed by: trinity
GPG Key ID: 6CDF230C17CC3349
2 changed files with 41 additions and 2 deletions

View File

@ -1,6 +1,23 @@
#include <stdlib.h> /* size_t */ #include <stddef.h> /* size_t */
#include "libutf.h" #include "libutf.h"
/* Reads one UTF-8 sequence from *np and stores it at *rp, packed into a rune_t
 * as shown in the table below (first byte most significant). np is the pointer
 * to the pointer to the next byte in a sequence; on success *np is advanced
 * past the bytes that were read and rp is returned.
 * If np, *np, or rp is NULL, or if *np doesn't point to a structurally valid
 * UTF-8 sequence (a lead byte followed by the right number of 10xx xxxx
 * continuation bytes), *np and *rp are left untouched and NULL is returned.
 * Only the bit patterns are checked: overlong forms, surrogates, and values
 * past U+10FFFF are not rejected.
 * Bytes are examined one at a time and reading stops at the first byte that
 * doesn't fit, so a terminating NUL is never read past. */
/* utf-8 bits           |32         |24         |16         |8
 * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
 * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
 * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
rune_t *
chars_to_utf8(char **np, rune_t *rp){
	const unsigned char *p;
	size_t b; /* expected byte size of the rune at *np */
	size_t i;
	rune_t r;

	if(np == NULL || *np == NULL || rp == NULL)
		return NULL;

	/* unsigned char so bytes >= 0x80 don't sign-extend when packed */
	p = (const unsigned char *)*np;

	/* the high bits of the lead byte give the length of the sequence */
	if((p[0] & 0x80) == 0x00)
		b = 1;
	else if((p[0] & 0xE0) == 0xC0)
		b = 2;
	else if((p[0] & 0xF0) == 0xE0)
		b = 3;
	else if((p[0] & 0xF8) == 0xF0)
		b = 4;
	else
		return NULL; /* stray continuation byte or 1111 1xxx */

	r = p[0];
	for(i = 1; i < b; ++i){
		if((p[i] & 0xC0) != 0x80)
			return NULL; /* not a continuation byte */
		r = (r << 8) | p[i];
	}

	/* nothing is written until the whole sequence has been accepted */
	*rp = r;
	*np += b;
	return rp;
}
/* This is functionally equivalent to the UTF-32-specific conversion functions /* This is functionally equivalent to the UTF-32-specific conversion functions
* but very slightly slower than each. */ * but very slightly slower than each. */
/* This operation is symmetrical; swab32(swab32(c)) will always return c. */ /* This operation is symmetrical; swab32(swab32(c)) will always return c. */
@ -32,6 +49,27 @@ utf8_size(rune_t c){ return
+ ((c & 0x00000080) >> 7); /* 1B? */ + ((c & 0x00000080) >> 7); /* 1B? */
} }
/* Writes the bytes of c, a (possibly invalid) UTF-8 rune packed into the low
 * 32 bits of a rune_t, to s from most to least significant, skipping leading
 * zero bytes. At least one byte is always written, so c == 0 (U+0000) is
 * stored as the single byte 0x00.
 * s should point to a big enough memory span of chars in which to store c;
 * n is the size of that span, or 0 to skip the size check.
 * Returns a pointer to the memory location after the last written byte.
 * Returns NULL, writing nothing, if s is NULL or if n is not 0 and n is less
 * than the number of bytes that will be written. */
char *
utf8_to_chars(rune_t c, char *s, size_t n){
	size_t i;

	if(s == NULL)
		return NULL;

	/* Left-align the rune in the low 32 bits by removing leading zero bytes.
	 * Stop after three shifts so one byte always remains to be written. */
	for(i = 0; i < 3 && (c & 0xFF000000) == 0; ++i)
		c <<= 8;
	i = 4 - i; /* bytes in this rune, 1 to 4 */

	if(n != 0 && i > n)
		return NULL;

	/* emit exactly i bytes, always taking the current top byte */
	while(i-- > 0){
		*s++ = (char)((c & 0xFF000000) >> 24);
		c <<= 8;
	}

	return s;
}
/* utf-8 bits |32 |27 |22 |14 |7 /* utf-8 bits |32 |27 |22 |14 |7
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
@ -48,7 +86,7 @@ utf8_to_utf32be(rune_t c){ return
| (c & 0x0000007F); /* UTF-32 b 7- 1 */ | (c & 0x0000007F); /* UTF-32 b 7- 1 */
} }
/* utf-32be bits |32 |21 |18 |12 |6 /* utf-32be bits |32 |21 |18 |16 |12 |6
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */ * U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
/* utf-8 bits |32 |27 |22 |14 |7 /* utf-8 bits |32 |27 |22 |14 |7
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *

View File

@ -11,6 +11,7 @@ typedef unsigned long int rune_t;
rune_t swab32(rune_t c); rune_t swab32(rune_t c);
rune_t utf8_to_utf32be(rune_t c); rune_t utf8_to_utf32be(rune_t c);
char *utf8_to_chars(rune_t c, char *s, size_t n);
rune_t utf32be_to_utf8(rune_t c, size_t m); rune_t utf32be_to_utf8(rune_t c, size_t m);
rune_t utf32be_to_utf32le(rune_t c); rune_t utf32be_to_utf32le(rune_t c);
rune_t utf32le_to_utf32be(rune_t c); rune_t utf32le_to_utf32be(rune_t c);