libutf: utf8_to_chars
This commit is contained in:
parent
be630e656e
commit
7d60b9cac6
@ -1,6 +1,23 @@
|
||||
#include <stdlib.h> /* size_t */
|
||||
#include <stddef.h> /* size_t */
|
||||
#include "libutf.h"
|
||||
|
||||
/* Reads one UTF-8 rune from a sequence of chars.
 *
 * np is the pointer to the pointer to the next byte in the sequence. rp is the
 * location to which the read UTF-8 rune will be stored. The rune is stored as
 * its raw UTF-8 bytes packed into the low-order bits of a rune_t, with the
 * lead byte in the most significant used position, as in the table below.
 *
 * On success *rp is set, *np is advanced past the rune and rp is returned. A
 * 0x00 byte is read as the one-byte rune U+0000, so *np moves past it too.
 *
 * If np, *np or rp is NULL, or if *np doesn't point to a structurally valid
 * UTF-8 rune, *np and *rp are left untouched and NULL is returned.
 *
 * Only the structure is checked: the lead byte must announce a length of one
 * to four bytes and every following byte must be a continuation byte
 * (10xx xxxx). Overlong forms, surrogates and values above U+10FFFF are not
 * rejected. Because a 0x00 byte is never a continuation byte, reading never
 * runs past the terminator of a NUL-terminated string.
 */
/* utf-8 bits           |32         |24         |16         |8
 * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
 * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
 * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
rune_t *
chars_to_utf8(char **np, rune_t *rp){
	const unsigned char *p;
	size_t b; /* expected byte size of the rune at *np */
	size_t i;
	rune_t r;

	if(np == NULL || *np == NULL || rp == NULL)
		return NULL;

	/* Inspect the bytes as unsigned so the masks below behave the same
	 * whether plain char is signed or not. */
	p = (const unsigned char *)*np;

	/* The lead byte alone determines how long the rune claims to be. */
	if((p[0] & 0x80) == 0x00)
		b = 1;
	else if((p[0] & 0xE0) == 0xC0)
		b = 2;
	else if((p[0] & 0xF0) == 0xE0)
		b = 3;
	else if((p[0] & 0xF8) == 0xF0)
		b = 4;
	else
		return NULL; /* stray continuation byte or invalid lead byte */

	/* Pack the bytes in sequence order; nothing is committed to the
	 * caller's storage until the whole rune has been checked. */
	r = p[0];
	for(i = 1; i < b; ++i){
		if((p[i] & 0xC0) != 0x80)
			return NULL;
		r = (r << 8) | p[i];
	}

	*rp = r;
	*np += b;
	return rp;
}
|
||||
|
||||
/* This is functionally equivalent to the UTF-32-specific conversion functions
|
||||
* but very slightly slower than each. */
|
||||
/* This operation is symmetrical; swab32(swab32(c)) will always return c. */
|
||||
@ -32,6 +49,27 @@ utf8_size(rune_t c){ return
|
||||
+ ((c & 0x00000080) >> 7); /* 1B? */
|
||||
}
|
||||
|
||||
/* Writes the bytes of c to s in sequence order (lead byte first).
 *
 * c is a (possibly invalid) UTF-8 rune: its raw bytes packed into the
 * low-order 32 bits of a rune_t with the lead byte in the most significant
 * used position. The number of bytes written is four minus the number of
 * leading zero bytes in those 32 bits, except that c == 0 (U+0000) is written
 * as a single 0x00 byte rather than as nothing.
 *
 * s should point to a big enough memory span of chars in which to store c.
 * n is the size of that span; pass 0 to skip the size check.
 *
 * Returns a pointer to the memory location after the last written byte.
 * Returns NULL, without writing anything, if n is not 0 and n is less than
 * the number of bytes that would be written. No terminator is appended. */
char *
utf8_to_chars(rune_t c, char *s, size_t n){
	size_t i;

	/* Remove leading zero bytes so the lead byte ends up in bits 32-25.
	 * At most three shifts are made so that a rune always occupies at least
	 * one byte, which keeps U+0000 representable. */
	for(i = 0; (c & 0xFF000000) == 0 && i < 3; ++i)
		c <<= 8;

	i = 4 - i; /* bytes in this rune */

	if(n != 0 && i > n)
		return NULL;

	/* Emit exactly i bytes, most significant first. */
	for(; i > 0; --i){
		*s++ = (char)(unsigned char)((c >> 24) & 0xFF);
		c <<= 8;
	}

	return s;
}
|
||||
|
||||
/* utf-8 bits |32 |27 |22 |14 |7
|
||||
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||
@ -48,7 +86,7 @@ utf8_to_utf32be(rune_t c){ return
|
||||
| (c & 0x0000007F); /* UTF-32 b 7- 1 */
|
||||
}
|
||||
|
||||
/* utf-32be bits |32 |21 |18 |12 |6
|
||||
/* utf-32be bits |32 |21 |18 |16 |12 |6
|
||||
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||
/* utf-8 bits |32 |27 |22 |14 |7
|
||||
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||
|
@ -11,6 +11,7 @@ typedef unsigned long int rune_t;
|
||||
|
||||
rune_t swab32(rune_t c);
|
||||
rune_t utf8_to_utf32be(rune_t c);
|
||||
char *utf8_to_chars(rune_t c, char *s, size_t n);
|
||||
rune_t utf32be_to_utf8(rune_t c, size_t m);
|
||||
rune_t utf32be_to_utf32le(rune_t c);
|
||||
rune_t utf32le_to_utf32be(rune_t c);
|
||||
|
Loading…
Reference in New Issue
Block a user