WIP: utf(1) #141
37
include/libutf.h
Normal file
37
include/libutf.h
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#ifndef LIBUTF_H
#define LIBUTF_H

#include <stddef.h> /* size_t */

/* rune_t is an unsigned integer type that holds either one UTF-8 rune (its
 * bytes packed into the low-order end of the value) or one UTF-32 code unit.
 * The guard on __STDC_VERSION__ uses defined() so that pre-C99 compilers,
 * which do not define the macro, take the fallback branch without a
 * -Wundef diagnostic. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 type definitions */
# include <stdint.h>
typedef uint32_t rune_t;
#else
/* Must hold at least 32b; see the C89 draft 2.2.4.2
 * <http://jfxpt.com/library/c89-draft.html#2.2.4.2> */
typedef unsigned long int rune_t;
#endif

/* Reverses the order of the bytes in the 32-bit value c. */
rune_t swab32(rune_t c);

/* Returns the byte length of a valid UTF-8 rune. */
size_t utf8_size(rune_t c);

/* Returns the UTF-32BE codepoint of the UTF-8 rune c. */
rune_t utf8_to_utf32be(rune_t c);

/* Stores the UTF-8 rune c as bytes to the memory span s. s should point to a
 * big enough memory span of chars in which to store c, a (possibly invalid)
 * UTF-8 rune. Returns a pointer to the memory location after the last written
 * byte. Returns NULL if n is not 0 and n is less than the number of bytes that
 * will be written. */
char *utf8_to_chars(rune_t c, char *s, size_t n);

/* Returns the UTF-8 encoding of the UTF-32BE codepoint c. m is the minimum
 * amount of bytes into which to encode the codepoint c. If m is greater than
 * the number of bytes c needs, this function returns overlong-encoded
 * UTF-8. */
rune_t utf32be_to_utf8(rune_t c, size_t m);

/* Returns the UTF-32LE codepoint of the UTF-32BE codepoint c. */
rune_t utf32be_to_utf32le(rune_t c);

/* Returns the UTF-32BE codepoint of the UTF-32LE codepoint c. */
rune_t utf32le_to_utf32be(rune_t c);

#endif /* LIBUTF_H */
|
145
src/libutf.c
Normal file
145
src/libutf.c
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
#include <stddef.h> /* size_t */
|
||||||
|
#include "libutf.h"
|
||||||
|
|
||||||
|
/* np is the pointer to the pointer to the next byte in a sequence. rp is the
|
||||||
|
* location to which the read UTF-8 rune will be stored. If np doesn't point to
|
||||||
|
* a valid UTF-8 rune, np and rp will be untouched and NULL will be returned.
|
||||||
|
*/
|
||||||
|
/* utf-8 bits |32 |24 |16 |8
|
||||||
|
* U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||||
|
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||||
|
/* Reads one UTF-8 rune from the byte sequence at *np.
 *
 * On success the rune's bytes are packed into *rp (first byte most
 * significant, right-aligned), *np is advanced past the rune, and rp is
 * returned. On failure NULL is returned and neither *np nor *rp is modified.
 *
 * Validation is structural only: the lead byte must announce a length of 1 to
 * 4 bytes and every following byte must be a continuation byte (10xx xxxx).
 * Overlong forms and out-of-range code points are not rejected here.
 *
 * No length is passed in, so the sequence must be terminated by a byte that
 * is not a continuation byte (e.g. NUL); bytes are only read while the
 * preceding ones were valid, so the terminator is never read past. */
rune_t *
chars_to_utf8(char **np, rune_t *rp){
	const unsigned char *p;
	size_t b; /* expected byte size of the rune at *np */
	size_t i;
	rune_t r;

	if(np == NULL || *np == NULL || rp == NULL)
		return NULL;

	/* char may be signed; inspect the bytes as unsigned values */
	p = (const unsigned char *)*np;

	if(p[0] < 0x80)
		b = 1; /* 0xxx xxxx */
	else if((p[0] & 0xE0) == 0xC0)
		b = 2; /* 110x xxxx */
	else if((p[0] & 0xF0) == 0xE0)
		b = 3; /* 1110 xxxx */
	else if((p[0] & 0xF8) == 0xF0)
		b = 4; /* 1111 0xxx */
	else
		return NULL; /* stray continuation byte or invalid lead byte */

	r = p[0];
	for(i = 1; i < b; ++i){
		if((p[i] & 0xC0) != 0x80)
			return NULL; /* truncated rune */
		r = (r << 8) | p[i];
	}

	/* only commit the results once the whole rune is known to be good */
	*np += b;
	*rp = r;
	return rp;
}
|
||||||
|
|
||||||
|
/* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */
|
||||||
|
/* little-endian | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */
|
||||||
|
rune_t
|
||||||
|
swab32(rune_t c){ return
|
||||||
|
((c & 0xFF000000) >> 24)
|
||||||
|
| ((c & 0x00FF0000) >> 8)
|
||||||
|
| ((c & 0x0000FF00) << 8)
|
||||||
|
| ((c & 0x000000FF) << 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* From <https://en.wikipedia.org/wiki/UTF-8>, UTF-8 is encoded as follows:
|
||||||
|
* The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0
|
||||||
|
* and 1; letters denote nybbles here, little-endian, and certain bits are
|
||||||
|
* labeled, also little-endian. */
|
||||||
|
|
||||||
|
/* utf-8 bits |32 |24 |16
|
||||||
|
* U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||||
|
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||||
|
size_t
|
||||||
|
utf8_size(rune_t c){ return
|
||||||
|
((c & 0x80000000) >> 31) /* 4B? */
|
||||||
|
+ ((c & 0x00800000) >> 23) /* 3B? */
|
||||||
|
+ ((c & 0x00008000) >> 15) /* 2B? */
|
||||||
|
+ 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *
|
||||||
|
utf8_to_chars(rune_t c, char *s, size_t n){
|
||||||
|
size_t i;
|
||||||
|
for(i = 0; (c & 0xFF000000) == 0 && i < 4; ++i)
|
||||||
|
c <<= 8; /* remove leading zero bytes */
|
||||||
|
i = 4 - i; /* bytes in this rune */
|
||||||
|
if(n != 0 && i > n)
|
||||||
|
return NULL;
|
||||||
|
switch(4 - i){
|
||||||
|
case 4: *s++ = ((c & 0xFF000000) >> 24); c <<= 8;
|
||||||
|
case 3: *s++ = ((c & 0xFF000000) >> 24); c <<= 8;
|
||||||
|
case 2: *s++ = ((c & 0xFF000000) >> 24); c <<= 8;
|
||||||
|
case 1: *s++ = ((c & 0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* utf-8 bits |32 |27 |22 |14 |7
|
||||||
|
* U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||||
|
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||||
|
/* utf-32be bits |32 |21 |18 |12 |6
|
||||||
|
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||||
|
rune_t
|
||||||
|
utf8_to_utf32be(rune_t c){ return
|
||||||
|
((c & 0x07000000) >> 6) /* UTF-8 b27-25 -> UTF-32 b21-19 */
|
||||||
|
| (((c & 0x003F0000) /* UTF-8 b22-17 -> UTF-32 b18-13 */
|
||||||
|
^ ((c & (1 << 22)) >> 1)) >> 4) /* (if UTF-8 b23, zero b22) */
|
||||||
|
| ((c & 0x00003F00) >> 2) /* UTF-8 b14- 9 -> UTF-32 b12- 7 */
|
||||||
|
| (c & 0x0000007F); /* UTF-32 b 7- 1 */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* utf-32be bits |32 |21 |18 |16 |12 |6
|
||||||
|
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||||
|
/* utf-8 bits |32 |27 |22 |14 |7
|
||||||
|
* U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||||
|
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||||
|
rune_t
|
||||||
|
utf32be_to_utf8(rune_t c, size_t m){
|
||||||
|
size_t n;
|
||||||
|
rune_t r;
|
||||||
|
|
||||||
|
if((n = utf8_size(c)) > m)
|
||||||
|
m = n;
|
||||||
|
|
||||||
|
/* In tested compilers this generates roughly the same assembly as the
|
||||||
|
* naive (no fallthroughs) approach. */
|
||||||
|
switch(m){
|
||||||
|
case 4:
|
||||||
|
r = 0xF0000000 /* UTF-8 b32-29 */
|
||||||
|
| ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */
|
||||||
|
|
||||||
|
r |= 0x00800000 /* UTF-8 b24-23 */
|
||||||
|
| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
|
||||||
|
|
||||||
|
if(0) /* if(m == 3) */
|
||||||
|
case 3:
|
||||||
|
r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */
|
||||||
|
| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
|
||||||
|
|
||||||
|
r |= 0x00008000 /* UTF-8 b16-15 */
|
||||||
|
| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
|
||||||
|
|
||||||
|
if(0) /* if(m == 2) */
|
||||||
|
case 2:
|
||||||
|
r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */
|
||||||
|
| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
|
||||||
|
|
||||||
|
r |= 0x00000080 /* UTF-8 b 8- 7 */
|
||||||
|
| (c & 0x00003F); /* UTF-8 b 6- 1 */
|
||||||
|
|
||||||
|
break; /* if(m == 1) */
|
||||||
|
case 1:
|
||||||
|
r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> */
|
||||||
|
/* utf-32be bits | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||||
|
/* utf-32le bits | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
|
||||||
|
rune_t
|
||||||
|
utf32be_to_utf32le(rune_t c){ return
|
||||||
|
((c & 0x000000FF) << 24)
|
||||||
|
| ((c & 0x0000FF00) << 8)
|
||||||
|
| ((c & 0x001F0000) >> 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
rune_t
|
||||||
|
utf32le_to_utf32be(rune_t c){ return
|
||||||
|
((c & 0xFF000000) >> 24)
|
||||||
|
| ((c & 0x00FF0000) >> 8)
|
||||||
|
| ((c & 0x00001F00) << 8);
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user