libutf: UTF conversion functionality
This commit is contained in:
parent
9093b06166
commit
2479ab63d0
55
src/libenc/libutf.c
Normal file
55
src/libenc/libutf.c
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
#include <stdlib.h> /* size_t */
|
||||||
|
#include "libutf.h"
|
||||||
|
|
||||||
|
/* UTF-32BE is the big-endian literal encoding of a Unicode codepoint,
|
||||||
|
* including 11 bits of padding. The following functions convert from and to
|
||||||
|
* UTF-32. */
|
||||||
|
|
||||||
|
/* From <https://en.wikipedia.org/wiki/UTF-8>, UTF-8 is encoded as follows:
|
||||||
|
* The codepoint 0bZYYYY_XXXXWWWW_VVVVUUUU is nestled within the literal bits 0
|
||||||
|
* and 1; letters denote nybbles here, little-endian, and certain bits are
|
||||||
|
* labeled, also little-endian. */
|
||||||
|
/* utf-8 bits |32 |27 |22 |14 |7
|
||||||
|
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||||
|
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||||
|
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||||
|
/* utf-32be bits |32 |21 |18 |12 |6
|
||||||
|
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||||
|
/* Decode a single UTF-8 sequence into the codepoint it encodes.
 *
 * c holds the 1- to 4-byte sequence packed so that the final byte of the
 * sequence occupies bits 0-7 and earlier bytes occupy successively higher
 * bytes; unused high bytes are zero. The input is not validated: the result
 * is only meaningful for a well-formed sequence.
 *
 * The return type is rune_t to agree with the prototype in libutf.h. */
rune_t
utf8_to_utf32be(rune_t c){
	rune_t top;
	rune_t upper;
	rune_t middle;
	rune_t low;

	/* Payload of a 4-byte lead (11110ZYY): input bits 24-26 land in output
	 * bits 18-20. */
	top = (c & 0x07000000) >> 6;

	/* Input bits 16-21 land in output bits 12-17. For a continuation byte
	 * (10YYXXXX) all six are payload. A 3-byte lead (1110XXXX) additionally
	 * has input bit 22 set, and its bit 21 is a marker rather than payload;
	 * toggling bit 21 whenever bit 22 is set removes that marker. */
	upper = c & 0x003F0000;
	upper ^= (c & ((rune_t)1 << 22)) >> 1;
	upper >>= 4;

	/* Input bits 8-13 land in output bits 6-11 (a 2-byte lead, 110WWWVV,
	 * has bit 13 clear, so the same mask works for it). */
	middle = (c & 0x00003F00) >> 2;

	/* Input bits 0-6 stay where they are: seven payload bits for ASCII, or
	 * six payload bits plus a zero bit for a continuation byte. */
	low = c & 0x0000007F;

	return top | upper | middle | low;
}
|
||||||
|
|
||||||
|
/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> is a good
|
||||||
|
* explanation of this. */
|
||||||
|
/* utf-32be bits
|
||||||
|
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
|
||||||
|
/* utf-32le bits
|
||||||
|
* U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
|
||||||
|
/* Reorder the bytes of a UTF-32BE value into UTF-32LE order.
 *
 * Only the low 21 bits of c are carried over: byte 0 moves to byte 3, byte 1
 * moves to byte 2, and the low five bits of byte 2 move to byte 1. Anything
 * above bit 20 of the input is dropped, so byte 0 of the result is always
 * zero.
 *
 * The return and parameter types are rune_t to agree with the prototype in
 * libutf.h. */
rune_t
utf32be_to_utf32le(rune_t c){
	rune_t lowest;
	rune_t middle;
	rune_t highest;

	lowest = (c & 0x000000FF) << 24;
	middle = (c & 0x0000FF00) << 8;
	highest = (c & 0x001F0000) >> 8;

	return lowest | middle | highest;
}
|
||||||
|
/* Reorder the bytes of a UTF-32LE value into UTF-32BE order.
 *
 * Byte 3 of c moves to byte 0, byte 2 moves to byte 1, and the low five bits
 * of byte 1 move to byte 2. Byte 0 of the input and the top three bits of
 * byte 1 are dropped, so the result never exceeds 21 bits.
 *
 * The return and parameter types are rune_t to agree with the prototype in
 * libutf.h. */
rune_t
utf32le_to_utf32be(rune_t c){
	rune_t lowest;
	rune_t middle;
	rune_t highest;

	lowest = (c & 0xFF000000) >> 24;
	middle = (c & 0x00FF0000) >> 8;
	highest = (c & 0x00001F00) << 8;

	return lowest | middle | highest;
}
|
||||||
|
/* Reverse the order of the four low bytes of c.
 *
 * The operation is its own inverse for 32-bit values: swab32(swab32(c))
 * yields c again. Unlike the dedicated UTF-32 converters it keeps every bit
 * of all four bytes instead of masking the value down to 21 bits, so it is
 * the one to use when the padding bits must survive the swap.
 *
 * The return and parameter types are rune_t to agree with the prototype in
 * libutf.h. */
rune_t
swab32(rune_t c){
	rune_t byte3_to_0;
	rune_t byte2_to_1;
	rune_t byte1_to_2;
	rune_t byte0_to_3;

	byte3_to_0 = (c & 0xFF000000) >> 24;
	byte2_to_1 = (c & 0x00FF0000) >> 8;
	byte1_to_2 = (c & 0x0000FF00) << 8;
	byte0_to_3 = (c & 0x000000FF) << 24;

	return byte3_to_0 | byte2_to_1 | byte1_to_2 | byte0_to_3;
}
|
14
src/libenc/libutf.h
Normal file
14
src/libenc/libutf.h
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
/* Include guard: without it a second inclusion would repeat the rune_t
 * typedef, which is an error before C11. */
#ifndef LIBUTF_H
#define LIBUTF_H

/* rune_t is an unsigned integer type of at least 32 bits, used both for
 * packed UTF-8 sequences and for UTF-32 values. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 type definitions */
# include <stdint.h>
typedef uint32_t rune_t;
#else
/* Must hold at least 32b; see the C89 draft 2.2.4.2
 * <http://jfxpt.com/library/c89-draft.html#2.2.4.2> */
typedef unsigned long int rune_t;
#endif

/* Reverse the order of the four low bytes of c. */
rune_t swab32(rune_t c);

/* Decode one UTF-8 sequence, packed with its final byte in bits 0-7, into
 * the codepoint it encodes. The input is not validated. */
rune_t utf8_to_utf32be(rune_t c);

/* Reorder a UTF-32BE value into UTF-32LE byte order, keeping 21 bits. */
rune_t utf32be_to_utf32le(rune_t c);

/* Reorder a UTF-32LE value into UTF-32BE byte order, keeping 21 bits. */
rune_t utf32le_to_utf32be(rune_t c);

#endif /* LIBUTF_H */
|
Loading…
Reference in New Issue
Block a user