WIP: utf(1) #141

Closed
trinity wants to merge 8 commits from code into main
Showing only changes of commit dc35c0142b - Show all commits

View File

@ -33,17 +33,17 @@ swab32(rune_t c){ return
* and 1; letters denote nybbles here, little-endian, and certain bits are * and 1; letters denote nybbles here, little-endian, and certain bits are
* labeled, also little-endian. */ * labeled, also little-endian. */
/* utf-8 bits |32 |24 |16 |8 /* utf-8 bits |32 |24 |16
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU * * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
size_t size_t
utf8_size(rune_t c){ return utf8_size(rune_t c){ return
1 + ((c & 0x80000000) >> 31) /* 4B? */ ((c & 0x80000000) >> 31) /* 4B? */
+ ((c & 0x00800000) >> 23) /* 3B? */ + ((c & 0x00800000) >> 23) /* 3B? */
+ ((c & 0x00008000) >> 15) /* 2B? */ + ((c & 0x00008000) >> 15) /* 2B? */
+ ((c & 0x00000080) >> 7); /* 1B? */ + 1;
} }
char * char *
@ -88,27 +88,41 @@ utf8_to_utf32be(rune_t c){ return
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
rune_t rune_t
utf32be_to_utf8(rune_t c, size_t m){ utf32be_to_utf8(rune_t c, size_t m){
size_t n;
rune_t r; rune_t r;
{ size_t n;
if((n = utf8_size(c)) > m) if((n = utf8_size(c)) > m)
m = n; } /* This avoids calculating the size twice. */ m = n;
switch(m){ /* "Trin's device" if this is a novel use of a switch. */
case 4: r = 0xF0000000 /* UTF-8 b32-29 */ /* In tested compilers this generates roughly the same assembly as the
* naive (no fallthroughs) approach. */
switch(m){
case 4:
r = 0xF0000000 /* UTF-8 b32-29 */
| ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */ | ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */
| 0x00800000 /* UTF-8 b24-23 */
r |= 0x00800000 /* UTF-8 b24-23 */
| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b22-17 */ | ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b22-17 */
if(m == 3)
case 3: r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */ if(0) /* if(m == 3) */
case 3:
r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */
| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */ | ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
r |= 0x00008000 /* UTF-8 b16-15 */ r |= 0x00008000 /* UTF-8 b16-15 */
| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */ | ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
if(m == 2)
case 2: r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */ if(0) /* if(m == 2) */
case 2:
r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */
| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */ | ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
r |= 0x00000080 /* UTF-8 b 8- 7 */ r |= 0x00000080 /* UTF-8 b 8- 7 */
| (c & 0x00003F); /* UTF-8 b 6- 1 */ | (c & 0x00003F); /* UTF-8 b 6- 1 */
if(m == 1)
case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */ break; /* if(m == 1) */
case 1:
r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */
} }
return r; return r;
} }