libutf: utf8_size: fix off-by-one for >1 retvals, utf32be_to_utf8: tweak to make more readable
This commit is contained in:
parent
dc4091b43f
commit
dc35c0142b
64
src/libutf.c
64
src/libutf.c
@ -33,17 +33,17 @@ swab32(rune_t c){ return
|
||||
* and 1; letters denote nybbles here, little-endian, and certain bits are
|
||||
* labeled, also little-endian. */
|
||||
|
||||
/* utf-8 bits |32 |24 |16 |8
|
||||
/* utf-8 bits |32 |24 |16
|
||||
* U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
|
||||
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
|
||||
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
|
||||
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||
size_t
|
||||
utf8_size(rune_t c){ return
|
||||
1 + ((c & 0x80000000) >> 31) /* 4B? */
|
||||
+ ((c & 0x00800000) >> 23) /* 3B? */
|
||||
+ ((c & 0x00008000) >> 15) /* 2B? */
|
||||
+ ((c & 0x00000080) >> 7); /* 1B? */
|
||||
((c & 0x80000000) >> 31) /* 4B? */
|
||||
+ ((c & 0x00800000) >> 23) /* 3B? */
|
||||
+ ((c & 0x00008000) >> 15) /* 2B? */
|
||||
+ 1;
|
||||
}
|
||||
|
||||
char *
|
||||
@ -88,27 +88,41 @@ utf8_to_utf32be(rune_t c){ return
|
||||
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
|
||||
rune_t
|
||||
utf32be_to_utf8(rune_t c, size_t m){
|
||||
size_t n;
|
||||
rune_t r;
|
||||
{ size_t n;
|
||||
if((n = utf8_size(c)) > m)
|
||||
m = n; } /* This avoids calculating the size twice. */
|
||||
switch(m){ /* "Trin's device" if this is a novel use of a switch. */
|
||||
case 4: r = 0xF0000000 /* UTF-8 b32-29 */
|
||||
| ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */
|
||||
| 0x00800000 /* UTF-8 b24-23 */
|
||||
| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
|
||||
if(m == 3)
|
||||
case 3: r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */
|
||||
| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
|
||||
r |= 0x00008000 /* UTF-8 b16-15 */
|
||||
| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
|
||||
if(m == 2)
|
||||
case 2: r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */
|
||||
| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
|
||||
r |= 0x00000080 /* UTF-8 b 8- 7 */
|
||||
| (c & 0x00003F); /* UTF-8 b 6- 1 */
|
||||
if(m == 1)
|
||||
case 1: r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */
|
||||
|
||||
if((n = utf8_size(c)) > m)
|
||||
m = n;
|
||||
|
||||
/* In tested compilers this generates roughly the same assembly as the
|
||||
* naive (no fallthroughs) approach. */
|
||||
switch(m){
|
||||
case 4:
|
||||
r = 0xF0000000 /* UTF-8 b32-29 */
|
||||
| ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */
|
||||
|
||||
r |= 0x00800000 /* UTF-8 b24-23 */
|
||||
| ((c & 0x03F000) << 4); /* UTF-32 b18-13 -> UTF-8 b24-17 */
|
||||
|
||||
if(0) /* if(m == 3) */
|
||||
case 3:
|
||||
r = 0x00E00000 /* 0xE == 0b1110 */ /* UTF-8 b24-21 */
|
||||
| ((c & 0x00F000) << 4); /* UTF-32 b16-13 -> UTF-8 b20-17 */
|
||||
|
||||
r |= 0x00008000 /* UTF-8 b16-15 */
|
||||
| ((c & 0x000FC0) << 2); /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
|
||||
|
||||
if(0) /* if(m == 2) */
|
||||
case 2:
|
||||
r = 0x0000C000 /* 0xC == 0b1100 */ /* UTF-8 b16-14 */
|
||||
| ((c & 0x007C00) << 2); /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
|
||||
|
||||
r |= 0x00000080 /* UTF-8 b 8- 7 */
|
||||
| (c & 0x00003F); /* UTF-8 b 6- 1 */
|
||||
|
||||
break; /* if(m == 1) */
|
||||
case 1:
|
||||
r = c & 0x00007F; /* 0x7 == 0b0111 */ /* UTF-8 b 7- 1 */
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user