libutf: utf8_size: fix off-by-one for >1 retvals, utf32be_to_utf8: tweak to make more readable

This commit is contained in:
dtb 2024-05-31 09:39:26 -06:00
parent dc4091b43f
commit dc35c0142b
Signed by: trinity
GPG Key ID: 6CDF230C17CC3349

View File

@ -33,17 +33,17 @@ swab32(rune_t c){ return
* and 1; letters denote nybbles here, little-endian, and certain bits are
* labeled, also little-endian. */
/* utf-8 bits |32 |24 |16 |8
/* utf-8 bits |32 |24 |16
* U+001000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU *
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
/* Returns the length in bytes (1 to 4) of the UTF-8 sequence packed into c
 * using the layout in the table above (lead byte in the most significant
 * occupied position, final byte in bits 7-0).
 *
 * Every byte of a multi-byte sequence has its top bit set: the lead byte
 * starts with 11 and each continuation byte with 10. Counting the top bits of
 * the three upper bytes and adding one for the always-present low byte
 * therefore gives the length. Bit 7 must NOT be counted: it is set in the
 * last byte of every multi-byte sequence, so including it reports 3/4/5 for
 * 2/3/4-byte sequences.
 *
 * NOTE(review): the result is only meaningful when c holds a well-formed
 * packed UTF-8 sequence; other bit patterns are not validated here. */
size_t
utf8_size(rune_t c){ return
	  ((c & 0x80000000) >> 31) /* 4B? */
	+ ((c & 0x00800000) >> 23) /* 3B? */
	+ ((c & 0x00008000) >> 15) /* 2B? */
	+ 1;                       /* the low byte is always present */
}
char *
@ -88,27 +88,41 @@ utf8_to_utf32be(rune_t c){ return
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
/* Encodes the code point c as UTF-8 and returns the bytes packed into a
 * rune_t using the layout in the table above: lead byte in the most
 * significant occupied position, final byte in bits 7-0, unused upper bytes
 * zero.
 *
 * m is the minimum length, in bytes, of the encoding to produce. It is raised
 * to the shortest length that can hold c, so passing 0 or 1 always yields the
 * shortest form. A larger m yields a longer (overlong) form of the same code
 * point. Values of m above 4 are treated as 4, since no more than four bytes
 * fit in the 32-bit layout above.
 *
 * The required length is derived from the magnitude of c rather than from
 * utf8_size(): utf8_size() inspects the top bit of each byte of an
 * already-encoded sequence, which says nothing about an unencoded code point
 * (U+00E9, for example, has none of those bits set but needs two bytes).
 *
 * NOTE(review): bits of c above bit 20 are discarded by the 4-byte masks; c
 * is presumably expected to be at most U+10FFFF -- confirm against callers. */
rune_t
utf32be_to_utf8(rune_t c, size_t m){
	size_t n;

	/* Shortest UTF-8 length able to represent c. */
	n = c < 0x000080 ? 1
	  : c < 0x000800 ? 2
	  : c < 0x010000 ? 3
	  :                4;
	if(n > m)
		m = n;

	switch(m){
	case 1:
		return c & 0x00007F;            /* 0VVV UUUU */
	case 2:
		return 0x0000C080               /* 110. ....  10.. .... */
			| ((c & 0x0007C0) << 2) /* UTF-32 b11- 7 -> UTF-8 b13- 9 */
			|  (c & 0x00003F);      /* UTF-32 b 6- 1 -> UTF-8 b 6- 1 */
	case 3:
		return 0x00E08080               /* 1110 ....  10.. ....  10.. .... */
			| ((c & 0x00F000) << 4) /* UTF-32 b16-13 -> UTF-8 b20-17 */
			| ((c & 0x000FC0) << 2) /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
			|  (c & 0x00003F);      /* UTF-32 b 6- 1 -> UTF-8 b 6- 1 */
	default: /* 4, or a larger request clamped to what fits in 32 bits */
		return 0xF0808080               /* 1111 0...  10.. ....  10.. ....  10.. .... */
			| ((c & 0x1C0000) << 6) /* UTF-32 b21-19 -> UTF-8 b27-25 */
			| ((c & 0x03F000) << 4) /* UTF-32 b18-13 -> UTF-8 b22-17 */
			| ((c & 0x000FC0) << 2) /* UTF-32 b12- 7 -> UTF-8 b14- 9 */
			|  (c & 0x00003F);      /* UTF-32 b 6- 1 -> UTF-8 b 6- 1 */
	}
}