/*
 * UTF-8 sequences are carried around packed into a single rune_t, lead byte
 * in the most significant occupied byte.  Letters mark where the bits of the
 * code point end up:
 *
 * utf-8 bits           |32         |24         |16         |8
 * U+010000 to U+10FFFF | 1111 0ZYY | 10YY XXXX | 10WW WWVV | 10VV UUUU
 * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU
 * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU
 * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU
 */

/*
 * utf8_size - length in bytes of a packed UTF-8 sequence.
 *
 * c is a UTF-8 sequence packed as in the table above (NOT a code point).
 * In that layout the top bit of each of the three upper bytes is set exactly
 * when the byte belongs to the sequence, so the length is one (the low byte,
 * always present) plus the number of those top bits that are set.
 *
 * Returns a value from 1 to 4.  The input is not validated.
 */
size_t
utf8_size(rune_t c){
	return (size_t)((c & 0x80000000) >> 31) /* 4th byte present? */
	     + (size_t)((c & 0x00800000) >> 23) /* 3rd byte present? */
	     + (size_t)((c & 0x00008000) >> 15) /* 2nd byte present? */
	     + 1;                               /* low byte is always there */
}

/*
 * utf32be_to_utf8 - encode a code point as a packed UTF-8 sequence.
 *
 * c is the code point; m is the minimum number of bytes to use for the
 * encoding.  If m is smaller than the number of bytes c actually needs it is
 * raised to that number, so callers that do not know the size may pass 0.
 * If m is larger than needed the result is an overlong encoding of m bytes.
 * A packed sequence cannot hold more than four bytes, so m is capped at 4.
 *
 * Returns the sequence packed as described in the table above.
 */
rune_t
utf32be_to_utf8(rune_t c, size_t m){
	size_t n;
	rune_t r;

	/* Bytes required by the code point itself.  utf8_size() cannot be used
	 * for this: it inspects the marker bits of an already-encoded value. */
	n = c < 0x000080 ? 1
	  : c < 0x000800 ? 2
	  : c < 0x010000 ? 3
	  :                4;

	if(m < n)
		m = n;
	if(m > 4)
		m = 4; /* keeps every path below assigning r */

	switch(m){
	case 4:
		r = 0xF0808080                /* 1111 0... 10.. .... 10.. .... 10.. .... */
		  | ((c & 0x1C0000) << 6)     /* code point b21-19 -> UTF-8 b27-25 */
		  | ((c & 0x03F000) << 4)     /* code point b18-13 -> UTF-8 b22-17 */
		  | ((c & 0x000FC0) << 2)     /* code point b12- 7 -> UTF-8 b14- 9 */
		  |  (c & 0x00003F);          /* code point b 6- 1 -> UTF-8 b 6- 1 */
		break;
	case 3:
		r = 0x00E08080                /* 1110 .... 10.. .... 10.. .... */
		  | ((c & 0x00F000) << 4)     /* code point b16-13 -> UTF-8 b20-17 */
		  | ((c & 0x000FC0) << 2)     /* code point b12- 7 -> UTF-8 b14- 9 */
		  |  (c & 0x00003F);          /* code point b 6- 1 -> UTF-8 b 6- 1 */
		break;
	case 2:
		r = 0x0000C080                /* 110. .... 10.. .... */
		  | ((c & 0x0007C0) << 2)     /* code point b11- 7 -> UTF-8 b13- 9 */
		  |  (c & 0x00003F);          /* code point b 6- 1 -> UTF-8 b 6- 1 */
		break;
	default: /* 1 */
		r = c & 0x00007F;             /* plain ASCII, b 7- 1 */
		break;
	}

	return r;
}