libutf: clean up comments

This commit is contained in:
dtb 2024-05-29 19:56:41 -06:00
parent 146ea609b6
commit 55fdca9123
Signed by: trinity
GPG Key ID: 6CDF230C17CC3349
2 changed files with 24 additions and 15 deletions

View File

@ -9,9 +9,29 @@ typedef unsigned long int rune_t;
#endif #endif
#include <stddef.h> /* size_t */ #include <stddef.h> /* size_t */
/* Reverses the order of the bytes in the 32-bit value c. */
rune_t swab32(rune_t c); rune_t swab32(rune_t c);
/* Returns the byte length of the valid UTF-8 rune c. */
size_t utf8_size(rune_t c);
/* Returns the UTF-32BE codepoint of the UTF-8 rune c. */
rune_t utf8_to_utf32be(rune_t c); rune_t utf8_to_utf32be(rune_t c);
/* Stores the UTF-8 rune c as bytes to the memory span s. s should point to a
* big enough memory span of chars in which to store c, a (possibly invalid)
* UTF-8 rune. Returns a pointer to the memory location after the last written
* byte. Returns NULL if n is not 0 and n is less than the number of bytes that
* will be written. */
char *utf8_to_chars(rune_t c, char *s, size_t n); char *utf8_to_chars(rune_t c, char *s, size_t n);
/* Returns the UTF-8 encoding of the UTF-32BE codepoint c. m is the minimum
* number of bytes into which to encode the codepoint c. If m is greater than
* 0, this function may return overlong-encoded UTF-8. */
rune_t utf32be_to_utf8(rune_t c, size_t m); rune_t utf32be_to_utf8(rune_t c, size_t m);
/* Returns the UTF-32LE codepoint of the UTF-32BE codepoint c. */
rune_t utf32be_to_utf32le(rune_t c); rune_t utf32be_to_utf32le(rune_t c);
/* Returns the UTF-32BE codepoint of the UTF-32LE codepoint c. */
rune_t utf32le_to_utf32be(rune_t c); rune_t utf32le_to_utf32be(rune_t c);

View File

@ -18,9 +18,6 @@ chars_to_utf8(char **np, rune_t *rp){
return rp; return rp;
} }
/* This is functionally equivalent to the UTF-32-specific conversion functions
* but very slightly slower than each. */
/* This operation is symmetrical; swab32(swab32(c)) will always return c. */
/* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */ /* big-endian | ZZZZ YYYY | XXXX WWWW | VVVV UUUU | TTTT SSSS */
/* little-endian | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */ /* little-endian | TTTT SSSS | VVVV UUUU | XXXX WWWW | ZZZZ YYYY */
rune_t rune_t
@ -49,10 +46,6 @@ utf8_size(rune_t c){ return
+ ((c & 0x00000080) >> 7); /* 1B? */ + ((c & 0x00000080) >> 7); /* 1B? */
} }
/* s should point to a big enough memory span of chars in which to store c, a
* (possibly invalid) UTF-8 rune. Returns a pointer to the memory location
* after the last written byte. Returns NULL if n is not 0 and n is less than
* the number of bytes that will be written. */
char * char *
utf8_to_chars(rune_t c, char *s, size_t n){ utf8_to_chars(rune_t c, char *s, size_t n){
size_t i; size_t i;
@ -93,8 +86,6 @@ utf8_to_utf32be(rune_t c){ return
* U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU * * U+000800 to U+00FFFF | 0000 0000 | 1110 XXXX | 10WW WWVV | 10VV UUUU *
* U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU * * U+000080 to U+0007FF | 0000 0000 | 0000 0000 | 110W WWVV | 10VV UUUU *
* U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */ * U+000000 to U+00007F | 0000 0000 | 0000 0000 | 0000 0000 | 0VVV UUUU */
/* m is the minimum amount of bytes into which to encode the codepoint c. If m
* is greater than 0, this function may return overlong-encoded UTF-8. */
rune_t rune_t
utf32be_to_utf8(rune_t c, size_t m){ utf32be_to_utf8(rune_t c, size_t m){
rune_t r; rune_t r;
@ -122,18 +113,16 @@ utf32be_to_utf8(rune_t c, size_t m){
return r; return r;
} }
/* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> is a good /* <https://www.herongyang.com/Unicode/UTF-32-UTF-32-Encoding.html> */
* explanation of this. */ /* utf-32be bits | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
/* utf-32be bits /* utf-32le bits | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
* U+000000 to U+10FFFF | 0000 0000 | 000Z YYYY | XXXX WWWW | VVVV UUUU */
/* utf-32le bits
* U+000000 to U+10FFFF | VVVV UUUU | XXXX WWWW | 000Z YYYY | 0000 0000 */
rune_t rune_t
utf32be_to_utf32le(rune_t c){ return utf32be_to_utf32le(rune_t c){ return
((c & 0x000000FF) << 24) ((c & 0x000000FF) << 24)
| ((c & 0x0000FF00) << 8) | ((c & 0x0000FF00) << 8)
| ((c & 0x001F0000) >> 8); | ((c & 0x001F0000) >> 8);
} }
rune_t rune_t
utf32le_to_utf32be(rune_t c){ return utf32le_to_utf32be(rune_t c){ return
((c & 0xFF000000) >> 24) ((c & 0xFF000000) >> 24)