1
0

split out utf8 encoding into library

This commit is contained in:
dtb 2023-09-04 08:47:52 -04:00
parent 3b3cd42e69
commit 0697a13fcd
4 changed files with 58 additions and 27 deletions

View File

@ -1,2 +1,12 @@
utf8: utf8.c
$(CC) -I../ascii -g -o utf8 utf8.c
utf8: utf8.o libunicode.o
$(CC) -g -o utf8 libunicode.o utf8.o
libunicode.o:
utf8.o: libunicode.h utf8.c
$(CC) -I../ascii -c -g -o utf8.o utf8.c
clean:
rm -f *.o utf8
.PHONY: clean

21
unicode/libunicode.c Normal file
View File

@ -0,0 +1,21 @@
#include "libunicode.h"
/* Encode the code point c as UTF-8 and return the resulting bytes packed into
 * a single integer: the leading byte occupies the most significant used
 * octet and the final byte occupies the low 8 bits.  A one-byte encoding is
 * therefore returned as a value below 0x80, a four-byte encoding fills all
 * 32 bits.
 *
 * No validation is performed: bits above bit 20 of c are discarded by the
 * masks below, and surrogate values are encoded like any other code point. */
utf8_t utf8(utf32_t c){
	unsigned char n;
	utf8_t r;

	r = 0;
	/* n counts how many thresholds c is below, which selects the entry
	 * point into the switch: 3 = one byte, 2 = two bytes, 1 = three
	 * bytes, 0 = four bytes.  Each case emits one byte and falls through
	 * to emit the remaining ones; the byte emitted at the entry case is
	 * the leading byte, every later one is a continuation byte. */
	switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){
	case 0: r = 0xF0 + ((c >> 18) & 0x07);          /* 11110xxx */
		/* fallthrough */
	case 1: r = (r << 8) + (n == 1
			? 0xE0 + ((c >> 12) & 0x0F)     /* 1110xxxx */
			: 0x80 + ((c >> 12) & 0x3F));   /* 10xxxxxx */
		/* fallthrough */
	case 2: r = (r << 8) + (n == 2
			? 0xC0 + ((c >>  6) & 0x1F)     /* 110xxxxx */
			: 0x80 + ((c >>  6) & 0x3F));   /* 10xxxxxx */
		/* fallthrough */
	case 3: r = (r << 8) + (n == 3
			? c & 0x7F                      /* 0xxxxxxx */
			: 0x80 + (c & 0x3F));           /* 10xxxxxx */
	}
	return r;
}

14
unicode/libunicode.h Normal file
View File

@ -0,0 +1,14 @@
/* libunicode.h - integer type aliases for Unicode values and the prototype
 * of the UTF-8 encoder.
 *
 * The aliases below are preprocessor macros, not typedefs. */

/* Pick the underlying integer type.  The test is evaluated where this header
 * is included, so uint32_t is only selected when <stdint.h> has already been
 * included before this file; otherwise unsigned long is used. */
#if defined UINT32_MAX /* indicator <stdint.h> is included */
# define unicode_codepoint_t uint32_t
#else
/* C99 draft 5.2.4.2.1 "Sizes of integer types" says unsigned long must be
 * able to hold at least 32 bits */
# define unicode_codepoint_t unsigned long
#endif
/* a UTF-32 value, i.e. a code point stored as a plain number */
#define utf32_t unicode_codepoint_t
/* for holding the literal numeric value of a UTF-8 rune (its bytes packed
 * into one integer), not for assembling strings; at least 32 bits wide */
#define utf8_t unicode_codepoint_t
/* Encode the UTF-32 value c into UTF-8.  The encoded bytes are returned
 * packed into one utf8_t: the first byte of the sequence is the most
 * significant occupied octet and the last byte is in the low 8 bits. */
utf8_t utf8(utf32_t c);

View File

@ -1,7 +1,8 @@
#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */
#include <string.h> /* memset(3) */
#include <stdio.h> /* fprintf(3), getc(3), putc(3) stderr, stdin, EOF */
#include <string.h> /* strchr(3) */
#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,
* ASCII_HEXADECIMAL_DIGITS_UPPER */
#include "libunicode.h" /* utf8(3) */
#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)
static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER
@ -12,16 +13,15 @@ int main(int argc, char *argv[]){
int i;
int l; /* line counter */
char *n;
long unsigned int utf32_lit;
unsigned char utf8_bytes[8];
utf32_t codepoint;
utf8_t encoded;
c = '\0';
i = 0;
l = 1;
while(c != EOF){
memset(utf8_bytes, '0', sizeof utf8_bytes);
for(
i = 0, n = NULL, utf32_lit = 0;
i = 0, n = NULL, codepoint = 0;
(c = getc(stdin)) != '\n'
&& c != EOF
&& i < 10;
@ -41,7 +41,7 @@ int main(int argc, char *argv[]){
break;
}
if(n != NULL)
utf32_lit = (utf32_lit << 4) + (n - hex) % 16;
codepoint = (codepoint << 4) + (n - hex) % 16;
}
if(i < 3){
if(c != '\n' && c != EOF)
@ -50,24 +50,10 @@ int main(int argc, char *argv[]){
return 0;
continue;
}
/* something with the bit math is broken */
switch(i = (utf32_lit < 0x10000)
+ (utf32_lit < 0x0800)
+ (utf32_lit < 0x0080)){
case 0: utf8_bytes[0] =
0xF0 + ((utf32_lit >> 18) & 0x07); /* 11110xxx */
case 1: utf8_bytes[1] = i == 1
? 0xE0 + ((utf32_lit >> 12) & 0x0F) /* 1110xxxx */
: 0x50 + ((utf32_lit >> 12) & 0x3F); /* 10xxxxxx */
case 2: utf8_bytes[2] = i == 2
? 0xC0 + ((utf32_lit >> 6) & 0x1F) /* 110xxxxx */
: 0x50 + ((utf32_lit >> 6) & 0x3F); /* 10xxxxxx */
case 3: utf8_bytes[3] = i == 3
? utf8_bytes[3] = utf32_lit & 0x7F /* 0xxxxxxx */
: 0x50 + (utf32_lit & 0x3F); /* 10xxxxxx */
}
for( ; i < 4; ++i)
putc(utf8_bytes[i], stdout);
encoded = utf8(codepoint);
for(i = 3; i >= 0; --i)
if((encoded >> 8 * i) > 0 || i == 0)
putc(encoded >> 8 * i, stdout);
++l;
}
}