split out utf8 encoding into library
This commit is contained in:
parent
3b3cd42e69
commit
0697a13fcd
@ -1,2 +1,12 @@
|
||||
utf8: utf8.c
|
||||
$(CC) -I../ascii -g -o utf8 utf8.c
|
||||
utf8: utf8.o libunicode.o
|
||||
$(CC) -g -o utf8 libunicode.o utf8.o
|
||||
|
||||
# No recipe here: make's built-in .c.o rule does the compile. The prerequisites
# are listed so that editing the source or its header triggers a rebuild.
libunicode.o: libunicode.c libunicode.h
|
||||
|
||||
utf8.o: libunicode.h utf8.c
|
||||
$(CC) -I../ascii -c -g -o utf8.o utf8.c
|
||||
|
||||
clean:
|
||||
rm -f *.o utf8
|
||||
|
||||
.PHONY: clean
|
||||
|
21
unicode/libunicode.c
Normal file
21
unicode/libunicode.c
Normal file
@ -0,0 +1,21 @@
|
||||
#include "libunicode.h"
|
||||
|
||||
/* Encode the code point c as UTF-8.
 *
 * The encoded bytes are returned packed into a single integer: the lead byte
 * ends up in the most significant used byte and the final byte in the low
 * 8 bits, so a caller can emit them by shifting right 8 bits at a time.
 *   c <  0x80     -> 1 byte   0xxxxxxx
 *   c <  0x800    -> 2 bytes  110xxxxx 10xxxxxx
 *   c <  0x10000  -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
 *   otherwise     -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Bits of c above the low 21 are discarded by the masks below. */
utf8_t utf8(utf32_t c){
	unsigned char n;
	utf8_t r;

	r = 0;
	/* n counts how many thresholds c is under: 3 means a 1-byte sequence,
	 * 0 means a 4-byte sequence.  Execution enters at the lead byte for
	 * that length and falls through to append each continuation byte. */
	switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){
	case 0: r = 0xF0 + ((c >> 18) & 0x07);        /* 11110xxx */
		/* fallthrough */
	case 1: r = (r << 8) + (n == 1
			? 0xE0 + ((c >> 12) & 0x0F)   /* 1110xxxx */
			: 0x80 + ((c >> 12) & 0x3F)); /* 10xxxxxx */
		/* fallthrough */
	case 2: r = (r << 8) + (n == 2
			? 0xC0 + ((c >>  6) & 0x1F)   /* 110xxxxx */
			: 0x80 + ((c >>  6) & 0x3F)); /* 10xxxxxx */
		/* fallthrough */
	case 3: r = (r << 8) + (n == 3
			? c & 0x7F                    /* 0xxxxxxx */
			: 0x80 + (c & 0x3F));         /* 10xxxxxx */
	}
	return r;
}
|
14
unicode/libunicode.h
Normal file
14
unicode/libunicode.h
Normal file
@ -0,0 +1,14 @@
|
||||
/* Pick an unsigned integer type of at least 32 bits for code point values. */
#ifndef UINT32_MAX
/* <stdint.h> has not been included (it would define UINT32_MAX).  C99 draft
 * 5.2.4.2.1, "Sizes of integer types", says unsigned long must be able to
 * hold 32 bits, so fall back to that. */
#	define unicode_codepoint_t unsigned long
#else
#	define unicode_codepoint_t uint32_t
#endif

/* a UTF-32 value */
#define utf32_t unicode_codepoint_t

/* The literal numeric value of one UTF-8 rune, at least 32 bits wide.  It is
 * not meant for assembling strings. */
#define utf8_t unicode_codepoint_t

/* Encode the UTF-32 value c into UTF-8. */
utf8_t utf8(utf32_t c);
|
@ -1,7 +1,8 @@
|
||||
#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */
|
||||
#include <string.h> /* memset(3) */
|
||||
#include <stdio.h> /* fprintf(3), getc(3), putc(3) stderr, stdin, EOF */
|
||||
#include <string.h> /* strchr(3) */
|
||||
#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,
|
||||
* ASCII_HEXADECIMAL_DIGITS_UPPER */
|
||||
#include "libunicode.h" /* utf8(3) */
|
||||
#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)
|
||||
|
||||
static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER
|
||||
@ -12,16 +13,15 @@ int main(int argc, char *argv[]){
|
||||
int i;
|
||||
int l; /* line counter */
|
||||
char *n;
|
||||
long unsigned int utf32_lit;
|
||||
unsigned char utf8_bytes[8];
|
||||
utf32_t codepoint;
|
||||
utf8_t encoded;
|
||||
|
||||
c = '\0';
|
||||
i = 0;
|
||||
l = 1;
|
||||
while(c != EOF){
|
||||
memset(utf8_bytes, '0', sizeof utf8_bytes);
|
||||
for(
|
||||
i = 0, n = NULL, utf32_lit = 0;
|
||||
i = 0, n = NULL, codepoint = 0;
|
||||
(c = getc(stdin)) != '\n'
|
||||
&& c != EOF
|
||||
&& i < 10;
|
||||
@ -41,7 +41,7 @@ int main(int argc, char *argv[]){
|
||||
break;
|
||||
}
|
||||
if(n != NULL)
|
||||
utf32_lit = (utf32_lit << 4) + (n - hex) % 16;
|
||||
codepoint = (codepoint << 4) + (n - hex) % 16;
|
||||
}
|
||||
if(i < 3){
|
||||
if(c != '\n' && c != EOF)
|
||||
@ -50,24 +50,10 @@ int main(int argc, char *argv[]){
|
||||
return 0;
|
||||
continue;
|
||||
}
|
||||
/* something with the bit math is broken */
|
||||
switch(i = (utf32_lit < 0x10000)
|
||||
+ (utf32_lit < 0x0800)
|
||||
+ (utf32_lit < 0x0080)){
|
||||
case 0: utf8_bytes[0] =
|
||||
0xF0 + ((utf32_lit >> 18) & 0x07); /* 11110xxx */
|
||||
case 1: utf8_bytes[1] = i == 1
|
||||
? 0xE0 + ((utf32_lit >> 12) & 0x0F) /* 1110xxxx */
|
||||
: 0x50 + ((utf32_lit >> 12) & 0x3F); /* 10xxxxxx */
|
||||
case 2: utf8_bytes[2] = i == 2
|
||||
? 0xC0 + ((utf32_lit >> 6) & 0x1F) /* 110xxxxx */
|
||||
: 0x50 + ((utf32_lit >> 6) & 0x3F); /* 10xxxxxx */
|
||||
case 3: utf8_bytes[3] = i == 3
|
||||
? utf8_bytes[3] = utf32_lit & 0x7F /* 0xxxxxxx */
|
||||
: 0x50 + (utf32_lit & 0x3F); /* 10xxxxxx */
|
||||
}
|
||||
for( ; i < 4; ++i)
|
||||
putc(utf8_bytes[i], stdout);
|
||||
encoded = utf8(codepoint);
|
||||
for(i = 3; i >= 0; --i)
|
||||
if((encoded >> 8 * i) > 0 || i == 0)
|
||||
putc(encoded >> 8 * i, stdout);
|
||||
++l;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user