split out utf8 encoding into library
This commit is contained in:
parent
3b3cd42e69
commit
0697a13fcd
@ -1,2 +1,12 @@
|
|||||||
utf8: utf8.c
|
utf8: utf8.o libunicode.o
|
||||||
$(CC) -I../ascii -g -o utf8 utf8.c
|
$(CC) -g -o utf8 libunicode.o utf8.o
|
||||||
|
|
||||||
|
libunicode.o:
|
||||||
|
|
||||||
|
utf8.o: libunicode.h utf8.c
|
||||||
|
$(CC) -I../ascii -c -g -o utf8.o utf8.c
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f *.o utf8
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
21
unicode/libunicode.c
Normal file
21
unicode/libunicode.c
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
#include "libunicode.h"
|
||||||
|
|
||||||
|
/* Encode the UTF-32 value c as UTF-8.
 *
 * The one to four encoded bytes are returned packed into a single integer:
 * the lead byte occupies the most significant used byte and the final byte
 * the low 8 bits (e.g. 0xE9 -> 0xC3A9, 0x20AC -> 0xE282AC). A caller emits
 * the sequence by writing the non-zero bytes from most to least significant.
 *
 * c is not validated: only the low 21 bits are encoded (anything above is
 * masked off) and surrogate values are encoded like any other value. */
utf8_t utf8(utf32_t c){
	unsigned char n; /* 3, 2, 1 or 0 for a 1, 2, 3 or 4 byte sequence */
	utf8_t r;

	r = 0;
	/* Entry point selects the lead byte; every case then falls through so
	 * that each remaining byte is shifted in as a continuation byte.
	 * Continuation bytes are 10xxxxxx, i.e. 0x80 plus six payload bits. */
	switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){
	case 0: r = 0xF0 + ((c >> 18) & 0x07);        /* 11110xxx */
		/* FALLTHROUGH */
	case 1: r = (r << 8) + (n == 1
			? 0xE0 + ((c >> 12) & 0x0F)   /* 1110xxxx */
			: 0x80 + ((c >> 12) & 0x3F)); /* 10xxxxxx */
		/* FALLTHROUGH */
	case 2: r = (r << 8) + (n == 2
			? 0xC0 + ((c >>  6) & 0x1F)   /* 110xxxxx */
			: 0x80 + ((c >>  6) & 0x3F)); /* 10xxxxxx */
		/* FALLTHROUGH */
	case 3: r = (r << 8) + (n == 3
			? c & 0x7F                    /* 0xxxxxxx */
			: 0x80 + (c & 0x3F));         /* 10xxxxxx */
	}
	return r;
}
|
14
unicode/libunicode.h
Normal file
14
unicode/libunicode.h
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
/* Pick the integer type used for code points: uint32_t when <stdint.h> has
 * already been included by the time this header is read (UINT32_MAX being
 * defined is used as the indicator), otherwise unsigned long. */
#if defined UINT32_MAX /* indicator <stdint.h> is included */
|
||||||
|
# define unicode_codepoint_t uint32_t
|
||||||
|
#else
|
||||||
|
/* C99 draft 5.2.4.2.1 Sizes of integer types says unsigned long must be able
|
||||||
|
* to hold 32b */
|
||||||
|
# define unicode_codepoint_t unsigned long
|
||||||
|
#endif
|
||||||
|
/* a UTF-32 value, i.e. the input type of utf8() */
#define utf32_t unicode_codepoint_t
|
||||||
|
/* for holding the literal numeric value of a utf8 rune, not for assembling
|
||||||
|
* strings; >=32b value */
|
||||||
|
#define utf8_t unicode_codepoint_t
|
||||||
|
|
||||||
|
/* encode UTF-32 value into UTF-8
 *
 * The encoded bytes are returned packed into one utf8_t: each byte is shifted
 * in 8 bits below the previous one, so the lead byte ends up in the most
 * significant used byte and the last byte in the low 8 bits. */
|
||||||
|
utf8_t utf8(utf32_t c);
|
@ -1,7 +1,8 @@
|
|||||||
#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */
|
#include <stdio.h> /* fprintf(3), getc(3), putc(3) stderr, stdin, EOF */
|
||||||
#include <string.h> /* memset(3) */
|
#include <string.h> /* strchr(3) */
|
||||||
#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,
|
#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,
|
||||||
* ASCII_HEXADECIMAL_DIGITS_UPPER */
|
* ASCII_HEXADECIMAL_DIGITS_UPPER */
|
||||||
|
#include "libunicode.h" /* utf8(3) */
|
||||||
#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)
|
#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)
|
||||||
|
|
||||||
static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER
|
static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER
|
||||||
@ -12,16 +13,15 @@ int main(int argc, char *argv[]){
|
|||||||
int i;
|
int i;
|
||||||
int l; /* line counter */
|
int l; /* line counter */
|
||||||
char *n;
|
char *n;
|
||||||
long unsigned int utf32_lit;
|
utf32_t codepoint;
|
||||||
unsigned char utf8_bytes[8];
|
utf8_t encoded;
|
||||||
|
|
||||||
c = '\0';
|
c = '\0';
|
||||||
i = 0;
|
i = 0;
|
||||||
l = 1;
|
l = 1;
|
||||||
while(c != EOF){
|
while(c != EOF){
|
||||||
memset(utf8_bytes, '0', sizeof utf8_bytes);
|
|
||||||
for(
|
for(
|
||||||
i = 0, n = NULL, utf32_lit = 0;
|
i = 0, n = NULL, codepoint = 0;
|
||||||
(c = getc(stdin)) != '\n'
|
(c = getc(stdin)) != '\n'
|
||||||
&& c != EOF
|
&& c != EOF
|
||||||
&& i < 10;
|
&& i < 10;
|
||||||
@ -41,7 +41,7 @@ int main(int argc, char *argv[]){
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if(n != NULL)
|
if(n != NULL)
|
||||||
utf32_lit = (utf32_lit << 4) + (n - hex) % 16;
|
codepoint = (codepoint << 4) + (n - hex) % 16;
|
||||||
}
|
}
|
||||||
if(i < 3){
|
if(i < 3){
|
||||||
if(c != '\n' && c != EOF)
|
if(c != '\n' && c != EOF)
|
||||||
@ -50,24 +50,10 @@ int main(int argc, char *argv[]){
|
|||||||
return 0;
|
return 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* something with the bit math is broken */
|
encoded = utf8(codepoint);
|
||||||
switch(i = (utf32_lit < 0x10000)
|
for(i = 3; i >= 0; --i)
|
||||||
+ (utf32_lit < 0x0800)
|
if((encoded >> 8 * i) > 0 || i == 0)
|
||||||
+ (utf32_lit < 0x0080)){
|
putc(encoded >> 8 * i, stdout);
|
||||||
case 0: utf8_bytes[0] =
|
|
||||||
0xF0 + ((utf32_lit >> 18) & 0x07); /* 11110xxx */
|
|
||||||
case 1: utf8_bytes[1] = i == 1
|
|
||||||
? 0xE0 + ((utf32_lit >> 12) & 0x0F) /* 1110xxxx */
|
|
||||||
: 0x50 + ((utf32_lit >> 12) & 0x3F); /* 10xxxxxx */
|
|
||||||
case 2: utf8_bytes[2] = i == 2
|
|
||||||
? 0xC0 + ((utf32_lit >> 6) & 0x1F) /* 110xxxxx */
|
|
||||||
: 0x50 + ((utf32_lit >> 6) & 0x3F); /* 10xxxxxx */
|
|
||||||
case 3: utf8_bytes[3] = i == 3
|
|
||||||
? utf8_bytes[3] = utf32_lit & 0x7F /* 0xxxxxxx */
|
|
||||||
: 0x50 + (utf32_lit & 0x3F); /* 10xxxxxx */
|
|
||||||
}
|
|
||||||
for( ; i < 4; ++i)
|
|
||||||
putc(utf8_bytes[i], stdout);
|
|
||||||
++l;
|
++l;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user