split out utf8 encoding into library

trinity/src

Fork 0

Browse Source

This commit is contained in:

dtb

2023-09-04 08:47:52 -04:00

parent 3b3cd42e69

commit 0697a13fcd

4 changed files with 58 additions and 27 deletions

									
										14

unicode/Makefile
									
												View File
												
					@@ -1,2 +1,12 @@

					utf8: utf8.c

					utf8: utf8.o libunicode.o

						$(CC) -I../ascii -g -o utf8 utf8.c

						$(CC) -g -o utf8 libunicode.o utf8.o

					# Rebuild the library object whenever its source or header changes; built
					# with -g like utf8.o so the whole program carries debug information.
					libunicode.o: libunicode.h libunicode.c
						$(CC) -c -g -o libunicode.o libunicode.c

					utf8.o: libunicode.h utf8.c

						$(CC) -I../ascii -c -g -o utf8.o utf8.c

					clean:

						rm -f *.o utf8

					.PHONY: clean

									
										21

unicode/libunicode.c
									
										Normal file
									
												View File
												
					@@ -0,0 +1,21 @@

					#include "libunicode.h"

					/* Encode the UTF-32 code point c as UTF-8.
					 *
					 * The resulting 1 to 4 bytes are returned packed into one integer, lead
					 * byte in the most significant occupied octet and final byte in the least
					 * significant one (e.g. U+00E9 -> 0xC3A9). Code points below 0x80 are
					 * returned unchanged.
					 *
					 * No validation is performed: every value of 0x10000 or more is encoded as
					 * a 4 byte sequence from its low 21 bits, and higher bits are discarded. */
					utf8_t utf8(utf32_t c){
						unsigned char n; /* 3, 2, 1, 0 selects a 1, 2, 3, 4 byte sequence */
						utf8_t r;

						r = 0;
						/* Enter at the case matching the sequence length; each case appends one
						 * byte to r and deliberately falls through to the next. The byte a
						 * sequence starts on is a lead byte, every later one is a continuation
						 * byte, which must carry the 10xxxxxx (0x80) marker. */
						switch(n = (c < 0x10000) + (c < 0x0800) + (c < 0x0080)){
						case 0: r = 0xF0 + ((c >> 18) & 0x07); /* 11110xxx */
							/* FALLTHROUGH */
						case 1:	r = (r << 8) + (n == 1
							? 0xE0 + ((c >> 12) & 0x0F) /* 1110xxxx */
							: 0x80 + ((c >> 12) & 0x3F)); /* 10xxxxxx */
							/* FALLTHROUGH */
						case 2:	r = (r << 8) + (n == 2
							? 0xC0 + ((c >> 6) & 0x1F) /* 110xxxxx */
							: 0x80 + ((c >> 6) & 0x3F)); /* 10xxxxxx */
							/* FALLTHROUGH */
						case 3:	r = (r << 8) + (n == 3
							? c & 0x7F /* 0xxxxxxx */
							: 0x80 + (c & 0x3F)); /* 10xxxxxx */
						}
						return r;
					}

									
										14

unicode/libunicode.h
									
										Normal file
									
												View File
												
					@@ -0,0 +1,14 @@

					/* Include guard: without it, including this header both before and after
					 * <stdint.h> would define unicode_codepoint_t twice with different
					 * replacement text. */
					#ifndef LIBUNICODE_H
					#define LIBUNICODE_H

					/* unicode_codepoint_t: an unsigned integer type of at least 32 bits. */
					#if defined UINT32_MAX /* indicator <stdint.h> is included */
					#	define unicode_codepoint_t uint32_t
					#else
					 /* C99 draft 5.2.4.2.1 Sizes of integer types says unsigned long must be able
					  * to hold 32b */
					#	define unicode_codepoint_t unsigned long
					#endif

					/* a single UTF-32 code unit, i.e. a code point value */
					#define utf32_t unicode_codepoint_t

					/* for holding the literal numeric value of a utf8 rune, not for assembling
					 * strings; >=32b value */
					#define utf8_t unicode_codepoint_t

					/* encode UTF-32 value into UTF-8 */
					utf8_t utf8(utf32_t c);

					#endif /* LIBUNICODE_H */

									
										36

unicode/utf8.c
									
												View File
												
					@@ -1,7 +1,8 @@

					#include <stdio.h> /* fprintf(3), getc(3), stderr, stdin, EOF */

					#include <stdio.h> /* fprintf(3), getc(3), putc(3) stderr, stdin, EOF */

					#include <string.h> /* memset(3) */

					#include <string.h> /* strchr(3) */

					#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,

					#include "ascii.h" /* ASCII_HEXADECIMAL_DIGITS_LOWER,

					                    * ASCII_HEXADECIMAL_DIGITS_UPPER */

					                    * ASCII_HEXADECIMAL_DIGITS_UPPER */

					#include "libunicode.h" /* utf8(3) */

					#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)

					#define SKIPLINE while((c = getc(stdin)) != '\n' && c != EOF)

					static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER

					static char *hex = ASCII_HEXADECIMAL_DIGITS_UPPER

					@@ -12,16 +13,15 @@ int main(int argc, char *argv[]){

						int i;

						int i;

						int l; /* line counter */

						int l; /* line counter */

						char *n;

						char *n;

						long unsigned int utf32_lit;

						utf32_t codepoint;

						unsigned char utf8_bytes[8];

						utf8_t encoded;

						c = '\0';

						c = '\0';

						i = 0;

						i = 0;

						l = 1;

						l = 1;

						while(c != EOF){

						while(c != EOF){

							memset(utf8_bytes, '0', sizeof utf8_bytes);

							for(

							for(

								i = 0, n = NULL, utf32_lit = 0;

								i = 0, n = NULL, codepoint = 0;

								(c = getc(stdin)) != '\n'

								(c = getc(stdin)) != '\n'

								&& c != EOF

								&& c != EOF

								&& i < 10;

								&& i < 10;

					@@ -41,7 +41,7 @@ int main(int argc, char *argv[]){

									break;

									break;

								}

								}

								if(n != NULL)

								if(n != NULL)

									utf32_lit = (utf32_lit << 4) + (n - hex) % 16;

									codepoint = (codepoint << 4) + (n - hex) % 16;

							}

							}

							if(i < 3){

							if(i < 3){

								if(c != '\n' && c != EOF)

								if(c != '\n' && c != EOF)

					@@ -50,24 +50,10 @@ int main(int argc, char *argv[]){

									return 0;

									return 0;

								continue;

								continue;

							}

							}

							/* something with the bit math is broken */

							encoded = utf8(codepoint);

							switch(i =	(utf32_lit < 0x10000)

							for(i = 3; i >= 0; --i)

								+ (utf32_lit < 0x0800)

								if((encoded >> 8 * i) > 0 || i == 0)

								+ (utf32_lit < 0x0080)){

									putc(encoded >> 8 * i, stdout);

							case 0: utf8_bytes[0] =

								0xF0 + ((utf32_lit >> 18) & 0x07); /* 11110xxx */

							case 1:	utf8_bytes[1] = i == 1

								? 0xE0 + ((utf32_lit >> 12) & 0x0F) /* 1110xxxx */

								: 0x50 + ((utf32_lit >> 12) & 0x3F); /* 10xxxxxx */

							case 2:	utf8_bytes[2] = i == 2

								? 0xC0 + ((utf32_lit >> 6) & 0x1F) /* 110xxxxx */

								: 0x50 + ((utf32_lit >> 6) & 0x3F); /* 10xxxxxx */

							case 3:	utf8_bytes[3] = i == 3

								? utf8_bytes[3] = utf32_lit & 0x7F /* 0xxxxxxx */

								: 0x50 + (utf32_lit & 0x3F); /* 10xxxxxx */

							}

							for( ; i < 4; ++i)

								putc(utf8_bytes[i], stdout);

							++l;

							++l;

						}

						}

					}

					}

split out utf8 encoding into library

14 unicode/Makefile Unescape Escape View File

21 unicode/libunicode.c Normal file Unescape Escape View File

14 unicode/libunicode.h Normal file Unescape Escape View File

36 unicode/utf8.c Unescape Escape View File

14

unicode/Makefile

View File

21

unicode/libunicode.c Normal file

View File

14

unicode/libunicode.h Normal file

View File

36

unicode/utf8.c

View File