/* vi: set ts=2:
 * +-------------------+  Christian Schlittchen
 * |                   |  Enno Rehling
 * | Eressea PBEM host |  Katja Zedel
 * | (c) 1998 - 2007   |
 * |                   |  This program may not be used, modified or distributed
 * +-------------------+  without prior permission by the authors of Eressea.
 *
 */

/**
 * Decode the first UTF-8 encoded character of a NUL-terminated string
 * into a UCS-4 code point.
 *
 * Sequences of one to six bytes are understood (the original UTF-8
 * layout, lead bytes 0x00-0x7F and 0xC0-0xFD).
 *
 * @param ucs4_character  receives the decoded code point on success.
 * @param utf8_string     NUL-terminated UTF-8 input; must not be NULL.
 * @param length          receives the number of input bytes consumed on
 *                        success.
 * @return 0 on success, EILSEQ if the lead byte is not a valid UTF-8
 *         lead byte or if any trailing byte is not a continuation byte
 *         (10xxxxxx). A sequence cut short by the terminating NUL is
 *         reported as EILSEQ as well.
 *
 * On failure neither *ucs4_character nor *length is modified.
 *
 * NOTE(review): overlong encodings and surrogate code points are decoded
 * without complaint, as before; reject them in the caller if required.
 */
int
unicode_utf8_to_ucs4(wint_t *ucs4_character, const xmlChar *utf8_string,
                     size_t *length)
{
  xmlChar lead = utf8_string[0];
  unsigned long value;
  size_t size;
  size_t i;

  /* Plain ASCII (including the terminating NUL) is copied verbatim. */
  if (lead <= 0x7F)
  {
    *ucs4_character = lead;
    *length = 1;
    return 0;
  }

  /* The lead byte fixes the sequence length and carries the top bits. */
  if ((lead & 0xE0) == 0xC0)
  {
    size = 2;
    value = lead & 0x1F;
  }
  else if ((lead & 0xF0) == 0xE0)
  {
    size = 3;
    value = lead & 0x0F;
  }
  else if ((lead & 0xF8) == 0xF0)
  {
    size = 4;
    value = lead & 0x07;
  }
  else if ((lead & 0xFC) == 0xF8)
  {
    size = 5;
    value = lead & 0x03;
  }
  else if ((lead & 0xFE) == 0xFC)
  {
    size = 6;
    value = lead & 0x01;
  }
  else
  {
    /* Stray continuation byte (0x80-0xBF) or 0xFE/0xFF. */
    return EILSEQ;
  }

  /* Every trailing byte must be 10xxxxxx. The bytes are examined in
   * order and decoding stops at the first bad one, so a NUL terminator
   * inside the sequence is rejected and nothing behind it is read. */
  for (i = 1; i < size; ++i)
  {
    xmlChar next = utf8_string[i];

    if ((next & 0xC0) != 0x80)
    {
      return EILSEQ;
    }
    value = (value << 6) | (unsigned long)(next & 0x3F);
  }

  *ucs4_character = (wint_t)value;
  *length = size;
  return 0;
}