server/src/util/unicode.c

573 lines
14 KiB
C
Raw Normal View History

2010-08-08 10:06:34 +02:00
/* vi: set ts=2:
* +-------------------+ Christian Schlittchen <corwin@amber.kn-bremen.de>
* | | Enno Rehling <enno@eressea.de>
* | Eressea PBEM host | Katja Zedel <katze@felidae.kn-bremen.de>
* | (c) 1998 - 2007 |
* | | This program may not be used, modified or distributed
* +-------------------+ without prior permission by the authors of Eressea.
*
*/
#include <platform.h>
#include "unicode.h"
#include <errno.h>
#include <wctype.h>
#define B00000000 0x00
#define B10000000 0x80
#define B11000000 0xC0
#define B11100000 0xE0
#define B11110000 0xF0
#define B11111000 0xF8
#define B11111100 0xFC
#define B11111110 0xFE
#define B00111111 0x3F
#define B00011111 0x1F
#define B00001111 0x0F
#define B00000111 0x07
#define B00000011 0x03
#define B00000001 0x01
2011-03-07 08:02:35 +01:00
int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip)
2010-08-08 10:06:34 +02:00
{
while (*ip) {
ucs4_t ucs = *ip;
ucs4_t low;
size_t size = 1;
if (ucs & 0x80) {
int ret = unicode_utf8_to_ucs4(&ucs, ip, &size);
2011-03-07 08:02:35 +01:00
if (ret != 0) {
2010-08-08 10:06:34 +02:00
return ret;
}
}
2011-03-07 08:02:35 +01:00
if (size > outlen) {
2010-08-08 10:06:34 +02:00
return ENOMEM;
}
2011-03-07 08:02:35 +01:00
low = towlower((wint_t) ucs);
if (low == ucs) {
2010-08-08 10:06:34 +02:00
memcpy(op, ip, size);
ip += size;
op += size;
2011-03-07 08:02:35 +01:00
outlen -= size;
2010-08-08 10:06:34 +02:00
} else {
ip += size;
unicode_ucs4_to_utf8(op, &size, low);
op += size;
2011-03-07 08:02:35 +01:00
outlen -= size;
2010-08-08 10:06:34 +02:00
}
}
2011-03-07 08:02:35 +01:00
if (outlen <= 0) {
2010-08-08 10:06:34 +02:00
return ENOMEM;
}
*op = 0;
return 0;
}
int
2011-03-08 07:31:06 +01:00
unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in,
2011-03-07 08:02:35 +01:00
size_t * inlen)
2010-08-08 10:06:34 +02:00
{
int is = (int)*inlen;
int os = (int)*outlen;
2011-03-07 08:02:35 +01:00
const char *ip = in;
2011-03-08 07:31:06 +01:00
unsigned char *out = (unsigned char *)dst, *op = out;
2011-03-07 08:02:35 +01:00
while (ip - in < is) {
2010-08-08 10:06:34 +02:00
unsigned char c = *ip;
if (c > 0xBF) {
2011-03-07 08:02:35 +01:00
if (op - out >= os - 1)
break;
2011-03-08 07:31:06 +01:00
*op++ = 0xC3;
2011-03-07 08:02:35 +01:00
*op++ = c - 64;
} else if (c > 0x7F) {
if (op - out >= os - 1)
break;
2011-03-08 07:31:06 +01:00
*op++ = 0xC2;
2010-08-08 10:06:34 +02:00
*op++ = c;
} else {
2011-03-07 08:02:35 +01:00
if (op - out >= os)
break;
2010-08-08 10:06:34 +02:00
*op++ = c;
}
++ip;
}
2011-03-07 08:02:35 +01:00
*outlen = op - out;
*inlen = ip - in;
2010-08-08 10:06:34 +02:00
return (int)*outlen;
}
2011-03-07 08:02:35 +01:00
int unicode_utf8_strcasecmp(const utf8_t * a, const char *b)
2010-08-08 10:06:34 +02:00
{
while (*a && *b) {
int ret;
size_t size;
ucs4_t ucsa = *a, ucsb = *b;
if (ucsa & 0x80) {
ret = unicode_utf8_to_ucs4(&ucsa, a, &size);
2011-03-07 08:02:35 +01:00
if (ret != 0)
return -1;
2010-08-08 10:06:34 +02:00
a += size;
2011-03-07 08:02:35 +01:00
} else
++a;
2010-08-08 10:06:34 +02:00
if (ucsb & 0x80) {
ret = unicode_utf8_to_ucs4(&ucsb, b, &size);
2011-03-07 08:02:35 +01:00
if (ret != 0)
return -1;
2010-08-08 10:06:34 +02:00
b += size;
2011-03-07 08:02:35 +01:00
} else
++b;
2010-08-08 10:06:34 +02:00
2011-03-07 08:02:35 +01:00
if (ucsb != ucsa) {
ucsb = towlower((wint_t) ucsb);
ucsa = towlower((wint_t) ucsa);
if (ucsb < ucsa)
return 1;
if (ucsb > ucsa)
return -1;
2010-08-08 10:06:34 +02:00
}
}
2011-03-07 08:02:35 +01:00
if (*b)
return -1;
if (*a)
return 1;
2010-08-08 10:06:34 +02:00
return 0;
}
/* Convert a UCS-4 character to UTF-8. */
int
2011-03-07 08:02:35 +01:00
unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size,
ucs4_t ucs4_character)
2010-08-08 10:06:34 +02:00
{
int utf8_bytes;
if (ucs4_character <= 0x0000007F) {
/* 0xxxxxxx */
utf8_bytes = 1;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)ucs4_character;
} else if (ucs4_character <= 0x000007FF) {
2010-08-08 10:06:34 +02:00
/* 110xxxxx 10xxxxxx */
utf8_bytes = 2;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)((ucs4_character >> 6) | B11000000);
utf8_character[1] = (char)((ucs4_character & B00111111) | B10000000);
} else if (ucs4_character <= 0x0000FFFF) {
2010-08-08 10:06:34 +02:00
/* 1110xxxx 10xxxxxx 10xxxxxx */
utf8_bytes = 3;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)((ucs4_character >> 12) | B11100000);
utf8_character[1] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
utf8_character[2] = (char)((ucs4_character & B00111111) | B10000000);
} else if (ucs4_character <= 0x001FFFFF) {
2010-08-08 10:06:34 +02:00
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
utf8_bytes = 4;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)((ucs4_character >> 18) | B11110000);
utf8_character[1] =
(char)(((ucs4_character >> 12) & B00111111) | B10000000);
utf8_character[2] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
utf8_character[3] = (char)((ucs4_character & B00111111) | B10000000);
} else if (ucs4_character <= 0x03FFFFFF) {
2010-08-08 10:06:34 +02:00
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
utf8_bytes = 5;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)((ucs4_character >> 24) | B11111000);
utf8_character[1] =
(char)(((ucs4_character >> 18) & B00111111) | B10000000);
utf8_character[2] =
(char)(((ucs4_character >> 12) & B00111111) | B10000000);
utf8_character[3] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
utf8_character[4] = (char)((ucs4_character & B00111111) | B10000000);
} else if (ucs4_character <= 0x7FFFFFFF) {
2010-08-08 10:06:34 +02:00
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
utf8_bytes = 6;
2011-03-07 08:02:35 +01:00
utf8_character[0] = (char)((ucs4_character >> 30) | B11111100);
utf8_character[1] =
(char)(((ucs4_character >> 24) & B00111111) | B10000000);
utf8_character[2] =
(char)(((ucs4_character >> 18) & B00111111) | B10000000);
utf8_character[3] =
(char)(((ucs4_character >> 12) & B00111111) | B10000000);
utf8_character[4] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
utf8_character[5] = (char)((ucs4_character & B00111111) | B10000000);
} else {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
*size = utf8_bytes;
return 0;
}
/* Convert a UTF-8 encoded character to UCS-4. */
int
2011-03-07 08:02:35 +01:00
unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string,
size_t * length)
2010-08-08 10:06:34 +02:00
{
utf8_t utf8_character = utf8_string[0];
/* Is the character in the ASCII range? If so, just copy it to the
2011-03-07 08:02:35 +01:00
output. */
if (~utf8_character & 0x80) {
2010-08-08 10:06:34 +02:00
*ucs4_character = utf8_character;
*length = 1;
2011-03-07 08:02:35 +01:00
} else if ((utf8_character & 0xE0) == 0xC0) {
2010-08-08 10:06:34 +02:00
/* A two-byte UTF-8 sequence. Make sure the other byte is good. */
2011-03-07 08:02:35 +01:00
if (utf8_string[1] != '\0' && (utf8_string[1] & 0xC0) != 0x80) {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
*ucs4_character =
2011-03-07 08:02:35 +01:00
((utf8_string[1] & 0x3F) << 0) + ((utf8_character & 0x1F) << 6);
2010-08-08 10:06:34 +02:00
*length = 2;
2011-03-07 08:02:35 +01:00
} else if ((utf8_character & 0xF0) == 0xE0) {
2010-08-08 10:06:34 +02:00
/* A three-byte UTF-8 sequence. Make sure the other bytes are
2011-03-07 08:02:35 +01:00
good. */
2010-08-08 10:06:34 +02:00
if ((utf8_string[1] != '\0') &&
(utf8_string[1] & 0xC0) != 0x80 &&
2011-03-07 08:02:35 +01:00
(utf8_string[2] != '\0') && (utf8_string[2] & 0xC0) != 0x80) {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
2011-03-07 08:02:35 +01:00
*ucs4_character =
2010-08-08 10:06:34 +02:00
((utf8_string[2] & 0x3F) << 0) +
2011-03-07 08:02:35 +01:00
((utf8_string[1] & 0x3F) << 6) + ((utf8_character & 0x0F) << 12);
2010-08-08 10:06:34 +02:00
*length = 3;
2011-03-07 08:02:35 +01:00
} else if ((utf8_character & 0xF8) == 0xF0) {
2010-08-08 10:06:34 +02:00
/* A four-byte UTF-8 sequence. Make sure the other bytes are
2011-03-07 08:02:35 +01:00
good. */
2010-08-08 10:06:34 +02:00
if ((utf8_string[1] != '\0') &&
(utf8_string[1] & 0xC0) != 0x80 &&
(utf8_string[2] != '\0') &&
(utf8_string[2] & 0xC0) != 0x80 &&
2011-03-07 08:02:35 +01:00
(utf8_string[3] != '\0') && (utf8_string[3] & 0xC0) != 0x80) {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
2011-03-07 08:02:35 +01:00
*ucs4_character =
2010-08-08 10:06:34 +02:00
((utf8_string[3] & 0x3F) << 0) +
((utf8_string[2] & 0x3F) << 6) +
2011-03-07 08:02:35 +01:00
((utf8_string[1] & 0x3F) << 12) + ((utf8_character & 0x07) << 18);
2010-08-08 10:06:34 +02:00
*length = 4;
2011-03-07 08:02:35 +01:00
} else if ((utf8_character & 0xFC) == 0xF8) {
2010-08-08 10:06:34 +02:00
/* A five-byte UTF-8 sequence. Make sure the other bytes are
2011-03-07 08:02:35 +01:00
good. */
2010-08-08 10:06:34 +02:00
if ((utf8_string[1] != '\0') &&
(utf8_string[1] & 0xC0) != 0x80 &&
(utf8_string[2] != '\0') &&
(utf8_string[2] & 0xC0) != 0x80 &&
(utf8_string[3] != '\0') &&
(utf8_string[3] & 0xC0) != 0x80 &&
2011-03-07 08:02:35 +01:00
(utf8_string[4] != '\0') && (utf8_string[4] & 0xC0) != 0x80) {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
2011-03-07 08:02:35 +01:00
*ucs4_character =
2010-08-08 10:06:34 +02:00
((utf8_string[4] & 0x3F) << 0) +
((utf8_string[3] & 0x3F) << 6) +
((utf8_string[2] & 0x3F) << 12) +
2011-03-07 08:02:35 +01:00
((utf8_string[1] & 0x3F) << 18) + ((utf8_character & 0x03) << 24);
2010-08-08 10:06:34 +02:00
*length = 5;
2011-03-07 08:02:35 +01:00
} else if ((utf8_character & 0xFE) == 0xFC) {
2010-08-08 10:06:34 +02:00
/* A six-byte UTF-8 sequence. Make sure the other bytes are
2011-03-07 08:02:35 +01:00
good. */
2010-08-08 10:06:34 +02:00
if ((utf8_string[1] != '\0') &&
(utf8_string[1] & 0xC0) != 0x80 &&
(utf8_string[2] != '\0') &&
(utf8_string[2] & 0xC0) != 0x80 &&
(utf8_string[3] != '\0') &&
(utf8_string[3] & 0xC0) != 0x80 &&
(utf8_string[4] != '\0') &&
(utf8_string[4] & 0xC0) != 0x80 &&
2011-03-07 08:02:35 +01:00
(utf8_string[5] != '\0') && (utf8_string[5] & 0xC0) != 0x80) {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
2011-03-07 08:02:35 +01:00
*ucs4_character =
2010-08-08 10:06:34 +02:00
((utf8_string[5] & 0x3F) << 0) +
((utf8_string[4] & 0x3F) << 6) +
((utf8_string[3] & 0x3F) << 12) +
((utf8_string[2] & 0x3F) << 18) +
2011-03-07 08:02:35 +01:00
((utf8_string[1] & 0x3F) << 24) + ((utf8_character & 0x01) << 30);
2010-08-08 10:06:34 +02:00
*length = 6;
2011-03-07 08:02:35 +01:00
} else {
2010-08-08 10:06:34 +02:00
return EILSEQ;
}
return 0;
}
/** Convert a UTF-8 encoded character to CP437. */
int
2011-03-07 08:02:35 +01:00
unicode_utf8_to_cp437(char *cp_character, const utf8_t * utf8_string,
size_t * length)
2010-08-08 10:06:34 +02:00
{
ucs4_t ucs4_character;
int result;
result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
2011-03-07 08:02:35 +01:00
if (result != 0) {
2010-08-08 10:06:34 +02:00
/* pass decoding characters upstream */
return result;
}
2011-03-07 08:02:35 +01:00
if (ucs4_character < 0x7F) {
2010-08-08 10:06:34 +02:00
*cp_character = (char)ucs4_character;
} else {
2011-03-07 08:02:35 +01:00
struct {
ucs4_t ucs4;
unsigned char cp437;
} xref[160] = {
{
0x00A0, 255}, {
0x00A1, 173}, {
0x00A2, 155}, {
0x00A3, 156}, {
0x00A5, 157}, {
0x00A7, 21}, {
0x00AA, 166}, {
0x00AB, 174}, {
0x00AC, 170}, {
0x00B0, 248}, {
0x00B1, 241}, {
0x00B2, 253}, {
0x00B5, 230}, {
0x00B6, 20}, {
0x00B7, 250}, {
0x00BA, 167}, {
0x00BB, 175}, {
0x00BC, 172}, {
0x00BD, 171}, {
0x00BF, 168}, {
0x00C4, 142}, {
0x00C5, 143}, {
0x00C6, 146}, {
0x00C7, 128}, {
0x00C9, 144}, {
0x00D1, 165}, {
0x00D6, 153}, {
0x00DC, 154}, {
0x00DF, 225}, {
0x00E0, 133}, {
0x00E1, 160}, {
0x00E2, 131}, {
0x00E4, 132}, {
0x00E5, 134}, {
0x00E6, 145}, {
0x00E7, 135}, {
0x00E8, 138}, {
0x00E9, 130}, {
0x00EA, 136}, {
0x00EB, 137}, {
0x00EC, 141}, {
0x00ED, 161}, {
0x00EE, 140}, {
0x00EF, 139}, {
0x00F1, 164}, {
0x00F2, 149}, {
0x00F3, 162}, {
0x00F4, 147}, {
0x00F6, 148}, {
0x00F7, 246}, {
0x00F9, 151}, {
0x00FA, 163}, {
0x00FB, 150}, {
0x00FC, 129}, {
0x00FF, 152}, {
0x0192, 159}, {
0x0393, 226}, {
0x0398, 233}, {
0x03A3, 228}, {
0x03A6, 232}, {
0x03A9, 234}, {
0x03B1, 224}, {
0x03B4, 235}, {
0x03B5, 238}, {
0x03C0, 227}, {
0x03C3, 229}, {
0x03C4, 231}, {
0x03C6, 237}, {
0x2022, 7}, {
0x203C, 19}, {
0x207F, 252}, {
0x20A7, 158}, {
0x2190, 27}, {
0x2191, 24}, {
0x2192, 26}, {
0x2193, 25}, {
0x2194, 29}, {
0x2195, 18}, {
0x21A8, 23}, {
0x2219, 249}, {
0x221A, 251}, {
0x221E, 236}, {
0x221F, 28}, {
0x2229, 239}, {
0x2248, 247}, {
0x2261, 240}, {
0x2264, 243}, {
0x2265, 242}, {
0x2302, 127}, {
0x2310, 169}, {
0x2320, 244}, {
0x2321, 245}, {
0x2500, 196}, {
0x2502, 179}, {
0x250C, 218}, {
0x2510, 191}, {
0x2514, 192}, {
0x2518, 217}, {
0x251C, 195}, {
0x2524, 180}, {
0x252C, 194}, {
0x2534, 193}, {
0x253C, 197}, {
0x2550, 205}, {
0x2551, 186}, {
0x2552, 213}, {
0x2553, 214}, {
0x2554, 201}, {
0x2555, 184}, {
0x2556, 183}, {
0x2557, 187}, {
0x2558, 212}, {
0x2559, 211}, {
0x255A, 200}, {
0x255B, 190}, {
0x255C, 189}, {
0x255D, 188}, {
0x255E, 198}, {
0x255F, 199}, {
0x2560, 204}, {
0x2561, 181}, {
0x2562, 182}, {
0x2563, 185}, {
0x2564, 209}, {
0x2565, 210}, {
0x2566, 203}, {
0x2567, 207}, {
0x2568, 208}, {
0x2569, 202}, {
0x256A, 216}, {
0x256B, 215}, {
0x256C, 206}, {
0x2580, 223}, {
0x2584, 220}, {
0x2588, 219}, {
0x258C, 221}, {
0x2590, 222}, {
0x2591, 176}, {
0x2592, 177}, {
0x2593, 178}, {
0x25A0, 254}, {
0x25AC, 22}, {
0x25B2, 30}, {
0x25BA, 16}, {
0x25BC, 31}, {
0x25C4, 17}, {
0x25CB, 9}, {
0x25D8, 8}, {
0x25D9, 10}, {
0x263A, 1}, {
0x263B, 2}, {
0x263C, 15}, {
0x2640, 12}, {
0x2642, 11}, {
0x2660, 6}, {
0x2663, 5}, {
0x2665, 3}, {
0x2666, 4}, {
0x266A, 13}, {
0x266B, 14}
2010-08-08 10:06:34 +02:00
};
2011-03-07 08:02:35 +01:00
int l = 0, r = 160;
while (l != r) {
int m = (l + r) / 2;
if (xref[m].ucs4 == ucs4_character) {
2010-08-08 10:06:34 +02:00
*cp_character = (char)xref[m].cp437;
break;
2011-03-07 08:02:35 +01:00
} else if (xref[m].ucs4 < ucs4_character)
l = m;
else
r = m;
2010-08-08 10:06:34 +02:00
}
2011-03-07 08:02:35 +01:00
if (l == r) {
2010-08-08 10:06:34 +02:00
*cp_character = '?';
}
}
return 0;
}
/** Convert a UTF-8 encoded character to CP1252. */
int
2011-03-07 08:02:35 +01:00
unicode_utf8_to_cp1252(char *cp_character, const utf8_t * utf8_string,
size_t * length)
2010-08-08 10:06:34 +02:00
{
ucs4_t ucs4_character;
int result;
result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
2011-03-07 08:02:35 +01:00
if (result != 0) {
2010-08-08 10:06:34 +02:00
/* pass decoding characters upstream */
return result;
}
2011-03-07 08:02:35 +01:00
if (ucs4_character <= 0x7F || ucs4_character >= 0xA0) {
2010-08-08 10:06:34 +02:00
*cp_character = (char)ucs4_character;
} else {
2011-03-07 08:02:35 +01:00
struct {
ucs4_t ucs4;
unsigned char cp;
} xref[] = {
{
0x20ac, 0x80}, {
0x0081, 0x81}, {
0x201a, 0x82}, {
0x0192, 0x83}, {
0x201e, 0x84}, {
0x2026, 0x85}, {
0x2020, 0x86}, {
0x2021, 0x87}, {
0x02c6, 0x88}, {
0x2030, 0x89}, {
0x0160, 0x8a}, {
0x2039, 0x8b}, {
0x0152, 0x8c}, {
0x008d, 0x8d}, {
0x017d, 0x8e}, {
0x008f, 0x8f}, {
0x0090, 0x90}, {
0x2018, 0x91}, {
0x2019, 0x92}, {
0x201c, 0x93}, {
0x201d, 0x94}, {
0x2022, 0x95}, {
0x2013, 0x96}, {
0x2014, 0x97}, {
0x02dc, 0x98}, {
0x2122, 0x99}, {
0x0161, 0x9a}, {
0x203a, 0x9b}, {
0x0153, 0x9c}, {
0x009d, 0x9d}, {
0x017e, 0x9e}, {
0x0178, 0x9f}
2010-08-08 10:06:34 +02:00
};
2011-03-07 08:02:35 +01:00
int l = 0, r = sizeof(xref) / sizeof(xref[0]);
while (l != r) {
int m = (l + r) / 2;
if (xref[m].ucs4 == ucs4_character) {
2010-08-08 10:06:34 +02:00
*cp_character = (char)xref[m].cp;
break;
2011-03-07 08:02:35 +01:00
} else if (xref[m].ucs4 < ucs4_character)
l = m;
else
r = m;
2010-08-08 10:06:34 +02:00
}
2011-03-07 08:02:35 +01:00
if (l == r) {
2010-08-08 10:06:34 +02:00
*cp_character = '?';
}
}
return 0;
}