2007-06-26 02:22:31 +02:00
|
|
|
/* vi: set ts=2:
|
|
|
|
* +-------------------+ Christian Schlittchen <corwin@amber.kn-bremen.de>
|
|
|
|
* | | Enno Rehling <enno@eressea.de>
|
|
|
|
* | Eressea PBEM host | Katja Zedel <katze@felidae.kn-bremen.de>
|
|
|
|
* | (c) 1998 - 2007 |
|
|
|
|
* | | This program may not be used, modified or distributed
|
|
|
|
* +-------------------+ without prior permission by the authors of Eressea.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
#include "unicode.h"
|
|
|
|
|
|
|
|
#include <errno.h>
|
2007-08-18 16:54:35 +02:00
|
|
|
#include <wctype.h>
|
2007-06-26 02:22:31 +02:00
|
|
|
|
2007-08-09 08:42:37 +02:00
|
|
|
/* Case-insensitive comparison of two NUL-terminated UTF-8 strings.
 *
 * Both strings are walked one code point at a time. Bytes without the high
 * bit are taken as-is; anything else is decoded with unicode_utf8_to_ucs4().
 * Code points that differ are folded with towlower() before being ordered.
 *
 * Returns 0 when the strings match, 1 when `a` orders after `b`, and -1
 * when `a` orders before `b`. -1 is also returned as soon as the decoder
 * rejects a sequence in either string, so that case cannot be told apart
 * from "a < b" by the caller.
 */
int
unicode_utf8_strcasecmp(const char * a, const char * b)
{
  while (*a != '\0' && *b != '\0') {
    size_t consumed;
    wint_t left = *a;
    wint_t right = *b;

    /* Fetch the next code point from a. */
    if ((left & 0x80) != 0) {
      if (unicode_utf8_to_ucs4(&left, a, &consumed) != 0) {
        return -1;
      }
      a += consumed;
    } else {
      a += 1;
    }

    /* Fetch the next code point from b. */
    if ((right & 0x80) != 0) {
      if (unicode_utf8_to_ucs4(&right, b, &consumed) != 0) {
        return -1;
      }
      b += consumed;
    } else {
      b += 1;
    }

    /* Only pay for case folding when the raw code points differ. */
    if (left != right) {
      right = towlower(right);
      left = towlower(left);
      if (left != right) {
        return (right < left) ? 1 : -1;
      }
    }
  }

  /* One string ran out first (or both did): the shorter one orders first. */
  if (*b != '\0') {
    return -1;
  }
  return (*a != '\0') ? 1 : 0;
}
|
|
|
|
|
2007-06-26 02:22:31 +02:00
|
|
|
/* Convert a UTF-8 encoded character to UCS-4.
 *
 * ucs4_character  receives the decoded code point on success.
 * utf8_string     points at the first byte of the sequence to decode; it
 *                 must be NUL-terminated.
 * length          receives the number of bytes consumed (1 to 6) on success.
 *
 * Returns 0 on success and EILSEQ when the lead byte is not a valid
 * sequence start or when any trailing byte is not a continuation byte
 * (10xxxxxx). On failure neither output is written.
 *
 * The legacy five- and six-byte forms are still accepted, and overlong
 * encodings are not rejected.
 */
int
unicode_utf8_to_ucs4(wint_t *ucs4_character, const char *utf8_string,
  size_t *length)
{
  unsigned char lead = (unsigned char)utf8_string[0];
  unsigned long code_point;
  size_t sequence_length;
  size_t i;

  /* Classify the lead byte: it fixes the sequence length and contributes
     the most significant payload bits. */
  if ((lead & 0x80) == 0x00) {
    /* Plain ASCII: the byte is the code point. */
    sequence_length = 1;
    code_point = lead;
  } else if ((lead & 0xE0) == 0xC0) {
    sequence_length = 2;
    code_point = lead & 0x1F;
  } else if ((lead & 0xF0) == 0xE0) {
    sequence_length = 3;
    code_point = lead & 0x0F;
  } else if ((lead & 0xF8) == 0xF0) {
    sequence_length = 4;
    code_point = lead & 0x07;
  } else if ((lead & 0xFC) == 0xF8) {
    sequence_length = 5;
    code_point = lead & 0x03;
  } else if ((lead & 0xFE) == 0xFC) {
    sequence_length = 6;
    code_point = lead & 0x01;
  } else {
    /* A stray continuation byte, or 0xFE / 0xFF. */
    return EILSEQ;
  }

  /* Every trailing byte must look like 10xxxxxx. The NUL terminator fails
     this test too, so a truncated sequence is rejected and the scan never
     reads beyond the end of the string. */
  for (i = 1; i < sequence_length; ++i) {
    unsigned char trail = (unsigned char)utf8_string[i];

    if ((trail & 0xC0) != 0x80) {
      return EILSEQ;
    }
    code_point = (code_point << 6) | (unsigned long)(trail & 0x3F);
  }

  *ucs4_character = (wint_t)code_point;
  *length = sequence_length;
  return 0;
}
|