forked from github/server
use wint_t, not long.
This commit is contained in:
parent
0fef06a661
commit
55299346f8
7 changed files with 147 additions and 104 deletions
|
@ -177,11 +177,11 @@ static int count_umlaut(const char *s)
|
||||||
int result = 0;
|
int result = 0;
|
||||||
const char *cp;
|
const char *cp;
|
||||||
for (cp = s; *cp; ++cp) {
|
for (cp = s; *cp; ++cp) {
|
||||||
ucs4_t ucs = *cp;
|
wint_t wc = *cp;
|
||||||
if (ucs & 0x80) {
|
if (wc & 0x80) {
|
||||||
size_t size;
|
size_t size;
|
||||||
int err;
|
int err;
|
||||||
err = unicode_utf8_to_ucs4(&ucs, cp, &size);
|
err = unicode_utf8_decode(&wc, cp, &size);
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
log_error("illegal utf8 encoding %s at %s", s, cp);
|
log_error("illegal utf8 encoding %s at %s", s, cp);
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -26,12 +26,12 @@ static int eatwhite(const char *ptr, size_t * total_size)
|
||||||
*total_size = 0;
|
*total_size = 0;
|
||||||
|
|
||||||
while (*ptr) {
|
while (*ptr) {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
ret = unicode_utf8_to_ucs4(&ucs, ptr, &size);
|
ret = unicode_utf8_decode(&wc, ptr, &size);
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
break;
|
break;
|
||||||
if (!iswspace((wint_t)ucs))
|
if (!iswspace(wc))
|
||||||
break;
|
break;
|
||||||
*total_size += size;
|
*total_size += size;
|
||||||
ptr += size;
|
ptr += size;
|
||||||
|
@ -86,7 +86,7 @@ static const char *getbuf_utf8(FILE * F)
|
||||||
}
|
}
|
||||||
cont = false;
|
cont = false;
|
||||||
while (*bp && cp < fbuf + MAXLINE) {
|
while (*bp && cp < fbuf + MAXLINE) {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t size;
|
size_t size;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -119,14 +119,14 @@ static const char *getbuf_utf8(FILE * F)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = unicode_utf8_to_ucs4(&ucs, bp, &size);
|
ret = unicode_utf8_decode(&wc, bp, &size);
|
||||||
|
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
unicode_warning(bp);
|
unicode_warning(bp);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (iswspace((wint_t)ucs)) {
|
if (iswspace(wc)) {
|
||||||
if (!quote) {
|
if (!quote) {
|
||||||
bp += size;
|
bp += size;
|
||||||
ret = eatwhite(bp, &size);
|
ret = eatwhite(bp, &size);
|
||||||
|
@ -151,7 +151,7 @@ static const char *getbuf_utf8(FILE * F)
|
||||||
bp += size;
|
bp += size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (iswcntrl((wint_t)ucs)) {
|
else if (iswcntrl(wc)) {
|
||||||
if (!comment && cp < fbuf + MAXLINE) {
|
if (!comment && cp < fbuf + MAXLINE) {
|
||||||
*cp++ = '?';
|
*cp++ = '?';
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@ static parse_state *states;
|
||||||
static int eatwhitespace_c(const char **str_p)
|
static int eatwhitespace_c(const char **str_p)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t len;
|
size_t len;
|
||||||
const char *str = *str_p;
|
const char *str = *str_p;
|
||||||
|
|
||||||
|
@ -40,12 +40,12 @@ static int eatwhitespace_c(const char **str_p)
|
||||||
++str;
|
++str;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
ret = unicode_utf8_decode(&wc, str, &len);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
log_warning("illegal character sequence in UTF8 string: %s\n", str);
|
log_warning("illegal character sequence in UTF8 string: %s\n", str);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!iswspace((wint_t)ucs))
|
if (!iswspace(wc))
|
||||||
break;
|
break;
|
||||||
str += len;
|
str += len;
|
||||||
}
|
}
|
||||||
|
@ -106,16 +106,16 @@ void skip_token(void)
|
||||||
eatwhitespace_c(&states->current_token);
|
eatwhitespace_c(&states->current_token);
|
||||||
|
|
||||||
while (*states->current_token) {
|
while (*states->current_token) {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t len;
|
size_t len;
|
||||||
|
|
||||||
unsigned char utf8_character = (unsigned char)states->current_token[0];
|
unsigned char utf8_character = (unsigned char)states->current_token[0];
|
||||||
if (~utf8_character & 0x80) {
|
if (~utf8_character & 0x80) {
|
||||||
ucs = utf8_character;
|
wc = utf8_character;
|
||||||
++states->current_token;
|
++states->current_token;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, states->current_token, &len);
|
int ret = unicode_utf8_decode(&wc, states->current_token, &len);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
states->current_token += len;
|
states->current_token += len;
|
||||||
}
|
}
|
||||||
|
@ -123,7 +123,7 @@ void skip_token(void)
|
||||||
log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
|
log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (iswspace((wint_t)ucs) && quotechar == 0) {
|
if (iswspace(wc) && quotechar == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -160,17 +160,17 @@ char *parse_token(const char **str, char *lbuf, size_t buflen)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
while (*ctoken) {
|
while (*ctoken) {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t len;
|
size_t len;
|
||||||
bool copy = false;
|
bool copy = false;
|
||||||
|
|
||||||
unsigned char utf8_character = *(unsigned char *)ctoken;
|
unsigned char utf8_character = *(unsigned char *)ctoken;
|
||||||
if (~utf8_character & 0x80) {
|
if (~utf8_character & 0x80) {
|
||||||
ucs = utf8_character;
|
wc = utf8_character;
|
||||||
len = 1;
|
len = 1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
|
int ret = unicode_utf8_decode(&wc, ctoken, &len);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
|
log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
|
||||||
break;
|
break;
|
||||||
|
@ -180,7 +180,7 @@ char *parse_token(const char **str, char *lbuf, size_t buflen)
|
||||||
copy = true;
|
copy = true;
|
||||||
escape = false;
|
escape = false;
|
||||||
}
|
}
|
||||||
else if (iswspace((wint_t)ucs)) {
|
else if (iswspace(wc)) {
|
||||||
if (quotechar == 0)
|
if (quotechar == 0)
|
||||||
break;
|
break;
|
||||||
copy = true;
|
copy = true;
|
||||||
|
|
|
@ -32,7 +32,7 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
typedef struct tref {
|
typedef struct tref {
|
||||||
struct tref *nexthash;
|
struct tref *nexthash;
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
struct tnode *node;
|
struct tnode *node;
|
||||||
} tref;
|
} tref;
|
||||||
|
|
||||||
|
@ -99,8 +99,8 @@ char * transliterate(char * out, size_t size, const char * in)
|
||||||
size -= advance;
|
size -= advance;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, src, &len);
|
int ret = unicode_utf8_decode(&wc, src, &len);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
/* encoding is broken. yikes */
|
/* encoding is broken. yikes */
|
||||||
log_error("transliterate | encoding error in '%s'\n", src);
|
log_error("transliterate | encoding error in '%s'\n", src);
|
||||||
|
@ -127,7 +127,7 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
{
|
{
|
||||||
tnode * tk;
|
tnode * tk;
|
||||||
static const struct replace {
|
static const struct replace {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
const char str[3];
|
const char str[3];
|
||||||
} replace[] = {
|
} replace[] = {
|
||||||
/* match lower-case (!) umlauts and others to transcriptions */
|
/* match lower-case (!) umlauts and others to transcriptions */
|
||||||
|
@ -150,10 +150,10 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
else {
|
else {
|
||||||
tref *next;
|
tref *next;
|
||||||
int ret, index, i = 0;
|
int ret, index, i = 0;
|
||||||
ucs4_t ucs, lcs;
|
wint_t ucs, lcs;
|
||||||
size_t len;
|
size_t len;
|
||||||
|
|
||||||
ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
ret = unicode_utf8_decode(&ucs, str, &len);
|
||||||
assert(ret == 0 || !"invalid utf8 string");
|
assert(ret == 0 || !"invalid utf8 string");
|
||||||
lcs = ucs;
|
lcs = ucs;
|
||||||
|
|
||||||
|
@ -166,7 +166,7 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
next = tk->next[index];
|
next = tk->next[index];
|
||||||
if (!(tk->flags & LEAF))
|
if (!(tk->flags & LEAF))
|
||||||
tk->id = id;
|
tk->id = id;
|
||||||
while (next && next->ucs != ucs)
|
while (next && next->wc != ucs)
|
||||||
next = next->nexthash;
|
next = next->nexthash;
|
||||||
if (!next) {
|
if (!next) {
|
||||||
tref *ref;
|
tref *ref;
|
||||||
|
@ -181,7 +181,7 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
|
|
||||||
ref = (tref *)malloc(sizeof(tref));
|
ref = (tref *)malloc(sizeof(tref));
|
||||||
if (!ref) abort();
|
if (!ref) abort();
|
||||||
ref->ucs = ucs;
|
ref->wc = ucs;
|
||||||
ref->node = node;
|
ref->node = node;
|
||||||
ref->nexthash = tk->next[index];
|
ref->nexthash = tk->next[index];
|
||||||
tk->next[index] = ref;
|
tk->next[index] = ref;
|
||||||
|
@ -195,7 +195,7 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
#endif
|
#endif
|
||||||
ref = (tref *)malloc(sizeof(tref));
|
ref = (tref *)malloc(sizeof(tref));
|
||||||
assert_alloc(ref);
|
assert_alloc(ref);
|
||||||
ref->ucs = lcs;
|
ref->wc = lcs;
|
||||||
ref->node = node;
|
ref->node = node;
|
||||||
++node->refcount;
|
++node->refcount;
|
||||||
ref->nexthash = tk->next[index];
|
ref->nexthash = tk->next[index];
|
||||||
|
@ -211,7 +211,7 @@ void addtoken(tnode ** root, const char *str, variant id)
|
||||||
}
|
}
|
||||||
addtoken(&next->node, str + len, id);
|
addtoken(&next->node, str + len, id);
|
||||||
while (replace[i].str[0]) {
|
while (replace[i].str[0]) {
|
||||||
if (lcs == replace[i].ucs) {
|
if (lcs == replace[i].wc) {
|
||||||
char zText[1024];
|
char zText[1024];
|
||||||
memcpy(zText, replace[i].str, 3);
|
memcpy(zText, replace[i].str, 3);
|
||||||
str_strlcpy(zText + 2, (const char *)str + len, sizeof(zText)-2);
|
str_strlcpy(zText + 2, (const char *)str + len, sizeof(zText)-2);
|
||||||
|
@ -255,9 +255,9 @@ int findtoken(const void * root, const char *key, variant * result)
|
||||||
do {
|
do {
|
||||||
int index;
|
int index;
|
||||||
const tref *ref;
|
const tref *ref;
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t len;
|
size_t len;
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
int ret = unicode_utf8_decode(&wc, str, &len);
|
||||||
|
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
/* encoding is broken. youch */
|
/* encoding is broken. youch */
|
||||||
|
@ -265,12 +265,12 @@ int findtoken(const void * root, const char *key, variant * result)
|
||||||
return E_TOK_NOMATCH;
|
return E_TOK_NOMATCH;
|
||||||
}
|
}
|
||||||
#if NODEHASHSIZE == 8
|
#if NODEHASHSIZE == 8
|
||||||
index = ucs & 7;
|
index = wc & 7;
|
||||||
#else
|
#else
|
||||||
index = ucs % NODEHASHSIZE;
|
index = wc % NODEHASHSIZE;
|
||||||
#endif
|
#endif
|
||||||
ref = tk->next[index];
|
ref = tk->next[index];
|
||||||
while (ref && ref->ucs != ucs)
|
while (ref && ref->wc != wc)
|
||||||
ref = ref->nexthash;
|
ref = ref->nexthash;
|
||||||
str += len;
|
str += len;
|
||||||
if (!ref) {
|
if (!ref) {
|
||||||
|
|
|
@ -33,18 +33,18 @@
|
||||||
#define B00000011 0x03
|
#define B00000011 0x03
|
||||||
#define B00000001 0x01
|
#define B00000001 0x01
|
||||||
|
|
||||||
int unicode_utf8_trim(utf8_t *buf)
|
size_t unicode_utf8_trim(char *buf)
|
||||||
{
|
{
|
||||||
int result = 0, ts = 0;
|
int result = 0, ts = 0;
|
||||||
utf8_t *op = buf, *ip = buf, *lc = buf;
|
char *op = buf, *ip = buf, *lc = buf;
|
||||||
assert(buf);
|
assert(buf);
|
||||||
while (*ip) {
|
while (*ip) {
|
||||||
size_t size = 1;
|
size_t size = 1;
|
||||||
wint_t wc = *ip;
|
wint_t wc = *ip;
|
||||||
if (wc & 0x80) {
|
if (wc & 0x80) {
|
||||||
ucs4_t ucs = 0;
|
wint_t ucs = 0;
|
||||||
if (ip[1]) {
|
if (ip[1]) {
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, ip, &size);
|
int ret = unicode_utf8_decode(&ucs, ip, &size);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -56,22 +56,24 @@ int unicode_utf8_trim(utf8_t *buf)
|
||||||
++result;
|
++result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (op == buf && iswspace(wc)) {
|
if (op == buf && (iswcntrl(wc) || iswspace(wc))) {
|
||||||
++result;
|
result += size;
|
||||||
}
|
}
|
||||||
else if (wc>255 || !iscntrl(wc)) {
|
else if (wc>255 || !iscntrl(wc)) {
|
||||||
if (op != ip) {
|
if (op != ip) {
|
||||||
memmove(op, ip, size);
|
memmove(op, ip, size);
|
||||||
}
|
}
|
||||||
op += size;
|
op += size;
|
||||||
if (iswspace(wc)) ++ts;
|
if (iswcntrl(wc) || iswspace(wc)) {
|
||||||
|
ts += size;
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
lc = op;
|
lc = op;
|
||||||
ts = 0;
|
ts = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
++result;
|
result += size;
|
||||||
}
|
}
|
||||||
ip += size;
|
ip += size;
|
||||||
}
|
}
|
||||||
|
@ -79,15 +81,15 @@ int unicode_utf8_trim(utf8_t *buf)
|
||||||
return result + ts;
|
return result + ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip)
|
int unicode_utf8_tolower(char * op, size_t outlen, const char * ip)
|
||||||
{
|
{
|
||||||
while (*ip) {
|
while (*ip) {
|
||||||
ucs4_t ucs = *ip;
|
wint_t ucs = *ip;
|
||||||
ucs4_t low;
|
wint_t low;
|
||||||
size_t size = 1;
|
size_t size = 1;
|
||||||
|
|
||||||
if (ucs & 0x80) {
|
if (ucs & 0x80) {
|
||||||
int ret = unicode_utf8_to_ucs4(&ucs, ip, &size);
|
int ret = unicode_utf8_decode(&ucs, ip, &size);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -104,7 +106,7 @@ int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ip += size;
|
ip += size;
|
||||||
unicode_ucs4_to_utf8(op, &size, low);
|
unicode_utf8_encode(op, &size, low);
|
||||||
op += size;
|
op += size;
|
||||||
outlen -= size;
|
outlen -= size;
|
||||||
}
|
}
|
||||||
|
@ -114,7 +116,7 @@ int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip)
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in,
|
unicode_latin1_to_utf8(char * dst, size_t * outlen, const char *in,
|
||||||
size_t * inlen)
|
size_t * inlen)
|
||||||
{
|
{
|
||||||
int is = (int)*inlen;
|
int is = (int)*inlen;
|
||||||
|
@ -148,15 +150,15 @@ unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in,
|
||||||
return (int)*outlen;
|
return (int)*outlen;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b)
|
int unicode_utf8_strcasecmp(const char * a, const char *b)
|
||||||
{
|
{
|
||||||
while (*a && *b) {
|
while (*a && *b) {
|
||||||
int ret;
|
int ret;
|
||||||
size_t size;
|
size_t size;
|
||||||
ucs4_t ucsa = *a, ucsb = *b;
|
wint_t ucsa = *a, ucsb = *b;
|
||||||
|
|
||||||
if (ucsa & 0x80) {
|
if (ucsa & 0x80) {
|
||||||
ret = unicode_utf8_to_ucs4(&ucsa, a, &size);
|
ret = unicode_utf8_decode(&ucsa, a, &size);
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return -1;
|
return -1;
|
||||||
a += size;
|
a += size;
|
||||||
|
@ -164,7 +166,7 @@ int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b)
|
||||||
else
|
else
|
||||||
++a;
|
++a;
|
||||||
if (ucsb & 0x80) {
|
if (ucsb & 0x80) {
|
||||||
ret = unicode_utf8_to_ucs4(&ucsb, b, &size);
|
ret = unicode_utf8_decode(&ucsb, b, &size);
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return -1;
|
return -1;
|
||||||
b += size;
|
b += size;
|
||||||
|
@ -188,10 +190,10 @@ int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert a UCS-4 character to UTF-8. */
|
/* Convert a wide character to UTF-8. */
|
||||||
int
|
int
|
||||||
unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size,
|
unicode_utf8_encode(char * utf8_character, size_t * size,
|
||||||
ucs4_t ucs4_character)
|
wint_t ucs4_character)
|
||||||
{
|
{
|
||||||
int utf8_bytes;
|
int utf8_bytes;
|
||||||
|
|
||||||
|
@ -213,6 +215,7 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size,
|
||||||
utf8_character[1] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
|
utf8_character[1] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
|
||||||
utf8_character[2] = (char)((ucs4_character & B00111111) | B10000000);
|
utf8_character[2] = (char)((ucs4_character & B00111111) | B10000000);
|
||||||
}
|
}
|
||||||
|
#if 0
|
||||||
else if (ucs4_character <= 0x001FFFFF) {
|
else if (ucs4_character <= 0x001FFFFF) {
|
||||||
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||||
utf8_bytes = 4;
|
utf8_bytes = 4;
|
||||||
|
@ -246,6 +249,7 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size,
|
||||||
utf8_character[4] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
|
utf8_character[4] = (char)(((ucs4_character >> 6) & B00111111) | B10000000);
|
||||||
utf8_character[5] = (char)((ucs4_character & B00111111) | B10000000);
|
utf8_character[5] = (char)((ucs4_character & B00111111) | B10000000);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
else {
|
else {
|
||||||
return EILSEQ;
|
return EILSEQ;
|
||||||
}
|
}
|
||||||
|
@ -257,10 +261,10 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size,
|
||||||
|
|
||||||
/* Convert a UTF-8 encoded character to UCS-4. */
|
/* Convert a UTF-8 encoded character to a wide character. */
|
||||||
int
|
int
|
||||||
unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string,
|
unicode_utf8_decode(wint_t * ucs4_character, const char * utf8_string,
|
||||||
size_t * length)
|
size_t * length)
|
||||||
{
|
{
|
||||||
utf8_t utf8_character = utf8_string[0];
|
char utf8_character = utf8_string[0];
|
||||||
|
|
||||||
/* Is the character in the ASCII range? If so, just copy it to the
|
/* Is the character in the ASCII range? If so, just copy it to the
|
||||||
output. */
|
output. */
|
||||||
|
@ -361,13 +365,13 @@ unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string,
|
||||||
|
|
||||||
/** Convert a UTF-8 encoded character to CP437. */
|
/** Convert a UTF-8 encoded character to CP437. */
|
||||||
int
|
int
|
||||||
unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string,
|
unicode_utf8_to_cp437(unsigned char *cp_character, const char * utf8_string,
|
||||||
size_t * length)
|
size_t * length)
|
||||||
{
|
{
|
||||||
ucs4_t ucs4_character;
|
wint_t ucs4_character;
|
||||||
int result;
|
int result;
|
||||||
|
|
||||||
result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
|
result = unicode_utf8_decode(&ucs4_character, utf8_string, length);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
/* pass decoding errors upstream */
|
/* pass decoding errors upstream */
|
||||||
return result;
|
return result;
|
||||||
|
@ -378,7 +382,7 @@ unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string,
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
struct {
|
struct {
|
||||||
ucs4_t ucs4;
|
wint_t ucs4;
|
||||||
unsigned char cp437;
|
unsigned char cp437;
|
||||||
} xref[160] = {
|
} xref[160] = {
|
||||||
{ 0x00A0, 255 },
|
{ 0x00A0, 255 },
|
||||||
|
@ -566,7 +570,7 @@ unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string,
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Convert a UTF-8 encoded character to ASCII, with '?' replacements. */
|
/** Convert a UTF-8 encoded character to ASCII, with '?' replacements. */
|
||||||
int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string,
|
int unicode_utf8_to_ascii(unsigned char *cp_character, const char * utf8_string,
|
||||||
size_t *length)
|
size_t *length)
|
||||||
{
|
{
|
||||||
int result = unicode_utf8_to_cp437(cp_character, utf8_string, length);
|
int result = unicode_utf8_to_cp437(cp_character, utf8_string, length);
|
||||||
|
@ -579,13 +583,13 @@ int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_strin
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Convert a UTF-8 encoded character to CP1252. */
|
/** Convert a UTF-8 encoded character to CP1252. */
|
||||||
int unicode_utf8_to_cp1252(unsigned char *cp_character, const utf8_t * utf8_string,
|
int unicode_utf8_to_cp1252(unsigned char *cp_character, const char * utf8_string,
|
||||||
size_t * length)
|
size_t * length)
|
||||||
{
|
{
|
||||||
ucs4_t ucs4_character;
|
wint_t ucs4_character;
|
||||||
int result;
|
int result;
|
||||||
|
|
||||||
result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
|
result = unicode_utf8_decode(&ucs4_character, utf8_string, length);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
/* pass decoding errors upstream */
|
/* pass decoding errors upstream */
|
||||||
return result;
|
return result;
|
||||||
|
@ -596,7 +600,7 @@ int unicode_utf8_to_cp1252(unsigned char *cp_character, const utf8_t * utf8_stri
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
struct {
|
struct {
|
||||||
ucs4_t ucs4;
|
wint_t ucs4;
|
||||||
unsigned char cp;
|
unsigned char cp;
|
||||||
} xref[] = {
|
} xref[] = {
|
||||||
{ 0x0081, 0x81 },
|
{ 0x0081, 0x81 },
|
||||||
|
|
|
@ -19,30 +19,29 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
#ifndef _UNICODE_H
|
#ifndef _UNICODE_H
|
||||||
#define _UNICODE_H
|
#define _UNICODE_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <wchar.h>
|
|
||||||
#define USE_UNICODE
|
#define USE_UNICODE
|
||||||
typedef long ucs4_t;
|
int unicode_utf8_to_cp437(unsigned char *result, const char * utf8_string,
|
||||||
typedef char utf8_t;
|
|
||||||
|
|
||||||
int unicode_utf8_to_cp437(unsigned char *result, const utf8_t * utf8_string,
|
|
||||||
size_t * length);
|
size_t * length);
|
||||||
int unicode_utf8_to_cp1252(unsigned char *result, const utf8_t * utf8_string,
|
int unicode_utf8_to_cp1252(unsigned char *result, const char * utf8_string,
|
||||||
size_t * length);
|
size_t * length);
|
||||||
int unicode_utf8_to_ucs4(ucs4_t * result, const utf8_t * utf8_string,
|
int unicode_utf8_decode(wint_t * result, const char * utf8_string,
|
||||||
size_t * length);
|
size_t * length);
|
||||||
int unicode_ucs4_to_utf8(utf8_t * result, size_t * size,
|
int unicode_utf8_encode(char * result, size_t * size,
|
||||||
ucs4_t ucs4_character);
|
wint_t ucs4_character);
|
||||||
int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string,
|
int unicode_utf8_to_ascii(unsigned char *cp_character, const char * utf8_string,
|
||||||
size_t *length);
|
size_t *length);
|
||||||
int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t * b);
|
int unicode_utf8_strcasecmp(const char * a, const char * b);
|
||||||
int unicode_latin1_to_utf8(utf8_t * out, size_t * outlen,
|
int unicode_latin1_to_utf8(char * out, size_t * outlen,
|
||||||
const char *in, size_t * inlen);
|
const char *in, size_t * inlen);
|
||||||
int unicode_utf8_tolower(utf8_t *op, size_t outlen, const utf8_t *ip);
|
int unicode_utf8_tolower(char *op, size_t outlen, const char *ip);
|
||||||
int unicode_utf8_trim(utf8_t *ip);
|
size_t unicode_utf8_trim(char *ip);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,12 @@
|
||||||
|
#ifdef _MSC_VER
|
||||||
#include <platform.h>
|
#include <platform.h>
|
||||||
#include <CuTest.h>
|
#endif
|
||||||
|
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
|
||||||
|
#include <CuTest.h>
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
@ -9,9 +15,33 @@ static void test_unicode_trim(CuTest * tc)
|
||||||
{
|
{
|
||||||
char buffer[32];
|
char buffer[32];
|
||||||
|
|
||||||
strcpy(buffer, "Hello Word");
|
strcpy(buffer, "Hello World");
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_trim(buffer));
|
CuAssertIntEquals(tc, 0, unicode_utf8_trim(buffer));
|
||||||
CuAssertStrEquals(tc, "Hello Word", buffer);
|
CuAssertStrEquals(tc, "Hello World", buffer);
|
||||||
|
|
||||||
|
strcpy(buffer, " Hello World");
|
||||||
|
CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer));
|
||||||
|
CuAssertStrEquals(tc, "Hello World", buffer);
|
||||||
|
|
||||||
|
strcpy(buffer, "Hello World ");
|
||||||
|
CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer));
|
||||||
|
CuAssertStrEquals(tc, "Hello World", buffer);
|
||||||
|
|
||||||
|
strcpy(buffer, " Hello World ");
|
||||||
|
CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer));
|
||||||
|
CuAssertStrEquals(tc, "Hello World", buffer);
|
||||||
|
|
||||||
|
strcpy(buffer, "Hello\t\r\nWorld");
|
||||||
|
CuAssertIntEquals(tc, 3, unicode_utf8_trim(buffer));
|
||||||
|
CuAssertStrEquals(tc, "HelloWorld", buffer);
|
||||||
|
|
||||||
|
strcpy(buffer, "LTR");
|
||||||
|
buffer[3] = -30;
|
||||||
|
buffer[4] = -128;
|
||||||
|
buffer[5] = -114;
|
||||||
|
buffer[6] = 0;
|
||||||
|
CuAssertIntEquals(tc, 3, unicode_utf8_trim(buffer));
|
||||||
|
CuAssertStrEquals(tc, "LTR", buffer);
|
||||||
|
|
||||||
strcpy(buffer, " Hello Word ");
|
strcpy(buffer, " Hello Word ");
|
||||||
CuAssertIntEquals(tc, 4, unicode_utf8_trim(buffer));
|
CuAssertIntEquals(tc, 4, unicode_utf8_trim(buffer));
|
||||||
|
@ -48,7 +78,7 @@ static void test_unicode_tolower(CuTest * tc)
|
||||||
static void test_unicode_utf8_to_other(CuTest *tc)
|
static void test_unicode_utf8_to_other(CuTest *tc)
|
||||||
{
|
{
|
||||||
const unsigned char uchar_str[] = { 0xc3, 0x98, 0xc5, 0xb8, 0xc2, 0x9d, 'l', 0 }; /* ØŸl */
|
const unsigned char uchar_str[] = { 0xc3, 0x98, 0xc5, 0xb8, 0xc2, 0x9d, 'l', 0 }; /* ØŸl */
|
||||||
utf8_t *utf8_str = (utf8_t *)uchar_str;
|
char *utf8_str = (char *)uchar_str;
|
||||||
unsigned char ch;
|
unsigned char ch;
|
||||||
size_t sz;
|
size_t sz;
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str, &sz));
|
CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str, &sz));
|
||||||
|
@ -92,27 +122,27 @@ static void test_unicode_utf8_to_other(CuTest *tc)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_unicode_utf8_to_ucs(CuTest *tc) {
|
static void test_unicode_utf8_to_ucs(CuTest *tc) {
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t sz;
|
size_t sz;
|
||||||
|
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_to_ucs4(&ucs, "a", &sz));
|
CuAssertIntEquals(tc, 0, unicode_utf8_decode(&wc, "a", &sz));
|
||||||
CuAssertIntEquals(tc, 'a', ucs);
|
CuAssertIntEquals(tc, 'a', wc);
|
||||||
CuAssertIntEquals(tc, 1, sz);
|
CuAssertIntEquals(tc, 1, sz);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_unicode_bug2262(CuTest *tc) {
|
static void test_unicode_bug2262(CuTest *tc) {
|
||||||
char name[7];
|
char name[7];
|
||||||
ucs4_t ucs;
|
wint_t wc;
|
||||||
size_t sz;
|
size_t sz;
|
||||||
|
|
||||||
strcpy(name, "utende");
|
strcpy(name, "utende");
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_to_ucs4(&ucs, name, &sz));
|
CuAssertIntEquals(tc, 0, unicode_utf8_decode(&wc, name, &sz));
|
||||||
CuAssertIntEquals(tc, 1, sz);
|
CuAssertIntEquals(tc, 1, sz);
|
||||||
CuAssertIntEquals(tc, 'u', ucs);
|
CuAssertIntEquals(tc, 'u', wc);
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_trim(name));
|
CuAssertIntEquals(tc, 0, unicode_utf8_trim(name));
|
||||||
|
|
||||||
name[0] = -4; /* latin1: ü should fail to decode */
|
name[0] = -4; /* latin1: ü should fail to decode */
|
||||||
CuAssertIntEquals(tc, EILSEQ, unicode_utf8_to_ucs4(&ucs, name, &sz));
|
CuAssertIntEquals(tc, EILSEQ, unicode_utf8_decode(&wc, name, &sz));
|
||||||
CuAssertIntEquals(tc, EILSEQ, unicode_utf8_trim(name));
|
CuAssertIntEquals(tc, EILSEQ, unicode_utf8_trim(name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,26 +153,36 @@ static void test_unicode_compare(CuTest *tc)
|
||||||
CuAssertIntEquals(tc, 1, unicode_utf8_strcasecmp("bacdefg123", "ABCDEFG123"));
|
CuAssertIntEquals(tc, 1, unicode_utf8_strcasecmp("bacdefg123", "ABCDEFG123"));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_unicode_farsi_nzwj(CuTest *tc) {
|
static void test_unicode_trim_zwnj(CuTest *tc) {
|
||||||
const char str[] = { 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd9, 0x84, 0xd8, 0xaf,
|
const char zwnj[] = { 0xe2, 0x80, 0x8c, 0x00 };
|
||||||
0xdb, 0x8c, 0xd9, 0x86, 0x20, 0xd9, 0x85, 0xd8, 0xad, 0xd9, 0x85, 0xd8,
|
|
||||||
0xaf, 0x20, 0xd8, 0xb1, 0xd9, 0x88, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80,
|
|
||||||
0x8e, 0xe2, 0x80, 0x8e, 0x00 };
|
|
||||||
char name[64];
|
char name[64];
|
||||||
strcpy(name, str);
|
char expect[64];
|
||||||
CuAssertIntEquals(tc, 0, unicode_utf8_trim(name));
|
snprintf(name, sizeof(name), "%sA%sB%s ", zwnj, zwnj, zwnj);
|
||||||
CuAssertStrEquals(tc, str, name);
|
snprintf(expect, sizeof(expect), "A%sB", zwnj);
|
||||||
|
CuAssertIntEquals(tc, 8, unicode_utf8_trim(name));
|
||||||
|
CuAssertStrEquals(tc, expect, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void test_unicode_trim_ltrm(CuTest *tc) {
|
||||||
|
const char ltrm[] = { 0xe2, 0x80, 0x8e, 0x00 };
|
||||||
|
char name[64];
|
||||||
|
char expect[64];
|
||||||
|
snprintf(name, sizeof(name), "%sBrot%szeit%s ", ltrm, ltrm, ltrm);
|
||||||
|
snprintf(expect, sizeof(expect), "Brot%szeit", ltrm);
|
||||||
|
CuAssertIntEquals(tc, 8, unicode_utf8_trim(name));
|
||||||
|
CuAssertStrEquals(tc, expect, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
CuSuite *get_unicode_suite(void)
|
CuSuite *get_unicode_suite(void)
|
||||||
{
|
{
|
||||||
CuSuite *suite = CuSuiteNew();
|
CuSuite *suite = CuSuiteNew();
|
||||||
SUITE_ADD_TEST(suite, test_unicode_bug2262);
|
|
||||||
SUITE_ADD_TEST(suite, test_unicode_tolower);
|
|
||||||
SUITE_ADD_TEST(suite, test_unicode_trim);
|
SUITE_ADD_TEST(suite, test_unicode_trim);
|
||||||
|
SUITE_ADD_TEST(suite, test_unicode_trim_zwnj);
|
||||||
|
SUITE_ADD_TEST(suite, test_unicode_trim_ltrm);
|
||||||
SUITE_ADD_TEST(suite, test_unicode_utf8_to_other);
|
SUITE_ADD_TEST(suite, test_unicode_utf8_to_other);
|
||||||
SUITE_ADD_TEST(suite, test_unicode_utf8_to_ucs);
|
SUITE_ADD_TEST(suite, test_unicode_utf8_to_ucs);
|
||||||
SUITE_ADD_TEST(suite, test_unicode_compare);
|
SUITE_ADD_TEST(suite, test_unicode_compare);
|
||||||
SUITE_ADD_TEST(suite, test_unicode_farsi_nzwj);
|
SUITE_ADD_TEST(suite, test_unicode_bug2262);
|
||||||
|
SUITE_ADD_TEST(suite, test_unicode_tolower);
|
||||||
return suite;
|
return suite;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue