From 55299346f8e43a433db44b1ccb0a8f10a98fac2e Mon Sep 17 00:00:00 2001 From: Enno Rehling Date: Thu, 1 Aug 2019 18:40:42 +0200 Subject: [PATCH 1/2] use wint_t, not long. --- src/summary.c | 6 +-- src/util/filereader.c | 14 +++---- src/util/parser.c | 22 +++++------ src/util/umlaut.c | 30 +++++++-------- src/util/unicode.c | 68 +++++++++++++++++---------------- src/util/unicode.h | 27 +++++++------ src/util/unicode.test.c | 84 ++++++++++++++++++++++++++++++----------- 7 files changed, 147 insertions(+), 104 deletions(-) diff --git a/src/summary.c b/src/summary.c index 06dabba90..1e3ddbedf 100644 --- a/src/summary.c +++ b/src/summary.c @@ -177,11 +177,11 @@ static int count_umlaut(const char *s) int result = 0; const char *cp; for (cp = s; *cp; ++cp) { - ucs4_t ucs = *cp; - if (ucs & 0x80) { + wint_t wc = *cp; + if (wc & 0x80) { size_t size; int err; - err = unicode_utf8_to_ucs4(&ucs, cp, &size); + err = unicode_utf8_decode(&wc, cp, &size); if (err != 0) { log_error("illegal utf8 encoding %s at %s", s, cp); return result; diff --git a/src/util/filereader.c b/src/util/filereader.c index c4a3b56cc..5973c8f1c 100644 --- a/src/util/filereader.c +++ b/src/util/filereader.c @@ -26,12 +26,12 @@ static int eatwhite(const char *ptr, size_t * total_size) *total_size = 0; while (*ptr) { - ucs4_t ucs; + wint_t wc; size_t size = 0; - ret = unicode_utf8_to_ucs4(&ucs, ptr, &size); + ret = unicode_utf8_decode(&wc, ptr, &size); if (ret != 0) break; - if (!iswspace((wint_t)ucs)) + if (!iswspace(wc)) break; *total_size += size; ptr += size; @@ -86,7 +86,7 @@ static const char *getbuf_utf8(FILE * F) } cont = false; while (*bp && cp < fbuf + MAXLINE) { - ucs4_t ucs; + wint_t wc; size_t size; int ret; @@ -119,14 +119,14 @@ static const char *getbuf_utf8(FILE * F) } } - ret = unicode_utf8_to_ucs4(&ucs, bp, &size); + ret = unicode_utf8_decode(&wc, bp, &size); if (ret != 0) { unicode_warning(bp); break; } - if (iswspace((wint_t)ucs)) { + if (iswspace(wc)) { if (!quote) { bp += size; ret = eatwhite(bp, &size); @@ -151,7 +151,7 @@ static const char *getbuf_utf8(FILE * F) bp += size; } } - else if (iswcntrl((wint_t)ucs)) { + else if (iswcntrl(wc)) { if (!comment && cp < fbuf + MAXLINE) { *cp++ = '?'; } diff --git a/src/util/parser.c b/src/util/parser.c index bd187b5f8..e66e2b724 100644 --- a/src/util/parser.c +++ b/src/util/parser.c @@ -27,7 +27,7 @@ static parse_state *states; static int eatwhitespace_c(const char **str_p) { int ret = 0; - ucs4_t ucs; + wint_t wc; size_t len; const char *str = *str_p; @@ -40,12 +40,12 @@ static int eatwhitespace_c(const char **str_p) ++str; } else { - ret = unicode_utf8_to_ucs4(&ucs, str, &len); + ret = unicode_utf8_decode(&wc, str, &len); if (ret != 0) { log_warning("illegal character sequence in UTF8 string: %s\n", str); break; } - if (!iswspace((wint_t)ucs)) + if (!iswspace(wc)) break; str += len; } @@ -106,16 +106,16 @@ void skip_token(void) eatwhitespace_c(&states->current_token); while (*states->current_token) { - ucs4_t ucs; + wint_t wc; size_t len; unsigned char utf8_character = (unsigned char)states->current_token[0]; if (~utf8_character & 0x80) { - ucs = utf8_character; + wc = utf8_character; ++states->current_token; } else { - int ret = unicode_utf8_to_ucs4(&ucs, states->current_token, &len); + int ret = unicode_utf8_decode(&wc, states->current_token, &len); if (ret == 0) { states->current_token += len; } @@ -123,7 +123,7 @@ void skip_token(void) log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token); } } - if (iswspace((wint_t)ucs) && quotechar == 0) { + if (iswspace(wc) && quotechar == 0) { return; } else { @@ -160,17 +160,17 @@ char *parse_token(const char **str, char *lbuf, size_t buflen) return 0; } while (*ctoken) { - ucs4_t ucs; + wint_t wc; size_t len; bool copy = false; unsigned char utf8_character = *(unsigned char *)ctoken; if (~utf8_character & 0x80) { - ucs = utf8_character; + wc = utf8_character; len = 1; } else { - int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len); + int ret = unicode_utf8_decode(&wc, ctoken, &len); if (ret != 0) { log_warning("illegal character sequence in UTF8 string: %s\n", ctoken); break; @@ -180,7 +180,7 @@ char *parse_token(const char **str, char *lbuf, size_t buflen) copy = true; escape = false; } - else if (iswspace((wint_t)ucs)) { + else if (iswspace(wc)) { if (quotechar == 0) break; copy = true; diff --git a/src/util/umlaut.c b/src/util/umlaut.c index 5f135df69..33facc0aa 100644 --- a/src/util/umlaut.c +++ b/src/util/umlaut.c @@ -32,7 +32,7 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. typedef struct tref { struct tref *nexthash; - ucs4_t ucs; + wint_t wc; struct tnode *node; } tref; @@ -99,8 +99,8 @@ char * transliterate(char * out, size_t size, const char * in) size -= advance; } else { - ucs4_t ucs; - int ret = unicode_utf8_to_ucs4(&ucs, src, &len); + wint_t wc; + int ret = unicode_utf8_decode(&wc, src, &len); if (ret != 0) { /* encoding is broken. yikes */ log_error("transliterate | encoding error in '%s'\n", src); @@ -127,7 +127,7 @@ void addtoken(tnode ** root, const char *str, variant id) { tnode * tk; static const struct replace { - ucs4_t ucs; + wint_t wc; const char str[3]; } replace[] = { /* match lower-case (!) umlauts and others to transcriptions */ @@ -150,10 +150,10 @@ void addtoken(tnode ** root, const char *str, variant id) else { tref *next; int ret, index, i = 0; - ucs4_t ucs, lcs; + wint_t ucs, lcs; size_t len; - ret = unicode_utf8_to_ucs4(&ucs, str, &len); + ret = unicode_utf8_decode(&ucs, str, &len); assert(ret == 0 || !"invalid utf8 string"); lcs = ucs; @@ -166,7 +166,7 @@ void addtoken(tnode ** root, const char *str, variant id) next = tk->next[index]; if (!(tk->flags & LEAF)) tk->id = id; - while (next && next->ucs != ucs) + while (next && next->wc != ucs) next = next->nexthash; if (!next) { tref *ref; @@ -181,7 +181,7 @@ void addtoken(tnode ** root, const char *str, variant id) ref = (tref *)malloc(sizeof(tref)); if (!ref) abort(); - ref->ucs = ucs; + ref->wc = ucs; ref->node = node; ref->nexthash = tk->next[index]; tk->next[index] = ref; @@ -195,7 +195,7 @@ void addtoken(tnode ** root, const char *str, variant id) #endif ref = (tref *)malloc(sizeof(tref)); assert_alloc(ref); - ref->ucs = lcs; + ref->wc = lcs; ref->node = node; ++node->refcount; ref->nexthash = tk->next[index]; @@ -211,7 +211,7 @@ void addtoken(tnode ** root, const char *str, variant id) } addtoken(&next->node, str + len, id); while (replace[i].str[0]) { - if (lcs == replace[i].ucs) { + if (lcs == replace[i].wc) { char zText[1024]; memcpy(zText, replace[i].str, 3); str_strlcpy(zText + 2, (const char *)str + len, sizeof(zText)-2); @@ -255,9 +255,9 @@ int findtoken(const void * root, const char *key, variant * result) do { int index; const tref *ref; - ucs4_t ucs; + wint_t wc; size_t len; - int ret = unicode_utf8_to_ucs4(&ucs, str, &len); + int ret = unicode_utf8_decode(&wc, str, &len); if (ret != 0) { /* encoding is broken. youch */ @@ -265,12 +265,12 @@ int findtoken(const void * root, const char *key, variant * result) return E_TOK_NOMATCH; } #if NODEHASHSIZE == 8 - index = ucs & 7; + index = wc & 7; #else - index = ucs % NODEHASHSIZE; + index = wc % NODEHASHSIZE; #endif ref = tk->next[index]; - while (ref && ref->ucs != ucs) + while (ref && ref->wc != wc) ref = ref->nexthash; str += len; if (!ref) { diff --git a/src/util/unicode.c b/src/util/unicode.c index b4bb803dd..e4e7b0d61 100644 --- a/src/util/unicode.c +++ b/src/util/unicode.c @@ -33,18 +33,18 @@ #define B00000011 0x03 #define B00000001 0x01 -int unicode_utf8_trim(utf8_t *buf) +size_t unicode_utf8_trim(char *buf) { int result = 0, ts = 0; - utf8_t *op = buf, *ip = buf, *lc = buf; + char *op = buf, *ip = buf, *lc = buf; assert(buf); while (*ip) { size_t size = 1; wint_t wc = *ip; if (wc & 0x80) { - ucs4_t ucs = 0; + wint_t ucs = 0; if (ip[1]) { - int ret = unicode_utf8_to_ucs4(&ucs, ip, &size); + int ret = unicode_utf8_decode(&ucs, ip, &size); if (ret != 0) { return ret; } @@ -56,22 +56,24 @@ int unicode_utf8_trim(utf8_t *buf) ++result; } } - if (op == buf && iswspace(wc)) { - ++result; + if (op == buf && (iswcntrl(wc) || iswspace(wc))) { + result += size; } else if (wc>255 || !iscntrl(wc)) { if (op != ip) { memmove(op, ip, size); } op += size; - if (iswspace(wc)) ++ts; + if (iswcntrl(wc) || iswspace(wc)) { + ts += size; + } else { lc = op; ts = 0; } } else { - ++result; + result += size; } ip += size; } @@ -79,15 +81,15 @@ int unicode_utf8_trim(utf8_t *buf) return result + ts; } -int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip) +int unicode_utf8_tolower(char * op, size_t outlen, const char * ip) { while (*ip) { - ucs4_t ucs = *ip; - ucs4_t low; + wint_t ucs = *ip; + wint_t low; size_t size = 1; if (ucs & 0x80) { - int ret = unicode_utf8_to_ucs4(&ucs, ip, &size); + int ret = unicode_utf8_decode(&ucs, ip, &size); if (ret != 0) { return ret; } @@ -104,7 +106,7 @@ int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip) } else { ip += size; - unicode_ucs4_to_utf8(op, &size, low); + unicode_utf8_encode(op, &size, low); op += size; outlen -= size; } @@ -114,7 +116,7 @@ int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip) } int -unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in, +unicode_latin1_to_utf8(char * dst, size_t * outlen, const char *in, size_t * inlen) { int is = (int)*inlen; @@ -148,15 +150,15 @@ unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in, return (int)*outlen; } -int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b) +int unicode_utf8_strcasecmp(const char * a, const char *b) { while (*a && *b) { int ret; size_t size; - ucs4_t ucsa = *a, ucsb = *b; + wint_t ucsa = *a, ucsb = *b; if (ucsa & 0x80) { - ret = unicode_utf8_to_ucs4(&ucsa, a, &size); + ret = unicode_utf8_decode(&ucsa, a, &size); if (ret != 0) return -1; a += size; @@ -164,7 +166,7 @@ int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b) else ++a; if (ucsb & 0x80) { - ret = unicode_utf8_to_ucs4(&ucsb, b, &size); + ret = unicode_utf8_decode(&ucsb, b, &size); if (ret != 0) return -1; b += size; @@ -188,10 +190,10 @@ int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b) return 0; } -/* Convert a UCS-4 character to UTF-8. */ +/* Convert a wide character to UTF-8. */ int -unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size, - ucs4_t ucs4_character) +unicode_utf8_encode(char * utf8_character, size_t * size, + wint_t ucs4_character) { int utf8_bytes; @@ -213,6 +215,7 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size, utf8_character[1] = (char)(((ucs4_character >> 6) & B00111111) | B10000000); utf8_character[2] = (char)((ucs4_character & B00111111) | B10000000); } +#if 0 else if (ucs4_character <= 0x001FFFFF) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ utf8_bytes = 4; @@ -246,6 +249,7 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size, utf8_character[4] = (char)(((ucs4_character >> 6) & B00111111) | B10000000); utf8_character[5] = (char)((ucs4_character & B00111111) | B10000000); } +#endif else { return EILSEQ; } @@ -257,10 +261,10 @@ unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size, /* Convert a UTF-8 encoded character to UCS-4. */ int -unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string, +unicode_utf8_decode(wint_t * ucs4_character, const char * utf8_string, size_t * length) { - utf8_t utf8_character = utf8_string[0]; + char utf8_character = utf8_string[0]; /* Is the character in the ASCII range? If so, just copy it to the output. */ @@ -361,13 +365,13 @@ unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string, /** Convert a UTF-8 encoded character to CP437. */ int -unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string, +unicode_utf8_to_cp437(unsigned char *cp_character, const char * utf8_string, size_t * length) { - ucs4_t ucs4_character; + wint_t ucs4_character; int result; - result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length); + result = unicode_utf8_decode(&ucs4_character, utf8_string, length); if (result != 0) { /* pass decoding characters upstream */ return result; @@ -378,7 +382,7 @@ unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string, } else { struct { - ucs4_t ucs4; + wint_t ucs4; unsigned char cp437; } xref[160] = { { 0x00A0, 255 }, @@ -566,7 +570,7 @@ unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string, } /** Convert a UTF-8 encoded character to ASCII, with '?' replacements. */ -int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string, +int unicode_utf8_to_ascii(unsigned char *cp_character, const char * utf8_string, size_t *length) { int result = unicode_utf8_to_cp437(cp_character, utf8_string, length); @@ -579,13 +583,13 @@ int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_strin } /** Convert a UTF-8 encoded character to CP1252. */ -int unicode_utf8_to_cp1252(unsigned char *cp_character, const utf8_t * utf8_string, +int unicode_utf8_to_cp1252(unsigned char *cp_character, const char * utf8_string, size_t * length) { - ucs4_t ucs4_character; + wint_t ucs4_character; int result; - result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length); + result = unicode_utf8_decode(&ucs4_character, utf8_string, length); if (result != 0) { /* pass decoding characters upstream */ return result; @@ -596,7 +600,7 @@ int unicode_utf8_to_cp1252(unsigned char *cp_character, const utf8_t * utf8_stri } else { struct { - ucs4_t ucs4; + wint_t ucs4; unsigned char cp; } xref[] = { { 0x0081, 0x81 }, diff --git a/src/util/unicode.h b/src/util/unicode.h index 4fd860e45..6ebcc4794 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -19,30 +19,29 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #ifndef _UNICODE_H #define _UNICODE_H +#include +#include + #ifdef __cplusplus extern "C" { #endif -#include #define USE_UNICODE - typedef long ucs4_t; - typedef char utf8_t; - - int unicode_utf8_to_cp437(unsigned char *result, const utf8_t * utf8_string, + int unicode_utf8_to_cp437(unsigned char *result, const char * utf8_string, size_t * length); - int unicode_utf8_to_cp1252(unsigned char *result, const utf8_t * utf8_string, + int unicode_utf8_to_cp1252(unsigned char *result, const char * utf8_string, size_t * length); - int unicode_utf8_to_ucs4(ucs4_t * result, const utf8_t * utf8_string, + int unicode_utf8_decode(wint_t * result, const char * utf8_string, size_t * length); - int unicode_ucs4_to_utf8(utf8_t * result, size_t * size, - ucs4_t ucs4_character); - int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string, + int unicode_utf8_encode(char * result, size_t * size, + wint_t ucs4_character); + int unicode_utf8_to_ascii(unsigned char *cp_character, const char * utf8_string, size_t *length); - int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t * b); - int unicode_latin1_to_utf8(utf8_t * out, size_t * outlen, + int unicode_utf8_strcasecmp(const char * a, const char * b); + int unicode_latin1_to_utf8(char * out, size_t * outlen, const char *in, size_t * inlen); - int unicode_utf8_tolower(utf8_t *op, size_t outlen, const utf8_t *ip); - int unicode_utf8_trim(utf8_t *ip); + int unicode_utf8_tolower(char *op, size_t outlen, const char *ip); + size_t unicode_utf8_trim(char *ip); #ifdef __cplusplus } diff --git a/src/util/unicode.test.c b/src/util/unicode.test.c index 7cada9da6..17c453296 100644 --- a/src/util/unicode.test.c +++ b/src/util/unicode.test.c @@ -1,6 +1,12 @@ +#ifdef _MSC_VER #include -#include +#endif + #include "unicode.h" + +#include + +#include #include #include #include @@ -9,9 +15,33 @@ static void test_unicode_trim(CuTest * tc) { char buffer[32]; - strcpy(buffer, "Hello Word"); + strcpy(buffer, "Hello World"); CuAssertIntEquals(tc, 0, unicode_utf8_trim(buffer)); - CuAssertStrEquals(tc, "Hello Word", buffer); + CuAssertStrEquals(tc, "Hello World", buffer); + + strcpy(buffer, " Hello World"); + CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer)); + CuAssertStrEquals(tc, "Hello World", buffer); + + strcpy(buffer, "Hello World "); + CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer)); + CuAssertStrEquals(tc, "Hello World", buffer); + + strcpy(buffer, " Hello World "); + CuAssertIntEquals(tc, 2, unicode_utf8_trim(buffer)); + CuAssertStrEquals(tc, "Hello World", buffer); + + strcpy(buffer, "Hello\t\r\nWorld"); + CuAssertIntEquals(tc, 3, unicode_utf8_trim(buffer)); + CuAssertStrEquals(tc, "HelloWorld", buffer); + + strcpy(buffer, "LTR"); + buffer[3] = -30; + buffer[4] = -128; + buffer[5] = -114; + buffer[6] = 0; + CuAssertIntEquals(tc, 3, unicode_utf8_trim(buffer)); + CuAssertStrEquals(tc, "LTR", buffer); strcpy(buffer, " Hello Word "); CuAssertIntEquals(tc, 4, unicode_utf8_trim(buffer)); @@ -48,7 +78,7 @@ static void test_unicode_tolower(CuTest * tc) static void test_unicode_utf8_to_other(CuTest *tc) { const unsigned char uchar_str[] = { 0xc3, 0x98, 0xc5, 0xb8, 0xc2, 0x9d, 'l', 0 }; /* ØŸl */ - utf8_t *utf8_str = (utf8_t *)uchar_str; + char *utf8_str = (char *)uchar_str; unsigned char ch; size_t sz; CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str, &sz)); @@ -92,27 +122,27 @@ static void test_unicode_utf8_to_other(CuTest *tc) } static void test_unicode_utf8_to_ucs(CuTest *tc) { - ucs4_t ucs; + wint_t wc; size_t sz; - CuAssertIntEquals(tc, 0, unicode_utf8_to_ucs4(&ucs, "a", &sz)); - CuAssertIntEquals(tc, 'a', ucs); + CuAssertIntEquals(tc, 0, unicode_utf8_decode(&wc, "a", &sz)); + CuAssertIntEquals(tc, 'a', wc); CuAssertIntEquals(tc, 1, sz); } static void test_unicode_bug2262(CuTest *tc) { char name[7]; - ucs4_t ucs; + wint_t wc; size_t sz; strcpy(name, "utende"); - CuAssertIntEquals(tc, 0, unicode_utf8_to_ucs4(&ucs, name, &sz)); + CuAssertIntEquals(tc, 0, unicode_utf8_decode(&wc, name, &sz)); CuAssertIntEquals(tc, 1, sz); - CuAssertIntEquals(tc, 'u', ucs); + CuAssertIntEquals(tc, 'u', wc); CuAssertIntEquals(tc, 0, unicode_utf8_trim(name)); name[0] = -4; /* latin1: ü should fail to decode */ - CuAssertIntEquals(tc, EILSEQ, unicode_utf8_to_ucs4(&ucs, name, &sz)); + CuAssertIntEquals(tc, EILSEQ, unicode_utf8_decode(&wc, name, &sz)); CuAssertIntEquals(tc, EILSEQ, unicode_utf8_trim(name)); } @@ -123,26 +153,36 @@ static void test_unicode_compare(CuTest *tc) CuAssertIntEquals(tc, 1, unicode_utf8_strcasecmp("bacdefg123", "ABCDEFG123")); } -static void test_unicode_farsi_nzwj(CuTest *tc) { - const char str[] = { 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd9, 0x84, 0xd8, 0xaf, - 0xdb, 0x8c, 0xd9, 0x86, 0x20, 0xd9, 0x85, 0xd8, 0xad, 0xd9, 0x85, 0xd8, - 0xaf, 0x20, 0xd8, 0xb1, 0xd9, 0x88, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, - 0x8e, 0xe2, 0x80, 0x8e, 0x00 }; +static void test_unicode_trim_zwnj(CuTest *tc) { + const char zwnj[] = { 0xe2, 0x80, 0x8c, 0x00 }; char name[64]; - strcpy(name, str); - CuAssertIntEquals(tc, 0, unicode_utf8_trim(name)); - CuAssertStrEquals(tc, str, name); + char expect[64]; + snprintf(name, sizeof(name), "%sA%sB%s ", zwnj, zwnj, zwnj); + snprintf(expect, sizeof(expect), "A%sB", zwnj); + CuAssertIntEquals(tc, 8, unicode_utf8_trim(name)); + CuAssertStrEquals(tc, expect, name); +} + +static void test_unicode_trim_ltrm(CuTest *tc) { + const char ltrm[] = { 0xe2, 0x80, 0x8e, 0x00 }; + char name[64]; + char expect[64]; + snprintf(name, sizeof(name), "%sBrot%szeit%s ", ltrm, ltrm, ltrm); + snprintf(expect, sizeof(expect), "Brot%szeit", ltrm); + CuAssertIntEquals(tc, 8, unicode_utf8_trim(name)); + CuAssertStrEquals(tc, expect, name); } CuSuite *get_unicode_suite(void) { CuSuite *suite = CuSuiteNew(); - SUITE_ADD_TEST(suite, test_unicode_bug2262); - SUITE_ADD_TEST(suite, test_unicode_tolower); SUITE_ADD_TEST(suite, test_unicode_trim); + SUITE_ADD_TEST(suite, test_unicode_trim_zwnj); + SUITE_ADD_TEST(suite, test_unicode_trim_ltrm); SUITE_ADD_TEST(suite, test_unicode_utf8_to_other); SUITE_ADD_TEST(suite, test_unicode_utf8_to_ucs); SUITE_ADD_TEST(suite, test_unicode_compare); - SUITE_ADD_TEST(suite, test_unicode_farsi_nzwj); + SUITE_ADD_TEST(suite, test_unicode_bug2262); + SUITE_ADD_TEST(suite, test_unicode_tolower); return suite; } From 964a0586dcc2885fb13242e0bcf5b2e1671df1c8 Mon Sep 17 00:00:00 2001 From: Enno Rehling Date: Thu, 1 Aug 2019 22:58:20 +0200 Subject: [PATCH 2/2] fix UTF8 trimming --- src/util/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/unicode.c b/src/util/unicode.c index e4e7b0d61..cbc2db7cc 100644 --- a/src/util/unicode.c +++ b/src/util/unicode.c @@ -40,7 +40,7 @@ size_t unicode_utf8_trim(char *buf) assert(buf); while (*ip) { size_t size = 1; - wint_t wc = *ip; + wint_t wc = *(unsigned char *)ip; if (wc & 0x80) { wint_t ucs = 0; if (ip[1]) {