From 103a946f413af04ff135caa7638188fd497f3388 Mon Sep 17 00:00:00 2001 From: Enno Rehling Date: Sun, 2 Oct 2016 13:12:47 +0200 Subject: [PATCH] fix endless loop in cp1252 search (wrong sort) fix indentation additional testing --- src/gmtool.c | 4 +- src/util/log.c | 8 +- src/util/unicode.c | 408 ++++++++++++++++++++-------------------- src/util/unicode.h | 6 +- src/util/unicode.test.c | 43 ++++- 5 files changed, 251 insertions(+), 218 deletions(-) diff --git a/src/gmtool.c b/src/gmtool.c index 7a87e4ae6..9533ccab8 100644 --- a/src/gmtool.c +++ b/src/gmtool.c @@ -88,7 +88,7 @@ static void unicode_remove_diacritics(const char *rp, char *wp) { while (*rp) { if (gm_codepage >=0 && *rp & 0x80) { size_t sz = 0; - char ch; + unsigned char ch; switch (gm_codepage) { case 1252: unicode_utf8_to_cp1252(&ch, rp, &sz); @@ -101,7 +101,7 @@ static void unicode_remove_diacritics(const char *rp, char *wp) { break; } rp += sz; - *wp++ = ch; + *wp++ = (char)ch; } else { *wp++ = *rp++; diff --git a/src/util/log.c b/src/util/log.c index d330a96db..1d3724c50 100644 --- a/src/util/log.c +++ b/src/util/log.c @@ -64,11 +64,11 @@ void log_destroy(log_t *handle) { #define LOG_MAXBACKUPS 5 static int -cp_convert(const char *format, char *buffer, size_t length, int codepage) +cp_convert(const char *format, unsigned char *buffer, size_t length, int codepage) { /* when console output on MSDOS, convert to codepage */ const char *input = format; - char *pos = buffer; + unsigned char *pos = buffer; while (pos + 1 < buffer + length && *input) { size_t size = 0; @@ -156,10 +156,10 @@ static void _log_write(FILE * stream, int codepage, const char *format, va_list { if (codepage) { char buffer[MAXLENGTH]; - char converted[MAXLENGTH]; + unsigned char converted[MAXLENGTH]; vsnprintf(buffer, sizeof(buffer), format, args); if (cp_convert(buffer, converted, MAXLENGTH, codepage) == 0) { - fputs(converted, stream); + fputs((char *)converted, stream); } else { /* fall back to non-converted output */ diff --git 
a/src/util/unicode.c b/src/util/unicode.c index ffc170cc9..f7ea2653b 100644 --- a/src/util/unicode.c +++ b/src/util/unicode.c @@ -1,4 +1,4 @@ -/* +/* * +-------------------+ Christian Schlittchen * | | Enno Rehling * | Eressea PBEM host | Katja Zedel @@ -71,7 +71,7 @@ int unicode_utf8_tolower(utf8_t * op, size_t outlen, const utf8_t * ip) int unicode_latin1_to_utf8(utf8_t * dst, size_t * outlen, const char *in, -size_t * inlen) + size_t * inlen) { int is = (int)*inlen; int os = (int)*outlen; @@ -104,7 +104,7 @@ size_t * inlen) return (int)*outlen; } -int unicode_utf8_strcasecmp(const utf8_t * a, const char *b) +int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t *b) { while (*a && *b) { int ret; @@ -147,7 +147,7 @@ int unicode_utf8_strcasecmp(const utf8_t * a, const char *b) /* Convert a UCS-4 character to UTF-8. */ int unicode_ucs4_to_utf8(utf8_t * utf8_character, size_t * size, -ucs4_t ucs4_character) + ucs4_t ucs4_character) { int utf8_bytes; @@ -214,7 +214,7 @@ ucs4_t ucs4_character) /* Convert a UTF-8 encoded character to UCS-4. */ int unicode_utf8_to_ucs4(ucs4_t * ucs4_character, const utf8_t * utf8_string, -size_t * length) + size_t * length) { utf8_t utf8_character = utf8_string[0]; @@ -317,8 +317,8 @@ size_t * length) /** Convert a UTF-8 encoded character to CP437. 
*/ int -unicode_utf8_to_cp437(char *cp_character, const utf8_t * utf8_string, -size_t * length) +unicode_utf8_to_cp437(unsigned char *cp_character, const utf8_t * utf8_string, + size_t * length) { ucs4_t ucs4_character; int result; @@ -330,174 +330,173 @@ size_t * length) } if (ucs4_character < 0x7F) { - *cp_character = (char)ucs4_character; + *cp_character = (unsigned char)ucs4_character; } else { struct { ucs4_t ucs4; unsigned char cp437; } xref[160] = { - { - 0x00A0, 255 }, { - 0x00A1, 173 }, { - 0x00A2, 155 }, { - 0x00A3, 156 }, { - 0x00A5, 157 }, { - 0x00A7, 21 }, { - 0x00AA, 166 }, { - 0x00AB, 174 }, { - 0x00AC, 170 }, { - 0x00B0, 248 }, { - 0x00B1, 241 }, { - 0x00B2, 253 }, { - 0x00B5, 230 }, { - 0x00B6, 20 }, { - 0x00B7, 250 }, { - 0x00BA, 167 }, { - 0x00BB, 175 }, { - 0x00BC, 172 }, { - 0x00BD, 171 }, { - 0x00BF, 168 }, { - 0x00C4, 142 }, { - 0x00C5, 143 }, { - 0x00C6, 146 }, { - 0x00C7, 128 }, { - 0x00C9, 144 }, { - 0x00D1, 165 }, { - 0x00D6, 153 }, { - 0x00DC, 154 }, { - 0x00DF, 225 }, { - 0x00E0, 133 }, { - 0x00E1, 160 }, { - 0x00E2, 131 }, { - 0x00E4, 132 }, { - 0x00E5, 134 }, { - 0x00E6, 145 }, { - 0x00E7, 135 }, { - 0x00E8, 138 }, { - 0x00E9, 130 }, { - 0x00EA, 136 }, { - 0x00EB, 137 }, { - 0x00EC, 141 }, { - 0x00ED, 161 }, { - 0x00EE, 140 }, { - 0x00EF, 139 }, { - 0x00F1, 164 }, { - 0x00F2, 149 }, { - 0x00F3, 162 }, { - 0x00F4, 147 }, { - 0x00F6, 148 }, { - 0x00F7, 246 }, { - 0x00F9, 151 }, { - 0x00FA, 163 }, { - 0x00FB, 150 }, { - 0x00FC, 129 }, { - 0x00FF, 152 }, { - 0x0192, 159 }, { - 0x0393, 226 }, { - 0x0398, 233 }, { - 0x03A3, 228 }, { - 0x03A6, 232 }, { - 0x03A9, 234 }, { - 0x03B1, 224 }, { - 0x03B4, 235 }, { - 0x03B5, 238 }, { - 0x03C0, 227 }, { - 0x03C3, 229 }, { - 0x03C4, 231 }, { - 0x03C6, 237 }, { - 0x2022, 7 }, { - 0x203C, 19 }, { - 0x207F, 252 }, { - 0x20A7, 158 }, { - 0x2190, 27 }, { - 0x2191, 24 }, { - 0x2192, 26 }, { - 0x2193, 25 }, { - 0x2194, 29 }, { - 0x2195, 18 }, { - 0x21A8, 23 }, { - 0x2219, 249 }, { - 0x221A, 251 }, { - 
0x221E, 236 }, { - 0x221F, 28 }, { - 0x2229, 239 }, { - 0x2248, 247 }, { - 0x2261, 240 }, { - 0x2264, 243 }, { - 0x2265, 242 }, { - 0x2302, 127 }, { - 0x2310, 169 }, { - 0x2320, 244 }, { - 0x2321, 245 }, { - 0x2500, 196 }, { - 0x2502, 179 }, { - 0x250C, 218 }, { - 0x2510, 191 }, { - 0x2514, 192 }, { - 0x2518, 217 }, { - 0x251C, 195 }, { - 0x2524, 180 }, { - 0x252C, 194 }, { - 0x2534, 193 }, { - 0x253C, 197 }, { - 0x2550, 205 }, { - 0x2551, 186 }, { - 0x2552, 213 }, { - 0x2553, 214 }, { - 0x2554, 201 }, { - 0x2555, 184 }, { - 0x2556, 183 }, { - 0x2557, 187 }, { - 0x2558, 212 }, { - 0x2559, 211 }, { - 0x255A, 200 }, { - 0x255B, 190 }, { - 0x255C, 189 }, { - 0x255D, 188 }, { - 0x255E, 198 }, { - 0x255F, 199 }, { - 0x2560, 204 }, { - 0x2561, 181 }, { - 0x2562, 182 }, { - 0x2563, 185 }, { - 0x2564, 209 }, { - 0x2565, 210 }, { - 0x2566, 203 }, { - 0x2567, 207 }, { - 0x2568, 208 }, { - 0x2569, 202 }, { - 0x256A, 216 }, { - 0x256B, 215 }, { - 0x256C, 206 }, { - 0x2580, 223 }, { - 0x2584, 220 }, { - 0x2588, 219 }, { - 0x258C, 221 }, { - 0x2590, 222 }, { - 0x2591, 176 }, { - 0x2592, 177 }, { - 0x2593, 178 }, { - 0x25A0, 254 }, { - 0x25AC, 22 }, { - 0x25B2, 30 }, { - 0x25BA, 16 }, { - 0x25BC, 31 }, { - 0x25C4, 17 }, { - 0x25CB, 9 }, { - 0x25D8, 8 }, { - 0x25D9, 10 }, { - 0x263A, 1 }, { - 0x263B, 2 }, { - 0x263C, 15 }, { - 0x2640, 12 }, { - 0x2642, 11 }, { - 0x2660, 6 }, { - 0x2663, 5 }, { - 0x2665, 3 }, { - 0x2666, 4 }, { - 0x266A, 13 }, { - 0x266B, 14 } + { 0x00A0, 255 }, + { 0x00A1, 173 }, + { 0x00A2, 155 }, + { 0x00A3, 156 }, + { 0x00A5, 157 }, + { 0x00A7, 21 }, + { 0x00AA, 166 }, + { 0x00AB, 174 }, + { 0x00AC, 170 }, + { 0x00B0, 248 }, + { 0x00B1, 241 }, + { 0x00B2, 253 }, + { 0x00B5, 230 }, + { 0x00B6, 20 }, + { 0x00B7, 250 }, + { 0x00BA, 167 }, + { 0x00BB, 175 }, + { 0x00BC, 172 }, + { 0x00BD, 171 }, + { 0x00BF, 168 }, + { 0x00C4, 142 }, + { 0x00C5, 143 }, + { 0x00C6, 146 }, + { 0x00C7, 128 }, + { 0x00C9, 144 }, + { 0x00D1, 165 }, + { 0x00D6, 153 }, + { 0x00DC, 154 }, + 
{ 0x00DF, 225 }, + { 0x00E0, 133 }, + { 0x00E1, 160 }, + { 0x00E2, 131 }, + { 0x00E4, 132 }, + { 0x00E5, 134 }, + { 0x00E6, 145 }, + { 0x00E7, 135 }, + { 0x00E8, 138 }, + { 0x00E9, 130 }, + { 0x00EA, 136 }, + { 0x00EB, 137 }, + { 0x00EC, 141 }, + { 0x00ED, 161 }, + { 0x00EE, 140 }, + { 0x00EF, 139 }, + { 0x00F1, 164 }, + { 0x00F2, 149 }, + { 0x00F3, 162 }, + { 0x00F4, 147 }, + { 0x00F6, 148 }, + { 0x00F7, 246 }, + { 0x00F9, 151 }, + { 0x00FA, 163 }, + { 0x00FB, 150 }, + { 0x00FC, 129 }, + { 0x00FF, 152 }, + { 0x0192, 159 }, + { 0x0393, 226 }, + { 0x0398, 233 }, + { 0x03A3, 228 }, + { 0x03A6, 232 }, + { 0x03A9, 234 }, + { 0x03B1, 224 }, + { 0x03B4, 235 }, + { 0x03B5, 238 }, + { 0x03C0, 227 }, + { 0x03C3, 229 }, + { 0x03C4, 231 }, + { 0x03C6, 237 }, + { 0x2022, 7 }, + { 0x203C, 19 }, + { 0x207F, 252 }, + { 0x20A7, 158 }, + { 0x2190, 27 }, + { 0x2191, 24 }, + { 0x2192, 26 }, + { 0x2193, 25 }, + { 0x2194, 29 }, + { 0x2195, 18 }, + { 0x21A8, 23 }, + { 0x2219, 249 }, + { 0x221A, 251 }, + { 0x221E, 236 }, + { 0x221F, 28 }, + { 0x2229, 239 }, + { 0x2248, 247 }, + { 0x2261, 240 }, + { 0x2264, 243 }, + { 0x2265, 242 }, + { 0x2302, 127 }, + { 0x2310, 169 }, + { 0x2320, 244 }, + { 0x2321, 245 }, + { 0x2500, 196 }, + { 0x2502, 179 }, + { 0x250C, 218 }, + { 0x2510, 191 }, + { 0x2514, 192 }, + { 0x2518, 217 }, + { 0x251C, 195 }, + { 0x2524, 180 }, + { 0x252C, 194 }, + { 0x2534, 193 }, + { 0x253C, 197 }, + { 0x2550, 205 }, + { 0x2551, 186 }, + { 0x2552, 213 }, + { 0x2553, 214 }, + { 0x2554, 201 }, + { 0x2555, 184 }, + { 0x2556, 183 }, + { 0x2557, 187 }, + { 0x2558, 212 }, + { 0x2559, 211 }, + { 0x255A, 200 }, + { 0x255B, 190 }, + { 0x255C, 189 }, + { 0x255D, 188 }, + { 0x255E, 198 }, + { 0x255F, 199 }, + { 0x2560, 204 }, + { 0x2561, 181 }, + { 0x2562, 182 }, + { 0x2563, 185 }, + { 0x2564, 209 }, + { 0x2565, 210 }, + { 0x2566, 203 }, + { 0x2567, 207 }, + { 0x2568, 208 }, + { 0x2569, 202 }, + { 0x256A, 216 }, + { 0x256B, 215 }, + { 0x256C, 206 }, + { 0x2580, 223 }, + { 0x2584, 220 
}, + { 0x2588, 219 }, + { 0x258C, 221 }, + { 0x2590, 222 }, + { 0x2591, 176 }, + { 0x2592, 177 }, + { 0x2593, 178 }, + { 0x25A0, 254 }, + { 0x25AC, 22 }, + { 0x25B2, 30 }, + { 0x25BA, 16 }, + { 0x25BC, 31 }, + { 0x25C4, 17 }, + { 0x25CB, 9 }, + { 0x25D8, 8 }, + { 0x25D9, 10 }, + { 0x263A, 1 }, + { 0x263B, 2 }, + { 0x263C, 15 }, + { 0x2640, 12 }, + { 0x2642, 11 }, + { 0x2660, 6 }, + { 0x2663, 5 }, + { 0x2665, 3 }, + { 0x2666, 4 }, + { 0x266A, 13 }, + { 0x266B, 14 } }; int l = 0, r = 160; while (l != r) { @@ -509,7 +508,7 @@ size_t * length) else if (xref[m].ucs4 < ucs4_character) { if (l == m) l = r; else l = m; - } + } else { if (r == m) r = l; else r = m; @@ -523,7 +522,7 @@ size_t * length) } /** Convert a UTF-8 encoded character to ASCII, with '?' replacements. */ -int unicode_utf8_to_ascii(char *cp_character, const utf8_t * utf8_string, +int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string, size_t *length) { int result = unicode_utf8_to_cp437(cp_character, utf8_string, length); @@ -536,7 +535,7 @@ int unicode_utf8_to_ascii(char *cp_character, const utf8_t * utf8_string, } /** Convert a UTF-8 encoded character to CP1252. 
*/ -int unicode_utf8_to_cp1252(char *cp_character, const utf8_t * utf8_string, +int unicode_utf8_to_cp1252(unsigned char *cp_character, const utf8_t * utf8_string, size_t * length) { ucs4_t ucs4_character; @@ -556,39 +555,38 @@ int unicode_utf8_to_cp1252(char *cp_character, const utf8_t * utf8_string, ucs4_t ucs4; unsigned char cp; } xref[] = { - { - 0x20ac, 0x80 }, { - 0x0081, 0x81 }, { - 0x201a, 0x82 }, { - 0x0192, 0x83 }, { - 0x201e, 0x84 }, { - 0x2026, 0x85 }, { - 0x2020, 0x86 }, { - 0x2021, 0x87 }, { - 0x02c6, 0x88 }, { - 0x2030, 0x89 }, { - 0x0160, 0x8a }, { - 0x2039, 0x8b }, { - 0x0152, 0x8c }, { - 0x008d, 0x8d }, { - 0x017d, 0x8e }, { - 0x008f, 0x8f }, { - 0x0090, 0x90 }, { - 0x2018, 0x91 }, { - 0x2019, 0x92 }, { - 0x201c, 0x93 }, { - 0x201d, 0x94 }, { - 0x2022, 0x95 }, { - 0x2013, 0x96 }, { - 0x2014, 0x97 }, { - 0x02dc, 0x98 }, { - 0x2122, 0x99 }, { - 0x0161, 0x9a }, { - 0x203a, 0x9b }, { - 0x0153, 0x9c }, { - 0x009d, 0x9d }, { - 0x017e, 0x9e }, { - 0x0178, 0x9f } + { 0x0081, 0x81 }, + { 0x008d, 0x8d }, + { 0x008f, 0x8f }, + { 0x0090, 0x90 }, + { 0x009d, 0x9d }, + { 0x0152, 0x8c }, + { 0x0153, 0x9c }, + { 0x0160, 0x8a }, + { 0x0161, 0x9a }, + { 0x0178, 0x9f }, + { 0x017d, 0x8e }, + { 0x017e, 0x9e }, + { 0x0192, 0x83 }, + { 0x02c6, 0x88 }, + { 0x02dc, 0x98 }, + { 0x2013, 0x96 }, + { 0x2014, 0x97 }, + { 0x2018, 0x91 }, + { 0x2019, 0x92 }, + { 0x201a, 0x82 }, + { 0x201c, 0x93 }, + { 0x201d, 0x94 }, + { 0x201e, 0x84 }, + { 0x2020, 0x86 }, + { 0x2021, 0x87 }, + { 0x2022, 0x95 }, + { 0x2026, 0x85 }, + { 0x2030, 0x89 }, + { 0x2039, 0x8b }, + { 0x203a, 0x9b }, + { 0x20ac, 0x80 }, + { 0x2122, 0x99 } }; int l = 0, r = sizeof(xref) / sizeof(xref[0]); while (l != r) { diff --git a/src/util/unicode.h b/src/util/unicode.h index b061cd6fb..df68ade02 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -28,15 +28,15 @@ extern "C" { typedef unsigned long ucs4_t; typedef char utf8_t; - int unicode_utf8_to_cp437(char *result, const utf8_t * utf8_string, + int
unicode_utf8_to_cp437(unsigned char *result, const utf8_t * utf8_string, size_t * length); - int unicode_utf8_to_cp1252(char *result, const utf8_t * utf8_string, + int unicode_utf8_to_cp1252(unsigned char *result, const utf8_t * utf8_string, size_t * length); int unicode_utf8_to_ucs4(ucs4_t * result, const utf8_t * utf8_string, size_t * length); int unicode_ucs4_to_utf8(utf8_t * result, size_t * size, ucs4_t ucs4_character); - int unicode_utf8_to_ascii(char *cp_character, const utf8_t * utf8_string, + int unicode_utf8_to_ascii(unsigned char *cp_character, const utf8_t * utf8_string, size_t *length); int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t * b); int unicode_latin1_to_utf8(utf8_t * out, size_t * outlen, diff --git a/src/util/unicode.test.c b/src/util/unicode.test.c index b643775a9..dc33b02a8 100644 --- a/src/util/unicode.test.c +++ b/src/util/unicode.test.c @@ -16,20 +16,55 @@ static void test_unicode_tolower(CuTest * tc) CuAssertStrEquals(tc, "helloX", buffer); } -static void test_unicode_utf8_to_cp437(CuTest *tc) +static void test_unicode_utf8_to_other(CuTest *tc) { - const char utf8_str[4] = { 0xc3, 0x98, 'l', 0 }; // Øl - char ch; + const char utf8_str[] = { 0xc3, 0x98, 0xc5, 0xb8, 0xc2, 0x9d, 'l', 0 }; // ØŸl + unsigned char ch; size_t sz; CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str, &sz)); CuAssertIntEquals(tc, 2, sz); CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str+2, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str+4, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp437(&ch, utf8_str + 6, &sz)); + CuAssertIntEquals(tc, 1, sz); + CuAssertIntEquals(tc, 'l', ch); + + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp1252(&ch, utf8_str, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, 216, ch); + CuAssertIntEquals(tc, 0, 
unicode_utf8_to_cp1252(&ch, utf8_str+2, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, 120, ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp1252(&ch, utf8_str + 4, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, 0x9d, ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_cp1252(&ch, utf8_str + 6, &sz)); + CuAssertIntEquals(tc, 1, sz); + CuAssertIntEquals(tc, 'l', ch); + + CuAssertIntEquals(tc, 0, unicode_utf8_to_ascii(&ch, utf8_str, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_ascii(&ch, utf8_str + 2, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_ascii(&ch, utf8_str + 4, &sz)); + CuAssertIntEquals(tc, 2, sz); + CuAssertIntEquals(tc, '?', ch); + CuAssertIntEquals(tc, 0, unicode_utf8_to_ascii(&ch, utf8_str + 6, &sz)); + CuAssertIntEquals(tc, 1, sz); + CuAssertIntEquals(tc, 'l', ch); } CuSuite *get_unicode_suite(void) { CuSuite *suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_unicode_tolower); - SUITE_ADD_TEST(suite, test_unicode_utf8_to_cp437); + SUITE_ADD_TEST(suite, test_unicode_utf8_to_other); return suite; }