/* Length in bytes of the UTF-8 sequence that starts at s.
 *
 * The expected length is derived from the lead byte; only continuation bytes
 * (10xxxxxx) that are actually present are counted, so a sequence that is cut
 * short by the terminating 0-byte never makes the caller read past the end of
 * the string. A stray continuation byte or an invalid lead byte counts as a
 * sequence of length 1. s must not point at the terminating 0-byte.
 */
static size_t translit_seqlen(const char *s)
{
  unsigned char lead = (unsigned char)s[0];
  size_t want = 1;
  size_t have = 1;

  if ((lead & 0xe0) == 0xc0) {
    want = 2;
  } else if ((lead & 0xf0) == 0xe0) {
    want = 3;
  } else if ((lead & 0xf8) == 0xf0) {
    want = 4;
  }
  while (have < want && ((unsigned char)s[have] & 0xc0) == 0x80) {
    ++have;
  }
  return have;
}

/* Transliterate a UTF-8 string into lower-case ASCII.
 *
 * ASCII bytes are lower-cased with tolower(). The German special characters
 * are expanded to two letters: a/o/u-umlaut (either case) become "ae", "oe",
 * "ue"; sharp s (U+00DF) and capital sharp s (U+1E9E) become "ss". Every
 * other non-ASCII sequence is replaced by a single '?'.
 *
 * out   destination buffer, always 0-terminated when size > 0
 * size  capacity of out in bytes, including the terminating 0-byte
 * in    0-terminated UTF-8 input
 *
 * Returns out if the whole input was converted, or NULL if it did not fit
 * (out then holds the part that did fit; a two-letter expansion is never
 * split) or if size is 0 (out is not touched).
 */
char *transliterate(char *out, size_t size, const char *in)
{
  const char *src = in;
  char *dst = out;

  if (size == 0) {
    return NULL; /* no room for the terminator; --size would wrap around */
  }
  --size; /* need space for a final 0-byte */

  while (*src && size > 0) {
    unsigned char c = (unsigned char)*src;
    const char *repl = NULL; /* two-letter replacement, if any */
    size_t consumed = 0;     /* input bytes covered by repl */

    if (c < 0x80) {
      *dst++ = (char)tolower(c);
      ++src;
      --size;
      continue;
    }

    if (c == 0xc3) {
      switch ((unsigned char)src[1]) {
      case 0xa4: /* a-umlaut */
      case 0x84: /* A-umlaut */
        repl = "ae";
        break;
      case 0xb6: /* o-umlaut */
      case 0x96: /* O-umlaut */
        repl = "oe";
        break;
      case 0xbc: /* u-umlaut */
      case 0x9c: /* U-umlaut */
        repl = "ue";
        break;
      case 0x9f: /* sharp s */
        repl = "ss";
        break;
      default:
        break;
      }
      consumed = 2;
    } else if (c == 0xe1 && (unsigned char)src[1] == 0xba
      && (unsigned char)src[2] == 0x9e) {
      /* capital sharp s, a three-byte sequence */
      repl = "ss";
      consumed = 3;
    }

    if (repl) {
      if (size < 2) {
        /* Expansion does not fit: stop here so the caller sees NULL instead
         * of the loop spinning without making progress. */
        break;
      }
      memcpy(dst, repl, 2);
      dst += 2;
      size -= 2;
      src += consumed;
    } else {
      /* exactly one '?' and one unit of size per unknown sequence */
      *dst++ = '?';
      --size;
      src += translit_seqlen(src);
    }
  }
  *dst = 0;
  return *src ? NULL : out;
}
id; int result; + /* don't crash on an empty set */ + result = findtoken(&tokens, "herpderp", &id); + CuAssertIntEquals(tc, E_TOK_NOMATCH, result); + id.i = 1; - addtoken(&tokens, "herp", id); + addtoken(&tokens, "herpderp", id); id.i = 2; addtoken(&tokens, "derp", id); id.i = 3; addtoken(&tokens, umlauts, id); + /* we can find substrings if they are significant */ result = findtoken(&tokens, "herp", &id); CuAssertIntEquals(tc, E_TOK_SUCCESS, result); CuAssertIntEquals(tc, 1, id.i); - result = findtoken(&tokens, "derp", &id); + result = findtoken(&tokens, "DERP", &id); CuAssertIntEquals(tc, E_TOK_SUCCESS, result); CuAssertIntEquals(tc, 2, id.i); @@ -30,11 +50,12 @@ static void test_umlaut(CuTest * tc) CuAssertIntEquals(tc, E_TOK_SUCCESS, result); CuAssertIntEquals(tc, 3, id.i); + /* transliteration is the real magic */ result = findtoken(&tokens, "AEoeUEss", &id); CuAssertIntEquals(tc, E_TOK_SUCCESS, result); CuAssertIntEquals(tc, 3, id.i); - result = findtoken(&tokens, "herpderp", &id); + result = findtoken(&tokens, "herp-a-derp", &id); CuAssertIntEquals(tc, E_TOK_NOMATCH, result); } @@ -42,5 +63,6 @@ CuSuite *get_umlaut_suite(void) { CuSuite *suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_umlaut); + SUITE_ADD_TEST(suite, test_transliterate); return suite; }