forked from github/server
A function to transliterate German special characters, for future use.
This commit is contained in:
parent
f8cadfebee
commit
2d2f566634
3 changed files with 87 additions and 5 deletions
|
@ -22,11 +22,69 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
|
/*
 * Returns how many bytes the UTF-8 sequence starting at s occupies.
 *
 * The length is taken from the lead byte and then clipped to the number of
 * continuation bytes (10xxxxxx) that are really present, so a truncated or
 * malformed sequence can never make the caller step over the terminating
 * 0-byte. A stray continuation byte or an invalid lead byte counts as a
 * one-byte sequence. The result is always at least 1 for a non-empty string.
 */
static size_t utf8_sequence_length(const char *s)
{
    const unsigned char lead = (unsigned char)s[0];
    size_t expected;
    size_t n;

    if ((lead & 0xE0) == 0xC0) {
        expected = 2;
    } else if ((lead & 0xF0) == 0xE0) {
        expected = 3;
    } else if ((lead & 0xF8) == 0xF0) {
        expected = 4;
    } else {
        return 1;
    }

    for (n = 1; n < expected; ++n) {
        /* a 0-byte is not a continuation byte, so this also stops at the end */
        if (((unsigned char)s[n] & 0xC0) != 0x80) {
            break;
        }
    }
    return n;
}

/*
 * Writes a lower-case, ASCII-only transliteration of the UTF-8 string `in`
 * into `out`, which has room for `size` bytes including the terminator.
 *
 * - ASCII bytes are copied through tolower().
 * - German a/o/u umlauts (either case) become "ae", "oe", "ue"; sharp s
 *   (U+00DF) and capital sharp s (U+1E9E) become "ss".
 * - Every other multi-byte character becomes a single '?'.
 *
 * Returns `out` when all of `in` was converted. Returns NULL when the output
 * did not fit (in that case `out` holds the 0-terminated part that did fit),
 * or when `out`/`in` is NULL or `size` is 0 (nothing is written then).
 */
char * transliterate(char * out, size_t size, const char * in)
{
    const char *src = in;
    char *dst = out;

    if (out == NULL || in == NULL || size == 0) {
        /* without this guard --size would wrap around to SIZE_MAX */
        return NULL;
    }
    --size; /* need space for a final 0-byte */

    while (*src && size > 0) {
        const unsigned char lead = (unsigned char)src[0];
        const char *replacement = NULL;
        size_t consumed = 0;

        if (lead < 0x80) {
            *dst++ = (char)tolower(lead);
            ++src;
            --size;
            continue;
        }

        /* src[0] is non-zero, so src[1] is at worst the terminator; src[2]
         * is only read once src[1] is known to be non-zero. */
        if (lead == 0xC3) {
            switch ((unsigned char)src[1]) {
            case 0xA4: /* a umlaut */
            case 0x84: /* A umlaut */
                replacement = "ae";
                break;
            case 0xB6: /* o umlaut */
            case 0x96: /* O umlaut */
                replacement = "oe";
                break;
            case 0xBC: /* u umlaut */
            case 0x9C: /* U umlaut */
                replacement = "ue";
                break;
            case 0x9F: /* sharp s */
                replacement = "ss";
                break;
            default:
                break;
            }
            if (replacement != NULL) {
                consumed = 2;
            }
        } else if (lead == 0xE1
            && (unsigned char)src[1] == 0xBA
            && (unsigned char)src[2] == 0x9E) {
            /* capital sharp s, U+1E9E */
            replacement = "ss";
            consumed = 3;
        }

        if (replacement != NULL) {
            if (size < 2) {
                /* the two-letter replacement does not fit: report truncation
                 * instead of spinning forever on the same input byte */
                break;
            }
            memcpy(dst, replacement, 2);
            dst += 2;
            size -= 2;
            src += consumed;
        } else {
            /* unknown character: exactly one placeholder, one byte accounted */
            src += utf8_sequence_length(src);
            *dst++ = '?';
            --size;
        }
    }

    *dst = 0;
    return *src ? NULL : out;
}
|
||||||
|
|
||||||
typedef struct tref {
|
typedef struct tref {
|
||||||
struct tref *nexthash;
|
struct tref *nexthash;
|
||||||
ucs4_t ucs;
|
ucs4_t ucs;
|
||||||
|
@ -110,7 +168,7 @@ void addtoken(tnode * root, const char *str, variant id)
|
||||||
#else
|
#else
|
||||||
index = lcs % NODEHASHSIZE;
|
index = lcs % NODEHASHSIZE;
|
||||||
#endif
|
#endif
|
||||||
ref = malloc(sizeof(tref));
|
ref = (tref *)malloc(sizeof(tref));
|
||||||
ref->ucs = lcs;
|
ref->ucs = lcs;
|
||||||
ref->node = node;
|
ref->node = node;
|
||||||
ref->nexthash = root->next[index];
|
ref->nexthash = root->next[index];
|
||||||
|
|
|
@ -40,6 +40,8 @@ extern "C" {
|
||||||
void addtoken(struct tnode *root, const char *str, variant id);
|
void addtoken(struct tnode *root, const char *str, variant id);
|
||||||
void freetokens(struct tnode *root);
|
void freetokens(struct tnode *root);
|
||||||
|
|
||||||
|
/*
 * Converts the UTF-8 string `in` to lower-case ASCII in `out` (capacity
 * `size` bytes, one of which is reserved for the terminating 0-byte).
 * German umlauts become "ae"/"oe"/"ue" and sharp s becomes "ss"; other
 * non-ASCII characters are replaced by '?' placeholders.
 * Returns `out` if the whole input was converted, or a null pointer if
 * input was left over because the output buffer was too small.
 */
char * transliterate(char * out, size_t size, const char * in);
|
||||||
|
|
||||||
typedef struct local_names {
|
typedef struct local_names {
|
||||||
struct local_names *next;
|
struct local_names *next;
|
||||||
const struct locale *lang;
|
const struct locale *lang;
|
||||||
|
|
|
@ -1,28 +1,48 @@
|
||||||
#include <cutest/CuTest.h>
|
#include <cutest/CuTest.h>
|
||||||
|
#include <ctype.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "umlaut.h"
|
#include "umlaut.h"
|
||||||
|
|
||||||
|
/* Exercises transliterate(): pass-through, case folding, umlaut mapping and
 * the null result for an undersized output buffer. */
static void test_transliterate(CuTest * tc)
{
    char buffer[32];
    const size_t capacity = sizeof(buffer);
    const char *got;

    /* empty input produces empty output */
    got = transliterate(buffer, capacity, "");
    CuAssertStrEquals(tc, "", got);

    /* plain lower-case ASCII is unchanged, and lands in the caller's buffer */
    got = transliterate(buffer, capacity, "herpderp");
    CuAssertStrEquals(tc, "herpderp", got);
    CuAssertStrEquals(tc, "herpderp", buffer);

    /* upper-case ASCII is folded to lower case */
    got = transliterate(buffer, capacity, "HERPDERP");
    CuAssertStrEquals(tc, "herpderp", got);

    /* umlauts embedded in ASCII text */
    got = transliterate(buffer, capacity, "h\xc3\xa4rpd\xc3\xa4rp");
    CuAssertStrEquals(tc, "haerpdaerp", got);

    /* lower-case umlauts and sharp s */
    got = transliterate(buffer, capacity, "\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f");
    CuAssertStrEquals(tc, "aeoeuess", got);

    /* upper-case umlauts and capital sharp s */
    got = transliterate(buffer, capacity, "\xc3\x84\xc3\x96\xc3\x9c\xe1\xba\x9e");
    CuAssertStrEquals(tc, "aeoeuess", got);

    /* a four-byte buffer cannot hold the result: expect a null pointer */
    got = transliterate(buffer, 4, "herpderp");
    CuAssertStrEquals(tc, 0, got);
}
|
||||||
|
|
||||||
static void test_umlaut(CuTest * tc)
|
static void test_umlaut(CuTest * tc)
|
||||||
{
|
{
|
||||||
char umlauts[] = { 0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc, 0xc3, 0x9f, 0 }; /* auml ouml uuml szlig nul */
|
const char * umlauts = "\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f"; /* auml ouml uuml szlig nul */
|
||||||
tnode tokens = { 0 };
|
tnode tokens = { 0 };
|
||||||
variant id;
|
variant id;
|
||||||
int result;
|
int result;
|
||||||
|
|
||||||
|
/* don't crash on an empty set */
|
||||||
|
result = findtoken(&tokens, "herpderp", &id);
|
||||||
|
CuAssertIntEquals(tc, E_TOK_NOMATCH, result);
|
||||||
|
|
||||||
id.i = 1;
|
id.i = 1;
|
||||||
addtoken(&tokens, "herp", id);
|
addtoken(&tokens, "herpderp", id);
|
||||||
id.i = 2;
|
id.i = 2;
|
||||||
addtoken(&tokens, "derp", id);
|
addtoken(&tokens, "derp", id);
|
||||||
id.i = 3;
|
id.i = 3;
|
||||||
addtoken(&tokens, umlauts, id);
|
addtoken(&tokens, umlauts, id);
|
||||||
|
|
||||||
|
/* we can find substrings if they are significant */
|
||||||
result = findtoken(&tokens, "herp", &id);
|
result = findtoken(&tokens, "herp", &id);
|
||||||
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
||||||
CuAssertIntEquals(tc, 1, id.i);
|
CuAssertIntEquals(tc, 1, id.i);
|
||||||
|
|
||||||
result = findtoken(&tokens, "derp", &id);
|
result = findtoken(&tokens, "DERP", &id);
|
||||||
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
||||||
CuAssertIntEquals(tc, 2, id.i);
|
CuAssertIntEquals(tc, 2, id.i);
|
||||||
|
|
||||||
|
@ -30,11 +50,12 @@ static void test_umlaut(CuTest * tc)
|
||||||
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
||||||
CuAssertIntEquals(tc, 3, id.i);
|
CuAssertIntEquals(tc, 3, id.i);
|
||||||
|
|
||||||
|
/* transliteration is the real magic */
|
||||||
result = findtoken(&tokens, "AEoeUEss", &id);
|
result = findtoken(&tokens, "AEoeUEss", &id);
|
||||||
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
CuAssertIntEquals(tc, E_TOK_SUCCESS, result);
|
||||||
CuAssertIntEquals(tc, 3, id.i);
|
CuAssertIntEquals(tc, 3, id.i);
|
||||||
|
|
||||||
result = findtoken(&tokens, "herpderp", &id);
|
result = findtoken(&tokens, "herp-a-derp", &id);
|
||||||
CuAssertIntEquals(tc, E_TOK_NOMATCH, result);
|
CuAssertIntEquals(tc, E_TOK_NOMATCH, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,5 +63,6 @@ CuSuite *get_umlaut_suite(void)
|
||||||
{
|
{
|
||||||
CuSuite *suite = CuSuiteNew();
|
CuSuite *suite = CuSuiteNew();
|
||||||
SUITE_ADD_TEST(suite, test_umlaut);
|
SUITE_ADD_TEST(suite, test_umlaut);
|
||||||
|
SUITE_ADD_TEST(suite, test_transliterate);
|
||||||
return suite;
|
return suite;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue