- added a testsuite

- testcase skill-parser - findskill uses patricia
2008-04-25 14:31:38 +00:00 · 2008-04-25 14:31:38 +00:00 · 1dd05538ad
commit 1dd05538ad
parent a2abcfe177
21 changed files with 634 additions and 91 deletions
--- a/src/common/gamecode/laws.c
+++ b/src/common/gamecode/laws.c
@ -3775,7 +3775,7 @@ static void reset_rng(void) {

+/* Re-initialise the random number generator for region r.
+ * The value handed to rng_init() is the region's index plus the current
+ * value of `turn` (presumably the seed - confirm against rng_init). */
 static void reset_rng_region(region * r)
 {
-  rng_init(r->index);
+  rng_init(r->index+turn);
 }

 /** warn about passwords that are not US ASCII.
--- a/src/common/gamecode/xmlreport.c
+++ b/src/common/gamecode/xmlreport.c
@ -87,11 +87,11 @@ xml_s(const char * str)
 {
  static xmlChar buffer[1024];
  const char * inbuf = str;
-  unsigned char * outbuf = buffer;
+  char * outbuf = (char *)buffer;
  size_t inbytes = strlen(str)+1;
  size_t outbytes = sizeof(buffer) - 1;

-  unicode_latin1_to_utf8(outbuf, &outbytes, (const xmlChar *)inbuf, &inbytes);
+  unicode_latin1_to_utf8(outbuf, &outbytes, inbuf, &inbytes);
  buffer[outbytes] = 0;
  return buffer;
 }
--- a/src/common/kernel/eressea.c
+++ b/src/common/kernel/eressea.c
@ -68,6 +68,7 @@
 #include <util/umlaut.h>
 #include <util/xml.h>
 #include <util/bsdstring.h>
+#include <util/unicode.h>

 /* libxml includes */
 #include <libxml/tree.h>
@ -83,6 +84,11 @@
 #include <time.h>
 #include <errno.h>

+#define PTRIES 1
+#if PTRIES
+#include <util/patricia.h>
+#endif
+
 /* exported variables */
 region  *regions;
 faction *factions;
@ -1332,14 +1338,126 @@ findoption(const char *s, const struct locale * lang)
  return NODIRECTION;
 }

+#if PTRIES
+static struct trie_node * ptries[UT_MAX][4];
+
+/* Select the trie root slot for a locale and a token type.
+ * The address of the slot is returned (not the root itself) so that callers
+ * can search the trie and also let trie_insert() replace its root.
+ * NOTE(review): only the locale named "de" gets slot 1; every other locale
+ * shares slot 0, and slots 2 and 3 of each row are never selected here.
+ * Confirm that this sharing is intended. */
+static struct trie_node **
+get_ptrie(const struct locale * lang, int type)
+{
+  const char * name = locale_name(lang);
+
+  if (strcmp(name, "de")==0) {
+    return &ptries[type][1];
+  }
+  return &ptries[type][0];
+}
+
+/* Copy the UTF-8 string ip to op, replacing a fixed set of lower-case
+ * umlauts and Nordic letters by their two-letter ASCII transcription.
+ *
+ * ip:     0-terminated UTF-8 input
+ * op:     output buffer
+ * outlen: size of op in bytes, including room for the terminating 0
+ *
+ * Returns the number of substitutions made (0 if none), or -1 if the input
+ * could not be decoded or the result (plus terminator) does not fit in op.
+ * Decoder errors are reported as -1 and not passed through, because the
+ * caller treats every positive return value as a substitution count. */
+static int
+umlaut_substitution(const char * ip, char * op, size_t outlen)
+{
+#define UMAX 7
+  static const struct replace {
+    ucs4_t ucs;
+    const char str[3];
+  } replace[UMAX] = {
+    /* match lower-case (!) umlauts and others to transcriptions.
+     * the table is sorted by code point, the range check below relies on it */
+    { 223, "ss"}, /* szlig */
+    { 228, "ae"}, /* auml */
+    { 229, "aa"}, /* norsk */
+    { 230, "ae"}, /* norsk */
+    { 246, "oe"}, /* ouml */
+    { 248, "oe"}, /* norsk */
+    { 252, "ue"}, /* uuml */
+  };
+  int subs = 0;
+
+  while (*ip) {
+    /* go through unsigned char so that high bytes are not sign-extended */
+    ucs4_t ucs = (unsigned char)*ip;
+    size_t size = 1;      /* bytes consumed from ip */
+    size_t cpsize = 1;    /* bytes written to op */
+    const char * src = ip;
+
+    if (ucs & 0x80) {
+      if (unicode_utf8_to_ucs4(&ucs, ip, &size)!=0) {
+        return -1;
+      }
+      cpsize = size;
+      if (ucs >= replace[0].ucs && ucs <= replace[UMAX-1].ucs) {
+        int i;
+        for (i=0;i!=UMAX;++i) {
+          if (replace[i].ucs==ucs) {
+            src = replace[i].str;
+            cpsize = 2;
+            ++subs;
+            break;
+          }
+        }
+      }
+    }
+
+    /* the same bounds check covers plain copies and substitutions, and
+     * always keeps one byte in reserve for the terminating 0 */
+    if (cpsize>=outlen) {
+      return -1;
+    }
+    memcpy(op, src, cpsize);
+
+    ip += size;
+    op += cpsize;
+    outlen -= cpsize;
+  }
+
+  if (outlen==0) {
+    return -1;
+  }
+  *op = 0;
+  return subs;
+}
+
+/* Look up key with trie_find_prefix() and copy the payload of the node
+ * that was found into data.
+ *
+ * size is the number of payload bytes to copy; data must have room for it.
+ * Returns 0 if a node was found, -1 otherwise (data is left untouched). */
+static int
+ptrie_find(struct trie_node *ptrie, const char * key, void * data, size_t size)
+{
+  trie_node * match = trie_find_prefix(ptrie, key);
+
+  if (match==NULL) {
+    return -1;
+  }
+  memcpy(data, trie_getdata(match), size);
+  return 0;
+}
+
+/* Register name in the trie with the given payload.
+ * The name is stored in lower case; if it contains characters that
+ * umlaut_substitution() transcribes, the transcribed spelling is stored
+ * as a second key with the same payload.
+ *
+ * Returns 0 on success, otherwise the error code of
+ * unicode_utf8_tolower() (nothing is inserted in that case). */
+static int
+ptrie_insert(struct trie_node **ptrie, const char * name, void * data, size_t size)
+{
+  char lower[256];
+  char ascii[256];
+  int err = unicode_utf8_tolower(lower, sizeof(lower), name);
+
+  if (err!=0) {
+    return err;
+  }
+  if (umlaut_substitution(lower, ascii, sizeof(ascii))>0) {
+    trie_insert(ptrie, ascii, data, size);
+  }
+  trie_insert(ptrie, lower, data, size);
+  return 0;
+}
+#endif
+
+/* Translate a skill name typed in locale lang into a skill_t.
+ * With PTRIES enabled the input is lower-cased and looked up with a prefix
+ * search in the locale's UT_SKILLS trie; otherwise the token table from
+ * get_translations() is searched.
+ * Returns NOSKILL if lower-casing fails or nothing matches. */
 skill_t
 findskill(const char *s, const struct locale * lang)
 {
+#if PTRIES
+  char lowercase[256];
+  int res = unicode_utf8_tolower(lowercase, sizeof(lowercase), s);
+  if (res==0) {
+    trie_node ** ptrie = get_ptrie(lang, UT_SKILLS);
+    skill_t sk;
+    /* the trie payload is the skill_t that init_locale stored for the name */
+    int result = ptrie_find(*ptrie, lowercase, &sk, sizeof(sk));
+    if (result==0) return sk;
+  }
+  return NOSKILL;
+#else
   struct tnode * tokens = get_translations(lang, UT_SKILLS);
   variant token;

   if (findtoken(tokens, s, &token)==E_TOK_NOMATCH) return NOSKILL;
   return (skill_t)token.i;
+#endif
 }

 keyword_t
@ -1954,6 +2072,9 @@ init_locale(const struct locale * lang)
  const struct race * rc;
  struct tnode * tokens;
  const terrain_type * terrain;
+#if PTRIES
+  trie_node ** ptrie;
+#endif

  tokens = get_translations(lang, UT_MAGIC);
  for (i=0;i!=MAXMAGIETYP;++i) {
@ -1976,7 +2097,18 @@ init_locale(const struct locale * lang)
    var.i = i;
    addtoken(tokens, LOC(lang, parameters[i]), var);
  }
-
+#if PTRIES
+  ptrie = get_ptrie(lang, UT_SKILLS);
+  for (i=0;i!=MAXSKILLS;++i) {
+    if (i!=SK_TRADE || !TradeDisabled()) {
+      skill_t sk = (skill_t)i;
+      const char * skname = skillname(sk, lang);
+      if (skname!=NULL) {
+        ptrie_insert(ptrie, skname, &sk, sizeof(sk));
+      }
+    }
+  }
+#else
  tokens = get_translations(lang, UT_SKILLS);
  for (i=0;i!=MAXSKILLS;++i) {
    if (i!=SK_TRADE || !TradeDisabled()) {
@ -1987,6 +2119,7 @@ init_locale(const struct locale * lang)
      }
    }
  }
+#endif

  tokens = get_translations(lang, UT_KEYWORDS);
  for (i=0;i!=MAXKEYWORDS;++i) {
--- a/src/common/kernel/names.c
+++ b/src/common/kernel/names.c
@ -384,7 +384,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)
  char * bufp;
 	unsigned int c = 0;
 	size_t bpt, i;
-  wint_t ucs;
+  ucs4_t ucs;
  size_t size;
  int result;

@ -401,7 +401,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)
    assert(result==0 || "damnit, we're not handling invalid input here!");

    /* Leerzeichen überspringen */
-    while (*p != 0 && !iswalnum(ucs)) {
+    while (*p != 0 && !iswalnum((wint_t)ucs)) {
 			p += size;
      result = unicode_utf8_to_ucs4(&ucs, p, &size);
      assert(result==0 || "damnit, we're not handling invalid input here!");
@ -411,7 +411,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)
    if (*p != 0) ++c;

 		/* alnums überspringen */
-    while (*p != 0 && iswalnum(ucs)) {
+    while (*p != 0 && iswalnum((wint_t)ucs)) {
 			p+=size;
      result = unicode_utf8_to_ucs4(&ucs, p, &size);
      assert(result==0 || "damnit, we're not handling invalid input here!");
@ -434,7 +434,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)
 	while (*p != 0 && c < maxchars) {
 		/* Leerzeichen überspringen */

-    while (*p != 0 && !iswalnum(ucs)) {
+    while (*p != 0 && !iswalnum((wint_t)ucs)) {
 			p+=size;
      result = unicode_utf8_to_ucs4(&ucs, p, &size);
      assert(result==0 || "damnit, we're not handling invalid input here!");
@ -442,7 +442,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)

 		/* alnums übertragen */

-		for (i = 0; i < bpt && *p != 0 && iswalnum(ucs); ++i) {
+		for (i = 0; i < bpt && *p != 0 && iswalnum((wint_t)ucs); ++i) {
 			memcpy(bufp, p, size);
 			p += size;
      bufp += size;
@ -454,7 +454,7 @@ abkz(const char *s, char * buf, size_t buflen, size_t maxchars)

 		/* Bis zum nächsten Leerzeichen */

-    while (c < maxchars && *p != 0 && iswalnum(ucs)) {
+    while (c < maxchars && *p != 0 && iswalnum((wint_t)ucs)) {
 			p+=size;
      result = unicode_utf8_to_ucs4(&ucs, p, &size);
      assert(result==0 || "damnit, we're not handling invalid input here!");
--- a/src/common/kernel/save.c
+++ b/src/common/kernel/save.c
@ -178,7 +178,7 @@ freadstr(FILE * F, int encoding, char * start, size_t size)
                char inbuf = (char)c;
                size_t inbytes = 1;
                size_t outbytes = size-(str-start);
-                int ret = unicode_latin1_to_utf8((xmlChar *)str, &outbytes, (const xmlChar *)&inbuf, &inbytes);
+                int ret = unicode_latin1_to_utf8(str, &outbytes, &inbuf, &inbytes);
                if (ret>0) str+=ret;
                else {
                  log_error(("input data was not iso-8859-1! assuming utf-8\n"));
@ -197,7 +197,7 @@ freadstr(FILE * F, int encoding, char * start, size_t size)
            char inbuf = (char)c;
            size_t inbytes = 1;
            size_t outbytes = size-(str-start);
-            int ret = unicode_latin1_to_utf8((xmlChar *)str, &outbytes, (const xmlChar *)&inbuf, &inbytes);
+            int ret = unicode_latin1_to_utf8(str, &outbytes, &inbuf, &inbytes);
            if (ret>0) str+=ret;
            else {
              log_error(("input data was not iso-8859-1! assuming utf-8\n"));
--- a/src/common/modules/autoseed.c
+++ b/src/common/modules/autoseed.c
@ -246,7 +246,7 @@ read_newfactions(const char * filename)
        char buffer[32];
        size_t outbytes = sizeof(buffer) - 1;
        size_t inbytes = strlen(race);
-        unicode_latin1_to_utf8((unsigned char *)buffer, &outbytes, (const unsigned char *)race, &inbytes);
+        unicode_latin1_to_utf8(buffer, &outbytes, race, &inbytes);
        buffer[outbytes] = 0;
        nf->race = findrace(buffer, default_locale);
        if (nf->race==NULL) {
--- a/src/common/util/filereader.c
+++ b/src/common/util/filereader.c
@ -29,11 +29,11 @@ eatwhite(const char * ptr, size_t * total_size)
  *total_size = 0;

  while (*ptr) {
-    wint_t ucs;
+    ucs4_t ucs;
    size_t size = 0;
    ret = unicode_utf8_to_ucs4(&ucs, ptr, &size);
    if (ret!=0) break;
-    if (!iswspace(ucs)) break;
+    if (!iswspace((wint_t)ucs)) break;
    *total_size += size;
    ptr += size;
  }
@ -149,7 +149,7 @@ getbuf_latin1(FILE * F)
        char inbuf = (char)c;
        size_t inbytes = 1;
        size_t outbytes = MAXLINE-(cp-fbuf);
-        int ret = unicode_latin1_to_utf8((xmlChar *)cp, &outbytes, (const xmlChar *)&inbuf, &inbytes);
+        int ret = unicode_latin1_to_utf8(cp, &outbytes, &inbuf, &inbytes);
        if (ret>0) cp+=ret;
        else {
          log_error(("input data was not iso-8859-1! assuming utf-8\n"));
@ -213,7 +213,7 @@ getbuf_utf8(FILE * F)
    }
    cont = false;
    while (*bp && cp<fbuf+MAXLINE) {
-      wint_t ucs;
+      ucs4_t ucs;
      size_t size;
      int ret;
      
@ -244,7 +244,7 @@ getbuf_utf8(FILE * F)
        break;
      }

-      if (iswspace(ucs)) {
+      if (iswspace((wint_t)ucs)) {
        if (!quote) {
          bp += size;
          ret = eatwhite(bp, &size);
@ -264,7 +264,7 @@ getbuf_utf8(FILE * F)
        } else {
          bp+=size;
        }
-      } else if (iswcntrl(ucs)) {
+      } else if (iswcntrl((wint_t)ucs)) {
        if (!comment && cp<fbuf+MAXLINE) {
          *cp++ = '?';
        }
--- a/src/common/util/goodies.c
+++ b/src/common/util/goodies.c
@ -74,32 +74,6 @@ set_string (char **s, const char *neu)
  return *s;
 }

-boolean
-locale_check(void) 
-{
-  int i, errorlevel = 0;
-  const unsigned char * umlaute = (const unsigned char*)"äöüÄÖÜß";
-  unsigned char result[32];
-  size_t inbytes = strlen((const char *)umlaute);
-  size_t outbytes = sizeof(result);
-  int ret = unicode_latin1_to_utf8(result, &outbytes, umlaute, &inbytes);
-  if (ret<=0) {
-    ++errorlevel;
-  }
-  /* E: das testet, ob umlaute funktionieren. Wenn äöü nicht mit isalpha() true sind, kriegen wir ärger. */
-  for (i=0;i!=3;++i) {
-    if (towupper(umlaute[i])!=(int)umlaute[i+3]) {
-      ++errorlevel;
-    }
-  }
-  for (i=0;umlaute[i]!=0;++i) {
-    if (!iswalpha(umlaute[i]) || iswspace(umlaute[i]) || iswcntrl(umlaute[i])) {
-      ++errorlevel;
-    }
-  }
-  if (errorlevel) return false;
-  return true;
-}

 static int 
 spc_email_isvalid(const char *address) 
--- a/src/common/util/goodies.h
+++ b/src/common/util/goodies.h
@ -18,7 +18,6 @@
 extern "C" {
 #endif

-extern boolean locale_check(void);
 extern char * set_string(char **s, const char *neu);
 extern int set_email(char** pemail, const char *newmail);

--- a/src/common/util/parser.c
+++ b/src/common/util/parser.c
@ -23,7 +23,7 @@ static int
 eatwhitespace_c(const char ** str)
 {
  int ret;
-  wint_t ucs;
+  ucs4_t ucs;
  size_t len;

  /* skip over potential whitespace */
@ -38,7 +38,7 @@ eatwhitespace_c(const char ** str)
        log_warning(("illegal character sequence in UTF8 string: %s\n", *str));
        return ret;
      }
-      if (!iswspace(ucs)) break;
+      if (!iswspace((wint_t)ucs)) break;
      *str+=len;
    }
  }
@ -89,7 +89,7 @@ skip_token(void)
  eatwhitespace_c(&state->current_token);

  while (*state->current_token) {
-    wint_t ucs;
+    ucs4_t ucs;
    size_t len;

    unsigned char utf8_character = (unsigned char)state->current_token[0];
@ -104,7 +104,7 @@ skip_token(void)
        log_warning(("illegal character sequence in UTF8 string: %s\n", state->current_token));
      }
    }
-    if (iswspace(ucs) && quotechar==0) {
+    if (iswspace((wint_t)ucs) && quotechar==0) {
      return;
    } else {
      switch(utf8_character) {
@ -134,7 +134,7 @@ parse_token(const char ** str)

  eatwhitespace_c(&ctoken);
  while (*ctoken && cursor-lbuf < MAXTOKENSIZE-1) {
-    wint_t ucs;
+    ucs4_t ucs;
    size_t len;
    boolean copy = false;

@ -152,7 +152,7 @@ parse_token(const char ** str)
    if (escape) {
      copy = true;
      escape = false;
-    } else if (iswspace(ucs)) {
+    } else if (iswspace((wint_t)ucs)) {
      if (quotechar==0) break;
      copy = true;
    } else if (utf8_character=='"' || utf8_character=='\'') {
--- a/src/common/util/patricia.c
+++ b/src/common/util/patricia.c
@ -0,0 +1,236 @@
+#include <config.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "patricia.h"
+
+#define MAXKEYLEN 128
+
+/* TODO: custom memory management to optimize cache layout, or use arrays. */
+
+/* NOTE: The structure saves an extra 0 delimiter for the key. Technically 
+ * this wouldn't be necessary (because we know its length from data[0]),
+ * but it makes it possible for trie_getkey to return a key without making 
+ * a copy or have a cumbersome (const char**, size_t*) interface.
+ *       +-----------+-------------+------+------------+
+ * data: | keylen(1) | key(keylen) | 0(1) | data(size) |
+ *       +-----------+-------------+------+------------+
+ */
+
+/* One node of the trie.
+ * A lookup tests bit number bitpos of the search key and follows l when
+ * the bit is 0 and r when it is 1. A child whose bitpos is not greater
+ * than that of the node pointing to it ends the descent; the key stored
+ * in that child is then compared against the search key. */
+struct trie_node {
+  struct trie_node *l, *r; /* children for a 0 bit (l) and a 1 bit (r); may be NULL */
+  char * data;             /* one allocation: keylen byte, key, 0 byte, payload */
+  unsigned int bitpos;     /* number of the key bit this node branches on */
+};
+
+/* get_bit(c, s, p): bit number p of the key c of length s bytes. Bit p lives
+ * in byte p/8 and is counted from that byte's least significant bit.
+ * The macro yields 0 once p/8 is greater than s; for p/8 == s it reads c[s],
+ * which is the 0 byte that follows every key handled in this file. The
+ * disabled function variant returns 0 for p/8 >= s instead.
+ * NOTE: the macro evaluates p more than once, so p must have no side effects. */
+#if 1
+#define get_bit(c, s, p) (unsigned int)((((p)>>3)>(unsigned int)(s))?0:((c)[(p)>>3]>>((p)&7)&1))
+#else
+unsigned int get_bit(const char * c, size_t s, unsigned int p)
+{
+  if ((p>>3)>=(unsigned int)s) return 0;
+  return ((c)[p>>3]>>(p&7)&1);
+}
+#endif
+/* node_bit(n, p): bit p of the key stored in node n (length byte at data[0],
+ * key bytes starting at data+1). */
+#define node_bit(n, p) get_bit((n)->data+1, (n)->data[0], (p))
+
+/* Insert key with a copy of size bytes of data into the trie at *root_p.
+ *
+ * root_p: address of the root pointer; it is updated when the new node
+ *         becomes the root.
+ * key:    0-terminated key, must be shorter than MAXKEYLEN bytes because
+ *         its length is stored in a single char.
+ * data:   payload to copy, or NULL to only reserve size bytes that the
+ *         caller fills in later through trie_getdata().
+ * size:   payload size in bytes; 0 turns the trie into a set.
+ *
+ * Returns the new node. If the key is already present, the existing node
+ * is returned and its payload is left unchanged. Returns NULL if the key
+ * is too long or memory could not be allocated; the trie is unchanged then. */
+trie_node * trie_insert(trie_node **root_p, const char * key, const void * data, size_t size)
+{
+  trie_node * new_node;
+  size_t keylen = strlen(key);
+  trie_node ** insert_p = root_p, *node = *insert_p;
+  unsigned int p, bit=0;
+
+  assert(keylen<MAXKEYLEN);
+  if (keylen>=MAXKEYLEN) {
+    /* also reject over-long keys when assertions are compiled out */
+    return NULL;
+  }
+
+  for (p=0;p!=keylen*8+1;++p) {
+    bit = get_bit(key, keylen, p);
+
+    /* NULL-pointers lead to someplace we haven't got a prefix yet. */
+    if (node==NULL) {
+      break;
+    }
+
+    /* if we have the full prefix that the current node represents, move on */
+    if (p==node->bitpos) {
+      insert_p = bit?&node->r:&node->l;
+      node = *insert_p;
+      if (node==NULL) {
+        continue;
+      }
+    }
+
+    /* if we are looking at a back-node, we need to add our node before it. */
+    if (p>=node->bitpos) {
+      /* find the point p where both differ. */
+      if (keylen==(unsigned int)node->data[0] && strncmp(key, node->data+1, keylen)==0) {
+        /* we are trying to insert the same key again */
+        return node;
+      }
+      do {
+        ++p;
+        bit = get_bit(key, keylen, p);
+      } while (node_bit(node, p)==bit);
+      break;
+    }
+
+    /* if instead we differ before reaching the end of the current prefix, we must split.
+     * we insert our node before the current one and re-attach it. */
+    if (node_bit(node, p)!=bit) {
+      break;
+    }
+  }
+
+  new_node = (trie_node *)malloc(sizeof(trie_node));
+  if (new_node==NULL) {
+    return NULL;
+  }
+  new_node->data = malloc(keylen+2+size);
+  if (new_node->data==NULL) {
+    free(new_node);
+    return NULL;
+  }
+  new_node->bitpos = p;
+  new_node->data[0] = (char)keylen;
+  memcpy(new_node->data+1, key, keylen+1); /* includes the terminating 0 */
+  if (data!=NULL && size>0) {
+    /* if data is NULL then the user only wanted some space that they're going to write to later */
+    /* if size is 0 then the user is using the trie as a set, not a map */
+    memcpy(new_node->data+2+keylen, data, size);
+  }
+  if (bit) {
+    new_node->l = node;
+    new_node->r = new_node; /* loop the 1-bit to ourselves, search will end */
+  } else {
+    new_node->l = new_node; /* loop the 0-bit to ourselves, search will end */
+    new_node->r = node;
+  }
+  *insert_p = new_node;
+  return new_node;
+}
+
+/* Remove the node pos from the trie at *root_p and free it.
+ *
+ * The trie is walked along the bits of pos' key. If pos points back to
+ * itself it is simply unlinked. Otherwise the node holding the back-pointer
+ * to pos swaps its key/data buffer with pos and is unlinked in its place,
+ * so that pos stays in the tree carrying the other node's key.
+ *
+ * A NULL pos is ignored. If the walk ends without reaching pos (for
+ * example because pos does not belong to this trie), nothing is freed. */
+void trie_remove(trie_node **root_p, trie_node *pos)
+{
+  if (pos!=NULL) {
+    const char * key = trie_getkey(pos);
+    size_t keylen = pos->data[0];
+    trie_node ** node_p = root_p;
+    trie_node * node = *root_p;
+
+    while (node) {
+      int bit;
+      trie_node ** next_p;
+      trie_node * next;
+
+      if (node == pos) {
+        if (node->l==node) {
+          *node_p = node->r;
+          break;
+        } else if (node->r==node) {
+          *node_p = node->l;
+          break;
+        }
+      }
+
+      bit = get_bit(key, keylen, node->bitpos);
+      next_p = bit?&node->r:&node->l;
+      next = *next_p;
+      if (next == pos && next->bitpos<=node->bitpos) {
+        /* the element that has a back-pointer to pos gets swapped with pos */
+        char * data = pos->data;
+        pos->data = node->data;
+        node->data = data;
+
+        /* finally, find the back-pointer to node and set it to pos */
+        next_p = bit?&node->l:&node->r; /* NB: this is the OTHER child of node */
+        next = *next_p;
+        key = trie_getkey(node);
+        keylen = (unsigned int)node->data[0];
+        while (next) {
+          int new_bit;
+          if (next==node) {
+            *next_p = pos;
+            break;
+          }
+          new_bit = get_bit(key, keylen, next->bitpos);
+          next_p = new_bit?&next->r:&next->l;
+          next = *next_p;
+        }
+        *node_p = bit?node->l:node->r;
+        break;
+      }
+      node = *next_p;
+      node_p = next_p;
+    }
+    /* node is NULL when the walk fell off the trie without finding pos */
+    if (node!=NULL) {
+      free(node->data);
+      free(node);
+    }
+  }
+}
+
+/* Print the trie below root to stdout, one line per node:
+ * "<key> <bitpos> | <key of l> | <key of r>", with "?" for a NULL child.
+ * Only children with a larger bitpos are descended into, so back-pointers
+ * are printed but not followed. A NULL root prints nothing. */
+void trie_debug(trie_node * root)
+{
+  const char * l;
+  const char * r;
+
+  if (root==NULL) {
+    return;
+  }
+  l = root->l?trie_getkey(root->l):"?";
+  r = root->r?trie_getkey(root->r):"?";
+  /* bitpos is unsigned, so it is printed with %u */
+  printf("%s %u | %s | %s\n", trie_getkey(root), root->bitpos, l, r);
+  if (root->l && root->l->bitpos > root->bitpos) trie_debug(root->l);
+  if (root->r && root->r->bitpos > root->bitpos) trie_debug(root->r);
+}
+
+/* Find the node whose key is exactly key.
+ * The descent follows one key bit per node; it stops at the first child
+ * whose bitpos does not increase, and only that child's key is compared.
+ * Returns the matching node, or NULL if the key is not stored. */
+trie_node * trie_find(trie_node *root, const char *key)
+{
+  const size_t keylen = strlen(key);
+  trie_node * cur = root;
+
+  while (cur!=NULL) {
+    trie_node * child = get_bit(key, keylen, cur->bitpos) ? cur->r : cur->l;
+
+    if (child==NULL) {
+      break;
+    }
+    if (cur->bitpos>=child->bitpos) {
+      /* back-pointer: this is the only candidate, compare the full key */
+      if (keylen==(unsigned int)child->data[0] && strncmp(key, child->data+1, keylen)==0) {
+        return child;
+      }
+      break;
+    }
+    cur = child;
+  }
+  return NULL;
+}
+
+/* Find a node whose key starts with key.
+ * Works like trie_find(), but the candidate reached at the end of the
+ * descent matches if its first strlen(key) bytes equal key.
+ * Returns that node, or NULL if the candidate does not start with key. */
+trie_node * trie_find_prefix(trie_node *root, const char *key)
+{
+  const size_t keylen = strlen(key);
+  trie_node * cur = root;
+
+  while (cur!=NULL) {
+    trie_node * child = get_bit(key, keylen, cur->bitpos) ? cur->r : cur->l;
+
+    if (child==NULL) {
+      break;
+    }
+    if (cur->bitpos>=child->bitpos) {
+      /* back-pointer: the stored key may be longer than the search key */
+      if (keylen<=(unsigned int)child->data[0] && strncmp(key, child->data+1, keylen)==0) {
+        return child;
+      }
+      break;
+    }
+    cur = child;
+  }
+  return NULL;
+}
+
+/* Return a pointer to the payload stored in node. The payload follows the
+ * length byte, the key and the key's terminating 0 in the node's buffer. */
+void * trie_getdata(trie_node * node)
+{
+  int keylen = node->data[0];
+
+  return node->data+2+keylen;
+}
+
+/* Return the 0-terminated key stored in node. The pointer refers to the
+ * node's own buffer (right after the length byte); no copy is made. */
+const char * trie_getkey(trie_node * node)
+{
+  return &node->data[1];
+}
+
+/* Free the whole trie below root, including each node's key/data buffer.
+ * Only children with a larger bitpos are descended into, so nodes reached
+ * through back-pointers are not freed twice. A NULL root is a no-op. */
+void trie_free(trie_node * root)
+{
+  if (root) {
+    if (root->l && root->l->bitpos>root->bitpos) trie_free(root->l);
+    if (root->r && root->r->bitpos>root->bitpos) trie_free(root->r);
+    free(root->data); /* allocated separately by trie_insert */
+    free(root);
+  }
+}
--- a/src/common/util/patricia.h
+++ b/src/common/util/patricia.h
@ -0,0 +1,21 @@
+/* Interface of the trie from patricia.c: a map from 0-terminated string
+ * keys to fixed-size payloads that are copied into the nodes.
+ * NOTE(review): the prototypes use size_t but this header includes nothing
+ * that defines it; it relies on the including file. */
+#ifndef H_PATRICIA
+#define H_PATRICIA
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* opaque node type; an empty trie is a NULL trie_node pointer */
+typedef struct trie_node trie_node;
+
+/* insert key with a copy of size bytes of data; returns the new node, or the
+ * existing node (payload unchanged) if the key is already stored */
+trie_node * trie_insert(trie_node **root, const char *key, const void *data, size_t size);
+/* exact lookup; returns NULL if key is not stored */
+trie_node * trie_find(trie_node *root, const char *key);
+/* pointer to the payload stored inside node */
+void * trie_getdata(trie_node *node);
+/* pointer to the 0-terminated key stored inside node */
+const char * trie_getkey(trie_node *node);
+/* release the nodes of the whole trie */
+void trie_free(trie_node * root);
+/* unlink the node pos from the trie and free it */
+void trie_remove(trie_node **root_p, trie_node *pos);
+/* print every node's key, bitpos and child keys with printf */
+void trie_debug(trie_node * root);
+/* lookup that also accepts key as a prefix of the stored key */
+trie_node * trie_find_prefix(trie_node *root, const char *key);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/src/common/util/umlaut.c
+++ b/src/common/util/umlaut.c
@ -32,7 +32,7 @@

 typedef struct tref {
  struct tref * nexthash;
-  wint_t ucs;
+  ucs4_t ucs;
  struct tnode * node;
 } tref;

@ -43,7 +43,7 @@ void
 addtoken(tnode * root, const char * str, variant id)
 {
  static struct replace {
-    wint_t ucs;
+    ucs4_t ucs;
    const char str[3];
  } replace[] = {
    /* match lower-case (!) umlauts and others to transcriptions */
@ -63,7 +63,7 @@ addtoken(tnode * root, const char * str, variant id)
  } else {
    tref * next;
    int ret, index, i = 0;
-    wint_t ucs, lcs;
+    ucs4_t ucs, lcs;
    size_t len;

    ret = unicode_utf8_to_ucs4(&ucs, str, &len);
@ -84,10 +84,10 @@ addtoken(tnode * root, const char * str, variant id)
      tnode * node = calloc(1, sizeof(tnode));

      if (ucs<'a' || ucs>'z') {
-        lcs = towlower(ucs);
+        lcs = towlower((wint_t)ucs);
      }
      if (ucs==lcs) {
-        ucs = towupper(ucs);
+        ucs = towupper((wint_t)ucs);
      }

      ref = malloc(sizeof(tref));
@ -136,7 +136,7 @@ findtoken(const tnode * tk, const char * str, variant* result)
  do {
    int index;
    const tref * ref;
-    wint_t ucs;
+    ucs4_t ucs;
    size_t len;
    int ret = unicode_utf8_to_ucs4(&ucs, str, &len);

--- a/src/common/util/unicode.c
+++ b/src/common/util/unicode.c
@ -14,13 +14,67 @@
 #include <errno.h>
 #include <wctype.h>

+#define B00000000 0x00
+#define B10000000 0x80
+#define B11000000 0xC0
+#define B11100000 0xE0
+#define B11110000 0xF0
+#define B11111000 0xF8
+#define B11111100 0xFC
+#define B11111110 0xFE
+
+#define B00111111 0x3F
+#define B00011111 0x1F
+#define B00001111 0x0F
+#define B00000111 0x07
+#define B00000011 0x03
+#define B00000001 0x01
+
 int
-unicode_latin1_to_utf8(unsigned char *out, size_t *outlen, const unsigned char *in, size_t *inlen)
+unicode_utf8_tolower(utf8_t *op, size_t outlen, const utf8_t *ip)
+{
+  while (*ip) {
+    ucs4_t ucs = *ip;
+    ucs4_t low;
+    size_t size = 1;
+
+    if (ucs & 0x80) {
+      int ret = unicode_utf8_to_ucs4(&ucs, ip, &size);
+      if (ret!=0) {
+        return ret;
+      }
+    }
+    if (size>outlen) {
+      return ENOMEM;
+    }
+    low = towlower((wint_t)ucs);
+    if (low==ucs) {
+      memcpy(op, ip, size);
+      ip += size;
+      op += size;
+      outlen -=size;
+    } else {
+      ip += size;
+      unicode_ucs4_to_utf8(op, &size, low);
+      op += size;
+      outlen -=size;
+    }
+  }
+
+  if (outlen<=0) {
+    return ENOMEM;
+  }
+  *op = 0;
+  return 0;
+}
+
+int
+unicode_latin1_to_utf8(utf8_t *out, size_t *outlen, const char *in, size_t *inlen)
 {
  int is = (int)*inlen;
  int os = (int)*outlen;
-  const unsigned char * ip = in;
-  unsigned char * op = out;
+  const char * ip = in;
+  utf8_t * op = out;

  while (ip-in<is) {
    unsigned char c = *ip;
@ -44,12 +98,12 @@ unicode_latin1_to_utf8(unsigned char *out, size_t *outlen, const unsigned char *
 }

 int
-unicode_utf8_strcasecmp(const char * a, const char * b)
+unicode_utf8_strcasecmp(const utf8_t * a, const char * b)
 {
  while (*a && *b) {
    int ret;
    size_t size;
-    wint_t ucsa = *a, ucsb = *b;
+    ucs4_t ucsa = *a, ucsb = *b;

    if (ucsa & 0x80) {
      ret = unicode_utf8_to_ucs4(&ucsa, a, &size);
@ -63,8 +117,8 @@ unicode_utf8_strcasecmp(const char * a, const char * b)
    } else ++b;

    if (ucsb!=ucsa) {
-      ucsb = towlower(ucsb);
-      ucsa = towlower(ucsa);
+      ucsb = towlower((wint_t)ucsb);
+      ucsa = towlower((wint_t)ucsa);
      if (ucsb<ucsa) return 1;
      if (ucsb>ucsa) return -1;
    }
@ -74,12 +128,73 @@ unicode_utf8_strcasecmp(const char * a, const char * b)
  return 0;
 }

+/* Convert a UCS-4 character to UTF-8.
+ *
+ * utf8_character: receives 1 to 6 bytes; no terminating 0 is written
+ * size:           receives the number of bytes written
+ * ucs4_character: code point, at most 0x7FFFFFFF
+ *
+ * Returns 0 on success. For larger values EILSEQ is returned and neither
+ * output is touched. */
+int
+unicode_ucs4_to_utf8 (utf8_t *utf8_character, size_t *size, ucs4_t ucs4_character)
+{
+  ucs4_t rest = ucs4_character;
+  ucs4_t lead;      /* length marker bits of the first byte */
+  int utf8_bytes;
+  int i;
+
+  if (rest <= 0x0000007F) {
+    utf8_bytes = 1; lead = B00000000;   /* 0xxxxxxx */
+  } else if (rest <= 0x000007FF) {
+    utf8_bytes = 2; lead = B11000000;   /* 110xxxxx + 1 trail byte */
+  } else if (rest <= 0x0000FFFF) {
+    utf8_bytes = 3; lead = B11100000;   /* 1110xxxx + 2 trail bytes */
+  } else if (rest <= 0x001FFFFF) {
+    utf8_bytes = 4; lead = B11110000;   /* 11110xxx + 3 trail bytes */
+  } else if (rest <= 0x03FFFFFF) {
+    utf8_bytes = 5; lead = B11111000;   /* 111110xx + 4 trail bytes */
+  } else if (rest <= 0x7FFFFFFF) {
+    utf8_bytes = 6; lead = B11111100;   /* 1111110x + 5 trail bytes */
+  } else {
+    return EILSEQ;
+  }
+
+  /* trail bytes (10xxxxxx) are filled from the end, six bits at a time */
+  for (i=utf8_bytes-1; i>0; --i) {
+    utf8_character[i] = (char) ((rest & B00111111) | B10000000);
+    rest >>= 6;
+  }
+  /* what is left fits into the payload bits of the first byte */
+  utf8_character[0] = (char) (rest | lead);
+
+  *size = utf8_bytes;
+
+  return 0;
+}
+
+
 /* Convert a UTF-8 encoded character to UCS-4. */
 int
-unicode_utf8_to_ucs4(wint_t *ucs4_character, const char *utf8_string, 
+unicode_utf8_to_ucs4(ucs4_t *ucs4_character, const utf8_t *utf8_string, 
                     size_t *length)
 {
-  unsigned char utf8_character = (unsigned char)utf8_string[0];
+  utf8_t utf8_character = utf8_string[0];

  /* Is the character in the ASCII range? If so, just copy it to the
  output. */
@ -202,10 +317,10 @@ unicode_utf8_to_ucs4(wint_t *ucs4_character, const char *utf8_string,

 /** Convert a UTF-8 encoded character to CP437. */
 int
-unicode_utf8_to_cp437(char *cp_character, const char *utf8_string, 
+unicode_utf8_to_cp437(char *cp_character, const utf8_t *utf8_string, 
                     size_t *length)
 {
-  wint_t ucs4_character;
+  ucs4_t ucs4_character;
  int result;

  result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
@ -217,7 +332,7 @@ unicode_utf8_to_cp437(char *cp_character, const char *utf8_string,
  if (ucs4_character<0x7F) {
    *cp_character = (char)ucs4_character;
  } else {
-    struct { wint_t ucs4; unsigned char cp437; } xref[160] = {
+    struct { ucs4_t ucs4; unsigned char cp437; } xref[160] = {
      {0x00A0, 255}, {0x00A1, 173}, {0x00A2, 155}, {0x00A3, 156}, 
      {0x00A5, 157}, {0x00A7,  21}, {0x00AA, 166}, {0x00AB, 174}, 
      {0x00AC, 170}, {0x00B0, 248}, {0x00B1, 241}, {0x00B2, 253}, 
@ -278,10 +393,10 @@ unicode_utf8_to_cp437(char *cp_character, const char *utf8_string,

 /** Convert a UTF-8 encoded character to CP1252. */
 int
-unicode_utf8_to_cp1252(char *cp_character, const char *utf8_string, 
+unicode_utf8_to_cp1252(char *cp_character, const utf8_t *utf8_string, 
                       size_t *length)
 {
-  wint_t ucs4_character;
+  ucs4_t ucs4_character;
  int result;

  result = unicode_utf8_to_ucs4(&ucs4_character, utf8_string, length);
@ -293,7 +408,7 @@ unicode_utf8_to_cp1252(char *cp_character, const char *utf8_string,
  if (ucs4_character<=0x7F || ucs4_character>=0xA0) {
    *cp_character = (char)ucs4_character;
  } else {
-    struct { wint_t ucs4; unsigned char cp; } xref[] = {
+    struct { ucs4_t ucs4; unsigned char cp; } xref[] = {
      {0x20ac, 0x80}, {0x0081, 0x81}, {0x201a, 0x82}, {0x0192, 0x83},
      {0x201e, 0x84}, {0x2026, 0x85}, {0x2020, 0x86}, {0x2021, 0x87},
      {0x02c6, 0x88}, {0x2030, 0x89}, {0x0160, 0x8a}, {0x2039, 0x8b},
--- a/src/common/util/unicode.h
+++ b/src/common/util/unicode.h
@ -21,11 +21,16 @@ extern "C" {

 #include <wchar.h>
 #define USE_UNICODE
-  extern int unicode_utf8_to_cp437(char *ucs4_character, const char *utf8_string, size_t *length);
-  extern int unicode_utf8_to_cp1252(char *ucs4_character, const char *utf8_string, size_t *length);
-  extern int unicode_utf8_to_ucs4(wint_t *ucs4_character, const char *utf8_string, size_t *length);
-  extern int unicode_utf8_strcasecmp(const char * a, const char * b);
-  extern int unicode_latin1_to_utf8(unsigned char *out, size_t *outlen, const unsigned char *in, size_t *inlen);
+  typedef unsigned long ucs4_t;
+  typedef char utf8_t;
+
+  extern int unicode_utf8_to_cp437(char *result, const utf8_t *utf8_string, size_t *length);
+  extern int unicode_utf8_to_cp1252(char *result, const utf8_t *utf8_string, size_t *length);
+  extern int unicode_utf8_to_ucs4(ucs4_t *result, const utf8_t *utf8_string, size_t *length);
+  extern int unicode_ucs4_to_utf8 (utf8_t *result, size_t *size, ucs4_t ucs4_character);
+  extern int unicode_utf8_strcasecmp(const utf8_t * a, const utf8_t * b);
+  extern int unicode_latin1_to_utf8(utf8_t *out, size_t *outlen, const char *in, size_t *inlen);
+  extern int unicode_utf8_tolower(utf8_t *out, size_t outlen, const utf8_t *in);

 #ifdef __cplusplus
 }
--- a/src/eressea/Jamfile
+++ b/src/eressea/Jamfile
@ -34,6 +34,7 @@ SHARED_BINDINGS =
 <lua>spell.cpp
 <lua>unit.cpp
 <lua>item.cpp
+ <lua>test.cpp
 ;

 Library luabindings : $(SHARED_BINDINGS) ;
--- a/src/eressea/lua/bindings.h
+++ b/src/eressea/lua/bindings.h
@ -16,6 +16,9 @@ extern void bind_event(struct lua_State * L);
 extern void bind_message(struct lua_State * L);
 extern void bind_objects(struct lua_State * L);

+/* test routines */
+extern void bind_test(struct lua_State * L);
+
 /* server only */
 extern void bind_script(struct lua_State * L);
 extern void bind_gamecode(struct lua_State * L);
--- a/src/eressea/lua/eressea.cpp
+++ b/src/eressea/lua/eressea.cpp
@ -116,8 +116,11 @@ lua_setstring(const char * lname, const char * key, const char * str)
+/* Look up the string for key in the locale named lname.
+ * Returns NULL without doing a lookup when key is NULL; otherwise the
+ * result of locale_getstring() is returned. */
 static const char *
 lua_getstring(const char * lname, const char * key)
 {
-  struct locale * lang = find_locale(lname);
-  return (const char*)locale_getstring(lang, key);
+  if (key) {
+    struct locale * lang = find_locale(lname);
+    return (const char*)locale_getstring(lang, key);
+  }
+  return NULL;
 }

 #define ISLANDSIZE 20
--- a/src/eressea/lua/test.cpp
+++ b/src/eressea/lua/test.cpp
@ -0,0 +1,49 @@
+#include <config.h>
+#include <kernel/eressea.h>
+
+#include "bindings.h"
+#include "list.h"
+
+// Lua includes
+#ifdef _MSC_VER
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif
+#include <lua.hpp>
+#include <luabind/luabind.hpp>
+#include <luabind/iterator_policy.hpp>
+#ifdef _MSC_VER
+#pragma warning (pop)
+#endif
+
+using namespace luabind;
+
+#include <util/language.h>
+#include <kernel/skill.h>
+
+// Test helper: parse locstring as a skill name in the locale named loc.
+// Returns the entry of skillnames[] for the skill that findskill() reports,
+// or 0 if locstring is NULL or no skill matches.
+// NOTE(review): the result of find_locale() is passed on unchecked; confirm
+// what it returns for an unknown locale name.
+static const char *
+loc_getskill(const char * loc, const char * locstring)
+{
+  // findskill() reads the string unconditionally, so reject NULL here
+  if (locstring==0) return 0;
+  struct locale * lang = find_locale(loc);
+  skill_t result = findskill(locstring, lang);
+  if (result==NOSKILL) return 0;
+  return skillnames[result];
+}
+
+// Test helper: parse locstring as a keyword in the locale named loc.
+// Returns the entry of keywords[] for the keyword that findkeyword()
+// reports, or 0 if locstring is NULL or no keyword matches.
+// NOTE(review): the result of find_locale() is passed on unchecked; confirm
+// what it returns for an unknown locale name.
+static const char *
+loc_getkeyword(const char * loc, const char * locstring)
+{
+  // same guard as in loc_getskill: never hand a NULL string to the parser
+  if (locstring==0) return 0;
+  struct locale * lang = find_locale(loc);
+  keyword_t result = findkeyword(locstring, lang);
+  if (result==NOKEYWORD) return 0;
+  return keywords[result];
+}
+
+// Register the test helpers with the Lua state L as module "test":
+// test.loc_skill(locale, text) and test.loc_keyword(locale, text).
+void
+bind_test(lua_State * L)
+{
+  module(L, "test")[
+    def("loc_skill", &loc_getskill),
+    def("loc_keyword", &loc_getkeyword)
+  ];
+}
--- a/src/eressea/server.cpp
+++ b/src/eressea/server.cpp
@ -19,11 +19,6 @@
 * permission from the authors.
 */

-#define LOCALE_CHECK
-#ifdef __LCC__
-#undef LOCALE_CHECK
-#endif
-
 /* config includes */
 #include <config.h>
 #include <kernel/eressea.h>
@ -312,7 +307,9 @@ lua_init(void)
  bind_event(L);
  bind_message(L);
  bind_gamecode(L);
+
  bind_gmtool(L);
+  bind_test(L);
  return L;
 }

@ -662,12 +659,6 @@ main(int argc, char *argv[])
  lc_numeric = setlocale(LC_NUMERIC, "C");
  if (lc_ctype) lc_ctype = strdup(lc_ctype);
  if (lc_numeric) lc_numeric = strdup(lc_numeric);
-#ifdef LOCALE_CHECK
-  if (!locale_check()) {
-    log_error(("The current locale is not suitable for international Eressea.\n"));
-    return -1;
-  }
-#endif
  
  lua_State * luaState = lua_init();
  global.vm_state = luaState;
--- a/src/scripts/run-tests.lua
+++ b/src/scripts/run-tests.lua
@ -0,0 +1,13 @@
+-- -*- coding: utf-8 -*-
+
+-- Feed a list of full, abbreviated and transcribed German skill names to
+-- test.loc_skill and print, per input: the input, the skill that was found
+-- (or "nil") and the string registered for "skill::<result>" in locale "de".
+-- Always returns 0.
+function test_locales()
+	local skills = { "", "herb", "kraut", "Kräute", "Kraeut", "k", "kra", "MAGIE" }
+	-- ipairs walks the list in index order; str is local so it does not leak
+	-- into the global environment
+	for _, input in ipairs(skills) do
+		local str = test.loc_skill("de", input)
+		io.stdout:write(input, "\t", tostring(str), "  ", tostring(get_string("de", "skill::" .. tostring(str))), "\n")
+	end
+	return 0
+end
+
+-- Run the locale test, then read one line from stdin before the script ends
+-- (presumably to keep the console window open - confirm).
+test_locales()
+io.stdin:read("*line")