2001-01-25 10:37:55 +01:00
|
|
|
|
/* vi: set ts=2:
|
2007-06-26 11:32:28 +02:00
|
|
|
|
*
|
|
|
|
|
*
|
|
|
|
|
* Eressea PB(E)M host Copyright (C) 1998-2003
|
|
|
|
|
* Christian Schlittchen (corwin@amber.kn-bremen.de)
|
|
|
|
|
* Katja Zedel (katze@felidae.kn-bremen.de)
|
|
|
|
|
* Henning Peters (faroul@beyond.kn-bremen.de)
|
|
|
|
|
* Enno Rehling (enno@eressea-pbem.de)
|
|
|
|
|
* Ingo Wilken (Ingo.Wilken@informatik.uni-oldenburg.de)
|
|
|
|
|
*
|
|
|
|
|
* based on:
|
|
|
|
|
*
|
|
|
|
|
* Atlantis v1.0 13 September 1993 Copyright 1993 by Russell Wallace
|
|
|
|
|
* Atlantis v1.7 Copyright 1996 by Alex Schröder
|
|
|
|
|
*
|
|
|
|
|
* This program may not be used, modified or distributed without
|
|
|
|
|
* prior permission by the authors of Eressea.
|
|
|
|
|
* This program may not be sold or used commercially without prior written
|
|
|
|
|
* permission from the authors.
|
|
|
|
|
*/
|
2001-01-25 10:37:55 +01:00
|
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
#include "umlaut.h"
|
|
|
|
|
|
2007-06-26 11:32:28 +02:00
|
|
|
|
#include "log.h"
|
|
|
|
|
#include "unicode.h"
|
|
|
|
|
|
2007-06-26 11:51:18 +02:00
|
|
|
|
#include <wctype.h>
|
2001-01-25 10:37:55 +01:00
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
|
2001-02-15 03:41:47 +01:00
|
|
|
|
/* One labelled edge of the token tree. Edges whose code points fall into
 * the same hash bucket of a node are chained through `nexthash`. */
typedef struct tref {
  struct tref *nexthash; /* next edge in the same bucket chain, or NULL */
  wint_t ucs;            /* code point that selects this edge */
  struct tnode *node;    /* node reached by following this edge */
} tref;
|
|
|
|
|
|
|
|
|
|
#define LEAF 1 /* leaf node for a word. always matches */
|
|
|
|
|
#define SHARED 2 /* at least two words share the node */
|
|
|
|
|
|
2001-01-25 10:37:55 +01:00
|
|
|
|
void
|
2007-08-10 09:03:23 +02:00
|
|
|
|
addtoken(tnode * root, const char * str, variant id)
|
2001-01-25 10:37:55 +01:00
|
|
|
|
{
|
2007-06-26 11:32:28 +02:00
|
|
|
|
static struct replace {
|
|
|
|
|
wint_t ucs;
|
|
|
|
|
const char str[3];
|
|
|
|
|
} replace[] = {
|
|
|
|
|
/* match lower-case (!) umlauts and others to transcriptions */
|
|
|
|
|
{ 228, "AE"},
|
|
|
|
|
{ 246, "OE"},
|
|
|
|
|
{ 252, "UE"},
|
|
|
|
|
{ 223, "SS"},
|
|
|
|
|
{ 230, "AE"},
|
|
|
|
|
{ 248, "OE"},
|
|
|
|
|
{ 229, "AA"},
|
2007-06-26 11:51:18 +02:00
|
|
|
|
{ 0, "" }
|
2007-06-26 11:32:28 +02:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if (!*str) {
|
|
|
|
|
root->id = id;
|
|
|
|
|
root->flags |= LEAF;
|
|
|
|
|
} else {
|
|
|
|
|
tref * next;
|
|
|
|
|
int ret, index, i = 0;
|
|
|
|
|
wint_t ucs, lcs;
|
|
|
|
|
size_t len;
|
|
|
|
|
|
|
|
|
|
ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
|
|
|
|
assert(ret==0 || !"invalid utf8 string");
|
|
|
|
|
lcs = ucs;
|
|
|
|
|
|
2006-01-25 21:40:19 +01:00
|
|
|
|
#if NODEHASHSIZE == 8
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = ucs & 7;
|
2006-01-24 22:44:20 +01:00
|
|
|
|
#else
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = ucs % NODEHASHSIZE;
|
2006-01-24 22:44:20 +01:00
|
|
|
|
#endif
|
2007-06-26 11:32:28 +02:00
|
|
|
|
next = root->next[index];
|
|
|
|
|
if (!(root->flags & LEAF)) root->id = id;
|
|
|
|
|
while (next && next->ucs != ucs) next = next->nexthash;
|
|
|
|
|
if (!next) {
|
|
|
|
|
tref * ref;
|
|
|
|
|
tnode * node = calloc(1, sizeof(tnode));
|
|
|
|
|
|
|
|
|
|
if (ucs<'a' || ucs>'z') {
|
|
|
|
|
lcs = towlower(ucs);
|
|
|
|
|
}
|
|
|
|
|
if (ucs==lcs) {
|
|
|
|
|
ucs = towupper(ucs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ref = malloc(sizeof(tref));
|
|
|
|
|
ref->ucs = ucs;
|
|
|
|
|
ref->node = node;
|
|
|
|
|
ref->nexthash=root->next[index];
|
|
|
|
|
root->next[index] = ref;
|
2001-02-15 03:41:47 +01:00
|
|
|
|
|
2007-06-26 11:32:28 +02:00
|
|
|
|
/* try lower/upper casing the character, and try again */
|
|
|
|
|
if (ucs!=lcs) {
|
2006-01-25 21:40:19 +01:00
|
|
|
|
#if NODEHASHSIZE == 8
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = lcs & 7;
|
2006-01-25 21:40:19 +01:00
|
|
|
|
#else
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = lcs % NODEHASHSIZE;
|
2006-01-25 21:40:19 +01:00
|
|
|
|
#endif
|
2007-06-26 11:32:28 +02:00
|
|
|
|
ref = malloc(sizeof(tref));
|
|
|
|
|
ref->ucs = lcs;
|
|
|
|
|
ref->node = node;
|
|
|
|
|
ref->nexthash = root->next[index];
|
|
|
|
|
root->next[index] = ref;
|
|
|
|
|
}
|
|
|
|
|
next=ref;
|
|
|
|
|
} else {
|
|
|
|
|
next->node->flags |= SHARED;
|
|
|
|
|
if ((next->node->flags & LEAF) == 0) next->node->id.v = NULL; /* why?*/
|
|
|
|
|
}
|
|
|
|
|
addtoken(next->node, str+len, id);
|
|
|
|
|
while (replace[i].str[0]) {
|
|
|
|
|
if (lcs==replace[i].ucs) {
|
2007-08-10 09:03:23 +02:00
|
|
|
|
char zText[1024];
|
2007-06-26 11:32:28 +02:00
|
|
|
|
memcpy(zText, replace[i].str, 3);
|
2007-08-10 09:03:23 +02:00
|
|
|
|
strcpy(zText+2, (const char*)str+len);
|
2007-06-26 11:32:28 +02:00
|
|
|
|
addtoken(root, zText, id);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
++i;
|
|
|
|
|
}
|
|
|
|
|
}
|
2001-01-25 10:37:55 +01:00
|
|
|
|
}
|
|
|
|
|
|
2001-02-15 03:41:47 +01:00
|
|
|
|
int
|
2007-08-10 09:03:23 +02:00
|
|
|
|
findtoken(const tnode * tk, const char * str, variant* result)
|
2001-01-25 10:37:55 +01:00
|
|
|
|
{
|
2007-06-26 11:32:28 +02:00
|
|
|
|
if (!str || *str==0) return E_TOK_NOMATCH;
|
2001-01-25 10:37:55 +01:00
|
|
|
|
|
2007-06-26 11:32:28 +02:00
|
|
|
|
do {
|
|
|
|
|
int index;
|
|
|
|
|
const tref * ref;
|
|
|
|
|
wint_t ucs;
|
|
|
|
|
size_t len;
|
|
|
|
|
int ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
2001-02-15 03:41:47 +01:00
|
|
|
|
|
2007-06-26 11:32:28 +02:00
|
|
|
|
if (ret!=0) {
|
|
|
|
|
/* encoding is broken. youch */
|
|
|
|
|
return E_TOK_NOMATCH;
|
|
|
|
|
}
|
2006-01-25 21:40:19 +01:00
|
|
|
|
#if NODEHASHSIZE == 8
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = ucs & 7;
|
2006-01-24 22:44:20 +01:00
|
|
|
|
#else
|
2007-06-26 11:32:28 +02:00
|
|
|
|
index = ucs % NODEHASHSIZE;
|
2006-01-24 22:44:20 +01:00
|
|
|
|
#endif
|
2007-06-26 11:32:28 +02:00
|
|
|
|
ref = tk->next[index];
|
|
|
|
|
while (ref && ref->ucs!=ucs) ref = ref->nexthash;
|
|
|
|
|
str+=len;
|
|
|
|
|
if (!ref) return E_TOK_NOMATCH;
|
|
|
|
|
tk = ref->node;
|
|
|
|
|
} while (*str);
|
|
|
|
|
if (tk) {
|
|
|
|
|
*result = tk->id;
|
|
|
|
|
return E_TOK_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
return E_TOK_NOMATCH;
|
2001-01-25 10:37:55 +01:00
|
|
|
|
}
|