server/src/common/util/parser.c

192 lines
4 KiB
C

#include <config.h>
#include "parser.h"
#include "unicode.h"
#include "log.h"
#include <assert.h>
#include <wctype.h>
#include <memory.h>
#define SPACE_REPLACEMENT '~'
#define ESCAPE_CHAR '\\'
#define MAXTOKENSIZE 8192
/*
 * One entry of the parser's state stack. Each entry describes a string that
 * is currently being split into tokens.
 */
typedef struct parser_state {
/* Read position inside the string being tokenized; parser_end(),
 * skip_token() and getstrtoken() advance it. */
const xmlChar *current_token;
/* Buffer handed to init_tokens_str(); released with free() when the state
 * is re-initialised or popped. May be NULL. */
xmlChar * current_cmd;
/* State that was active when parser_pushstate() created this one;
 * parser_popstate() makes it current again. */
struct parser_state * next;
} parser_state;
/* Top of the state stack. NULL until init_tokens_str() or
 * parser_pushstate() allocates the first entry. */
static parser_state * state;
/*
 * Move *str forward past any leading whitespace (ASCII or multi-byte UTF-8).
 *
 * Returns 0 on success. If a multi-byte sequence cannot be decoded, a
 * warning is logged, *str is left pointing at the offending byte and the
 * decoder's error code is returned.
 */
static int
eatwhitespace_c(const xmlChar ** str)
{
  const xmlChar * pos = *str;
  int result = 0;

  for (;;) {
    xmlChar lead = *pos;
    wint_t wc;
    size_t nbytes = 1;

    if (lead > 0x7F) {
      /* multi-byte sequence: decode it to find out whether it is a space */
      result = unicode_utf8_to_ucs4(&wc, pos, &nbytes);
      if (result != 0) {
        log_warning(("illegal character sequence in UTF8 string: %s\n", pos));
        break;
      }
    } else {
      wc = lead;
    }

    /* the terminating NUL is not whitespace, so the scan stops there too */
    if (!iswspace(wc)) break;
    pos += nbytes;
  }

  *str = pos;
  return result;
}
/*
 * Point the current parser state at a new string to tokenize.
 *
 * initstr: string that subsequent token calls read from.
 * cmd:     buffer whose ownership passes to the parser; it is released with
 *          free() the next time the state is re-initialised or popped.
 *          May be NULL.
 *
 * If no state exists yet, one is allocated and becomes the bottom of the
 * state stack.
 */
void
init_tokens_str(const xmlChar * initstr, xmlChar * cmd)
{
  if (state == NULL) {
    state = malloc(sizeof(parser_state));
    assert(state != NULL);
    /* bottom of the stack: without this, parser_popstate() would install
     * an indeterminate pointer as the current state */
    state->next = NULL;
  } else {
    /* drop the command buffer of the previous string; free(NULL) is a no-op */
    free(state->current_cmd);
  }
  state->current_cmd = cmd;
  state->current_token = (const xmlChar *)initstr;
}
/*
 * Push a fresh, empty parser state onto the stack. The previously active
 * state is kept and becomes current again after parser_popstate().
 */
void
parser_pushstate(void)
{
  parser_state * top = malloc(sizeof(parser_state));

  /* link to the state being suspended, then start out with no input */
  top->next = state;
  top->current_token = NULL;
  top->current_cmd = NULL;

  state = top;
}
void
parser_popstate(void)
{
parser_state * new_state = state->next;
if (state->current_cmd!=NULL) free(state->current_cmd);
free(state);
state = new_state;
}
boolean
parser_end(void)
{
eatwhitespace_c(&state->current_token);
return *state->current_token == 0;
}
void
skip_token(void)
{
char quotechar = 0;
eatwhitespace_c(&state->current_token);
while (*state->current_token) {
wint_t ucs;
size_t len;
xmlChar utf8_character = state->current_token[0];
if (utf8_character <= 0x7F) {
ucs = utf8_character;
++state->current_token;
} else {
int ret = unicode_utf8_to_ucs4(&ucs, state->current_token, &len);
if (ret==0) {
state->current_token+=len;
} else {
log_warning(("illegal character sequence in UTF8 string: %s\n", state->current_token));
}
}
if (iswspace(ucs) && quotechar==0) {
return;
} else {
switch(utf8_character) {
case '"':
case '\'':
if (utf8_character==quotechar) return;
quotechar = utf8_character;
break;
case ESCAPE_CHAR:
++state->current_token;
break;
}
}
}
}
/*
 * Extract the next token from *str and advance *str past it.
 *
 * Rules:
 *  - leading whitespace is skipped;
 *  - an unquoted token ends at the first whitespace character;
 *  - a token may contain a section in single or double quotes, inside which
 *    whitespace and the other quote character are kept; the matching closing
 *    quote ends the token;
 *  - SPACE_REPLACEMENT ('~') is converted to a blank;
 *  - ESCAPE_CHAR makes the single following character literal.
 *
 * Parsing stops early at an undecodable UTF-8 sequence (a warning is logged)
 * or when the internal buffer is full; in both cases *str is left at the
 * first character that was not consumed.
 *
 * Returns a pointer to a static, NUL-terminated buffer of MAXTOKENSIZE bytes
 * that is overwritten by the next call.
 */
const xmlChar *
parse_token(const xmlChar ** str)
{
  static xmlChar lbuf[MAXTOKENSIZE];
  xmlChar * cursor = lbuf;
  char quotechar = 0;
  boolean escape = false;
  const xmlChar * ctoken = *str;

  assert(ctoken);

  eatwhitespace_c(&ctoken);
  while (*ctoken && cursor-lbuf < MAXTOKENSIZE-1) {
    wint_t ucs;
    size_t len;
    size_t room = (size_t)((lbuf + MAXTOKENSIZE - 1) - cursor);
    boolean copy = false;
    xmlChar utf8_character = *ctoken;

    if (utf8_character <= 0x7F) {
      ucs = utf8_character;
      len = 1;
    } else {
      int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
      if (ret!=0) {
        log_warning(("illegal character sequence in UTF8 string: %s\n", ctoken));
        break;
      }
    }

    /* a multi-byte character must fit completely, leaving one byte for the
     * terminator; otherwise truncate on a character boundary */
    if (len > room) break;

    if (escape) {
      copy = true;
      /* the escape applies to this one character only */
      escape = false;
    } else if (iswspace(ucs)) {
      if (quotechar==0) break;
      copy = true;
    } else if (utf8_character=='"' || utf8_character=='\'') {
      if (utf8_character==quotechar) {
        /* closing quote: consume it and finish the token */
        ++ctoken;
        break;
      } else if (quotechar==0) {
        /* opening quote: remember it, do not copy it */
        quotechar = utf8_character;
        ++ctoken;
      } else {
        /* the other kind of quote inside a quoted section is plain text */
        *cursor++ = *ctoken++;
      }
    } else if (utf8_character==SPACE_REPLACEMENT) {
      *cursor++ = ' ';
      ++ctoken;
    } else if (utf8_character==ESCAPE_CHAR) {
      escape = true;
      ++ctoken;
    } else {
      copy = true;
    }

    if (copy) {
      memcpy(cursor, ctoken, len);
      cursor+=len;
      ctoken+=len;
    }
  }

  *cursor = '\0';
  *str = ctoken;
  return lbuf;
}
/*
 * Return the next token of the current parser state and advance its read
 * position. The result lives in parse_token()'s static buffer.
 */
const xmlChar *
getstrtoken(void)
{
  const xmlChar ** input = &state->current_token;

  return parse_token(input);
}