server/src/util/parser.c

256 lines
5.5 KiB
C
Raw Normal View History

2010-08-08 10:06:34 +02:00
#include <platform.h>
#include "parser.h"
#include "unicode.h"
#include "base36.h"
#include "log.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <wctype.h>
#include <memory.h>
/* Character that parse_token() replaces with a literal space in the token. */
#define SPACE_REPLACEMENT '~'
/* Character that makes parse_token() copy the following character verbatim. */
#define ESCAPE_CHAR '\\'
/* Size in bytes of the fixed token buffers used by the string-returning helpers. */
#define MAXTOKENSIZE 8192
/*
 * One entry of the parser state stack.
 *
 * parser_pushstate() links a new entry in front of the current one and
 * parser_popstate() removes it again.
 */
typedef struct parser_state {
    const char *current_token;  /* read position inside the string being tokenized */
    struct parser_state *next;  /* entry that was on top before this one was pushed */
} parser_state;

/* Top of the parser state stack; NULL until the first state is created. */
static parser_state *states;
2010-08-08 10:06:34 +02:00
2011-03-07 08:02:35 +01:00
static int eatwhitespace_c(const char **str_p)
2010-08-08 10:06:34 +02:00
{
int ret = 0;
ucs4_t ucs;
size_t len;
2011-03-07 08:02:35 +01:00
const char *str = *str_p;
2010-08-08 10:06:34 +02:00
/* skip over potential whitespace */
for (;;) {
unsigned char utf8_character = (unsigned char)*str;
if (~utf8_character & 0x80) {
2011-03-07 08:02:35 +01:00
if (!iswxspace(utf8_character))
break;
2010-08-08 10:06:34 +02:00
++str;
} else {
ret = unicode_utf8_to_ucs4(&ucs, str, &len);
2011-03-07 08:02:35 +01:00
if (ret != 0) {
log_warning("illegal character sequence in UTF8 string: %s\n", str);
2010-08-08 10:06:34 +02:00
break;
}
2011-03-07 08:02:35 +01:00
if (!iswxspace((wint_t) ucs))
break;
str += len;
2010-08-08 10:06:34 +02:00
}
}
*str_p = str;
return ret;
}
void init_tokens_str(const char *initstr)
2010-08-08 10:06:34 +02:00
{
if (states == NULL) {
states = malloc(sizeof(parser_state));
}
states->current_token = initstr;
2010-08-08 10:06:34 +02:00
}
2011-03-07 08:02:35 +01:00
void parser_pushstate(void)
2010-08-08 10:06:34 +02:00
{
2011-03-07 08:02:35 +01:00
parser_state *new_state = malloc(sizeof(parser_state));
2010-08-08 10:06:34 +02:00
new_state->current_token = NULL;
new_state->next = states;
states = new_state;
2010-08-08 10:06:34 +02:00
}
2011-03-07 08:02:35 +01:00
void parser_popstate(void)
2010-08-08 10:06:34 +02:00
{
parser_state *new_state = states->next;
free(states);
states = new_state;
2010-08-08 10:06:34 +02:00
}
bool parser_end(void)
2010-08-08 10:06:34 +02:00
{
if (states->current_token) {
eatwhitespace_c(&states->current_token);
return *states->current_token == 0;
}
return true;
2010-08-08 10:06:34 +02:00
}
2011-03-07 08:02:35 +01:00
void skip_token(void)
2010-08-08 10:06:34 +02:00
{
char quotechar = 0;
eatwhitespace_c(&states->current_token);
2010-08-08 10:06:34 +02:00
while (*states->current_token) {
2010-08-08 10:06:34 +02:00
ucs4_t ucs;
size_t len;
unsigned char utf8_character = (unsigned char)states->current_token[0];
2010-08-08 10:06:34 +02:00
if (~utf8_character & 0x80) {
ucs = utf8_character;
++states->current_token;
2010-08-08 10:06:34 +02:00
} else {
int ret = unicode_utf8_to_ucs4(&ucs, states->current_token, &len);
2011-03-07 08:02:35 +01:00
if (ret == 0) {
states->current_token += len;
2010-08-08 10:06:34 +02:00
} else {
log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
2010-08-08 10:06:34 +02:00
}
}
2011-03-07 08:02:35 +01:00
if (iswxspace((wint_t) ucs) && quotechar == 0) {
2010-08-08 10:06:34 +02:00
return;
} else {
2011-03-07 08:02:35 +01:00
switch (utf8_character) {
2010-08-08 10:06:34 +02:00
case '"':
case '\'':
2011-03-07 08:02:35 +01:00
if (utf8_character == quotechar)
return;
2010-08-08 10:06:34 +02:00
quotechar = utf8_character;
break;
case ESCAPE_CHAR:
++states->current_token;
2010-08-08 10:06:34 +02:00
break;
}
}
}
}
char *parse_token(const char **str, char *lbuf, size_t len)
2010-08-08 10:06:34 +02:00
{
char *cursor = lbuf;
char quotechar = 0;
bool escape = false;
const char *ctoken = *str;
2010-08-08 10:06:34 +02:00
if (!ctoken) {
return 0;
2010-08-08 10:06:34 +02:00
}
eatwhitespace_c(&ctoken);
if (!*ctoken) {
if (len > 0) {
*cursor = 0;
}
return 0;
2010-08-08 10:06:34 +02:00
}
while (*ctoken && cursor-len < lbuf-1) {
ucs4_t ucs;
size_t len;
bool copy = false;
unsigned char utf8_character = *(unsigned char *)ctoken;
if (~utf8_character & 0x80) {
ucs = utf8_character;
len = 1;
}
else {
int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
if (ret != 0) {
log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
break;
}
}
if (escape) {
copy = true;
escape = false;
}
else if (iswxspace((wint_t)ucs)) {
if (quotechar == 0)
break;
copy = true;
}
else if (utf8_character == '"' || utf8_character == '\'') {
if (utf8_character == quotechar) {
++ctoken;
break;
}
else if (quotechar == 0) {
quotechar = utf8_character;
++ctoken;
}
else {
*cursor++ = *ctoken++;
}
}
else if (utf8_character == SPACE_REPLACEMENT) {
*cursor++ = ' ';
++ctoken;
}
else if (utf8_character == ESCAPE_CHAR) {
escape = true;
++ctoken;
}
else {
copy = true;
}
if (copy) {
memcpy(cursor, ctoken, len);
cursor += len;
ctoken += len;
}
2010-08-08 10:06:34 +02:00
}
assert(cursor - len < lbuf - 1); // TODO: handle too-small buffers
*cursor = '\0';
*str = ctoken;
return lbuf;
}
/*
 * Convenience wrapper around parse_token() that stores the token in an
 * internal static buffer of MAXTOKENSIZE bytes.
 *
 * The returned string is overwritten by the next call to this function.
 */
const char *parse_token_depr(const char **str)
{
    /* STATIC_RESULT: used for return, not across calls */
    static char token_buf[MAXTOKENSIZE];

    return parse_token(str, token_buf, sizeof token_buf);
}
2011-03-07 08:02:35 +01:00
const char *getstrtoken(void)
2010-08-08 10:06:34 +02:00
{
char lbuf[MAXTOKENSIZE];
return parse_token((const char **)&states->current_token, lbuf, MAXTOKENSIZE);
}
/*
 * Reads the next token of the current parser state into a caller-supplied
 * buffer of bufsize bytes and returns the result of parse_token().
 */
const char *gettoken(char *lbuf, size_t bufsize)
{
    const char **position = &states->current_token;

    return parse_token(position, lbuf, bufsize);
}
2014-12-18 07:09:22 +01:00
/*
 * Reads the next token and converts it to a decimal integer.
 *
 * Returns 0 when there is no token or it does not start with a number.
 * Values outside the range of int saturate at INT_MIN / INT_MAX instead of
 * invoking the undefined behaviour atoi() has for such input.
 */
int getint(void)
{
    char token[16];
    const char *s = gettoken(token, sizeof(token));
    long value;

    if (s == NULL) {
        return 0;
    }
    value = strtol(s, NULL, 10);
    if (value > INT_MAX) {
        return INT_MAX;
    }
    if (value < INT_MIN) {
        return INT_MIN;
    }
    return (int)value;
}
/*
 * Reads the next token as an integer via getint() and maps negative values
 * to 0.
 */
unsigned int getuint(void)
{
    int value = getint();

    if (value < 0) {
        return 0;
    }
    return (unsigned int)value;
}
2014-12-18 07:09:22 +01:00
/*
 * Reads the next token and converts it with atoi36().
 *
 * Returns 0 when there is no token and -1 when the conversion yields a
 * negative value; otherwise the converted value.
 */
int getid(void)
{
    char token[16];
    const char *text = gettoken(token, sizeof(token));
    int id;

    if (text == NULL) {
        return 0;
    }
    id = atoi36(text);
    return (id < 0) ? -1 : id;
}
/*
 * Converts a decimal string to a non-negative integer.
 *
 * s must not be NULL. Negative numbers and text that does not start with a
 * number yield 0. Values larger than INT_MAX saturate at INT_MAX rather than
 * triggering the undefined behaviour atoi() has for out-of-range input.
 */
unsigned int atoip(const char *s)
{
    long value;

    assert(s);
    value = strtol(s, NULL, 10);
    if (value < 0) {
        return 0;
    }
    if (value > INT_MAX) {
        return (unsigned int)INT_MAX;
    }
    return (unsigned int)value;
}