2010-08-08 10:06:34 +02:00
|
|
|
#include <platform.h>
#include "parser.h"
#include "unicode.h"
#include "log.h"

#include <assert.h>
#include <stdlib.h>
#include <wctype.h>
#include <memory.h>
|
|
|
|
|
|
|
|
/* Character that parse_token() writes into the token as a plain space. */
#define SPACE_REPLACEMENT '~'
|
|
|
|
/* Character that makes the tokenizer take the following character literally. */
#define ESCAPE_CHAR '\\'
|
|
|
|
/* Size in bytes of the static result buffer in parse_token(), terminating NUL included. */
#define MAXTOKENSIZE 8192
|
|
|
|
|
|
|
|
/*
 * One entry of the parser state stack. The file-scope pointer `state`
 * refers to the entry currently in use; older entries are reachable
 * through `next`.
 */
typedef struct parser_state {
    /* Read position inside the text that is being split into tokens. */
    const char *current_token;
    /* Buffer handed to init_tokens_str(); this module passes it to free()
     * when the state is re-initialised or popped. May be NULL. */
    char *current_cmd;
    /* The state that becomes current again after parser_popstate(). */
    struct parser_state *next;
} parser_state;
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
/* Parser state currently in use (top of the stack); NULL until
 * init_tokens_str() or parser_pushstate() allocates one. */
static parser_state *state;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
static int eatwhitespace_c(const char **str_p)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
ucs4_t ucs;
|
|
|
|
size_t len;
|
2011-03-07 08:02:35 +01:00
|
|
|
const char *str = *str_p;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
|
|
|
/* skip over potential whitespace */
|
|
|
|
for (;;) {
|
|
|
|
unsigned char utf8_character = (unsigned char)*str;
|
|
|
|
if (~utf8_character & 0x80) {
|
2011-03-07 08:02:35 +01:00
|
|
|
if (!iswxspace(utf8_character))
|
|
|
|
break;
|
2010-08-08 10:06:34 +02:00
|
|
|
++str;
|
|
|
|
} else {
|
|
|
|
ret = unicode_utf8_to_ucs4(&ucs, str, &len);
|
2011-03-07 08:02:35 +01:00
|
|
|
if (ret != 0) {
|
2012-05-17 00:52:37 +02:00
|
|
|
log_warning("illegal character sequence in UTF8 string: %s\n", str);
|
2010-08-08 10:06:34 +02:00
|
|
|
break;
|
|
|
|
}
|
2011-03-07 08:02:35 +01:00
|
|
|
if (!iswxspace((wint_t) ucs))
|
|
|
|
break;
|
|
|
|
str += len;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
*str_p = str;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void init_tokens_str(const char *initstr, char *cmd)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2011-03-07 08:02:35 +01:00
|
|
|
if (state == NULL) {
|
2010-08-08 10:06:34 +02:00
|
|
|
state = malloc(sizeof(parser_state));
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (state->current_cmd)
|
|
|
|
free(state->current_cmd);
|
2010-08-08 10:06:34 +02:00
|
|
|
state->current_cmd = cmd;
|
|
|
|
state->current_token = initstr;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void parser_pushstate(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2011-03-07 08:02:35 +01:00
|
|
|
parser_state *new_state = malloc(sizeof(parser_state));
|
2010-08-08 10:06:34 +02:00
|
|
|
new_state->current_cmd = NULL;
|
|
|
|
new_state->current_token = NULL;
|
|
|
|
new_state->next = state;
|
|
|
|
state = new_state;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void parser_popstate(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2011-03-07 08:02:35 +01:00
|
|
|
parser_state *new_state = state->next;
|
|
|
|
if (state->current_cmd != NULL)
|
|
|
|
free(state->current_cmd);
|
2010-08-08 10:06:34 +02:00
|
|
|
free(state);
|
|
|
|
state = new_state;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
boolean parser_end(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
|
|
|
eatwhitespace_c(&state->current_token);
|
|
|
|
return *state->current_token == 0;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void skip_token(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
|
|
|
char quotechar = 0;
|
|
|
|
eatwhitespace_c(&state->current_token);
|
|
|
|
|
|
|
|
while (*state->current_token) {
|
|
|
|
ucs4_t ucs;
|
|
|
|
size_t len;
|
|
|
|
|
|
|
|
unsigned char utf8_character = (unsigned char)state->current_token[0];
|
|
|
|
if (~utf8_character & 0x80) {
|
|
|
|
ucs = utf8_character;
|
|
|
|
++state->current_token;
|
|
|
|
} else {
|
|
|
|
int ret = unicode_utf8_to_ucs4(&ucs, state->current_token, &len);
|
2011-03-07 08:02:35 +01:00
|
|
|
if (ret == 0) {
|
|
|
|
state->current_token += len;
|
2010-08-08 10:06:34 +02:00
|
|
|
} else {
|
2012-05-17 00:52:37 +02:00
|
|
|
log_warning("illegal character sequence in UTF8 string: %s\n", state->current_token);
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
}
|
2011-03-07 08:02:35 +01:00
|
|
|
if (iswxspace((wint_t) ucs) && quotechar == 0) {
|
2010-08-08 10:06:34 +02:00
|
|
|
return;
|
|
|
|
} else {
|
2011-03-07 08:02:35 +01:00
|
|
|
switch (utf8_character) {
|
2010-08-08 10:06:34 +02:00
|
|
|
case '"':
|
|
|
|
case '\'':
|
2011-03-07 08:02:35 +01:00
|
|
|
if (utf8_character == quotechar)
|
|
|
|
return;
|
2010-08-08 10:06:34 +02:00
|
|
|
quotechar = utf8_character;
|
|
|
|
break;
|
|
|
|
case ESCAPE_CHAR:
|
|
|
|
++state->current_token;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
const char *parse_token(const char **str)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2011-03-07 08:02:35 +01:00
|
|
|
static char lbuf[MAXTOKENSIZE]; /* STATIC_RESULT: used for return, not across calls */
|
|
|
|
char *cursor = lbuf;
|
2010-08-08 10:06:34 +02:00
|
|
|
char quotechar = 0;
|
|
|
|
boolean escape = false;
|
2011-03-07 08:02:35 +01:00
|
|
|
const char *ctoken = *str;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
|
|
|
assert(ctoken);
|
|
|
|
|
|
|
|
eatwhitespace_c(&ctoken);
|
2011-03-07 08:02:35 +01:00
|
|
|
while (*ctoken && cursor - lbuf < MAXTOKENSIZE - 1) {
|
2010-08-08 10:06:34 +02:00
|
|
|
ucs4_t ucs;
|
|
|
|
size_t len;
|
|
|
|
boolean copy = false;
|
|
|
|
|
|
|
|
unsigned char utf8_character = *(unsigned char *)ctoken;
|
|
|
|
if (~utf8_character & 0x80) {
|
|
|
|
ucs = utf8_character;
|
|
|
|
len = 1;
|
|
|
|
} else {
|
|
|
|
int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
|
2011-03-07 08:02:35 +01:00
|
|
|
if (ret != 0) {
|
2012-05-17 00:52:37 +02:00
|
|
|
log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
|
2010-08-08 10:06:34 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (escape) {
|
|
|
|
copy = true;
|
|
|
|
escape = false;
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (iswxspace((wint_t) ucs)) {
|
|
|
|
if (quotechar == 0)
|
|
|
|
break;
|
2010-08-08 10:06:34 +02:00
|
|
|
copy = true;
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (utf8_character == '"' || utf8_character == '\'') {
|
|
|
|
if (utf8_character == quotechar) {
|
2010-08-08 10:06:34 +02:00
|
|
|
++ctoken;
|
|
|
|
break;
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (quotechar == 0) {
|
2010-08-08 10:06:34 +02:00
|
|
|
quotechar = utf8_character;
|
|
|
|
++ctoken;
|
|
|
|
} else {
|
|
|
|
*cursor++ = *ctoken++;
|
|
|
|
}
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (utf8_character == SPACE_REPLACEMENT) {
|
2010-08-08 10:06:34 +02:00
|
|
|
*cursor++ = ' ';
|
|
|
|
++ctoken;
|
2011-03-07 08:02:35 +01:00
|
|
|
} else if (utf8_character == ESCAPE_CHAR) {
|
2010-08-08 10:06:34 +02:00
|
|
|
escape = true;
|
|
|
|
++ctoken;
|
|
|
|
} else {
|
|
|
|
copy = true;
|
|
|
|
}
|
|
|
|
if (copy) {
|
|
|
|
memcpy(cursor, ctoken, len);
|
2011-03-07 08:02:35 +01:00
|
|
|
cursor += len;
|
|
|
|
ctoken += len;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*cursor = '\0';
|
|
|
|
*str = ctoken;
|
|
|
|
return lbuf;
|
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
const char *getstrtoken(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
|
|
|
return parse_token((const char **)&state->current_token);
|
|
|
|
}
|