2018-09-29 18:13:32 +02:00
|
|
|
#ifdef _MSC_VER
#include <platform.h>
#endif

#include "parser.h"
#include "unicode.h"
#include "base36.h"
#include "log.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <wctype.h>
#include <memory.h>
|
|
|
|
|
|
|
|
/* parse_token() writes a blank to the output for every unescaped occurrence
 * of this character in a token. */
#define SPACE_REPLACEMENT '~'
|
|
|
|
/* Marks the following character as literal: parse_token() copies it without
 * interpreting it, skip_token() steps over it. */
#define ESCAPE_CHAR '\\'
|
|
|
|
/* Size in bytes of the shared token buffer pbuf. */
#define MAXTOKENSIZE 8192
|
|
|
|
|
2019-01-18 22:26:55 +01:00
|
|
|
/* One entry of the parser-state stack. */
typedef struct parse_state {
    /* read position inside the string being tokenized (NULL if none is set) */
    const char *current_token;
    /* state that becomes current again when this one is popped */
    struct parse_state *next;
    /* caller-supplied pointer handed to dtor */
    void *data;
    /* optional cleanup callback for data; NULL means nothing to clean up */
    void (*dtor)(void *);
} parse_state;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2019-01-18 22:26:55 +01:00
|
|
|
/* Top of the parser-state stack; NULL until init_tokens_ex() or
 * parser_pushstate() creates the first entry. */
static parse_state *states;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
/*
 * Advances *str_p past any leading whitespace (as classified by iswspace).
 *
 * Bytes without the high bit are treated as single characters; anything
 * else is decoded with unicode_utf8_decode(). If decoding fails, a warning
 * is logged and scanning stops at the offending sequence.
 *
 * Returns 0 on success, otherwise the non-zero result of the failed
 * unicode_utf8_decode() call.
 */
static int eatwhitespace_c(const char **str_p)
{
    const char *pos = *str_p;
    int result = 0;

    for (;;) {
        unsigned char lead = (unsigned char)*pos;
        wint_t decoded;
        size_t nbytes;

        if ((lead & 0x80) == 0) {
            /* single-byte character, no decoding required */
            decoded = lead;
            nbytes = 1;
        }
        else {
            result = unicode_utf8_decode(&decoded, pos, &nbytes);
            if (result != 0) {
                log_warning("illegal character sequence in UTF8 string: %s\n", pos);
                break;
            }
        }
        if (!iswspace(decoded)) {
            break;
        }
        pos += nbytes;
    }
    *str_p = pos;
    return result;
}
|
|
|
|
|
2017-11-10 07:56:56 +01:00
|
|
|
void init_tokens_ex(const char *initstr, void *data, void (*dtor)(void *))
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
if (states == NULL) {
|
2019-01-18 22:26:55 +01:00
|
|
|
states = calloc(1, sizeof(parse_state));
|
2018-11-26 22:01:18 +01:00
|
|
|
if (!states) abort();
|
2015-01-30 20:37:14 +01:00
|
|
|
}
|
2017-11-10 07:56:56 +01:00
|
|
|
else if (states->dtor) {
|
|
|
|
states->dtor(states->data);
|
|
|
|
}
|
|
|
|
states->dtor = dtor;
|
|
|
|
states->data = data;
|
2015-01-30 20:37:14 +01:00
|
|
|
states->current_token = initstr;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2017-11-10 07:56:56 +01:00
|
|
|
/* Starts tokenizing initstr with no attached data and no destructor. */
void init_tokens_str(const char *initstr)
{
    void *no_data = NULL;

    init_tokens_ex(initstr, no_data, NULL);
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void parser_pushstate(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2019-01-18 22:26:55 +01:00
|
|
|
parse_state *new_state = calloc(1, sizeof(parse_state));
|
2018-11-26 22:01:18 +01:00
|
|
|
if (!new_state) abort();
|
2015-01-30 20:37:14 +01:00
|
|
|
new_state->current_token = NULL;
|
|
|
|
new_state->next = states;
|
|
|
|
states = new_state;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void parser_popstate(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2019-01-18 22:26:55 +01:00
|
|
|
parse_state *new_state = states->next;
|
2017-11-10 07:56:56 +01:00
|
|
|
if (states->dtor) {
|
|
|
|
states->dtor(states->data);
|
|
|
|
}
|
2015-01-30 20:37:14 +01:00
|
|
|
free(states);
|
|
|
|
states = new_state;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2012-06-24 07:41:07 +02:00
|
|
|
bool parser_end(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2014-08-23 09:17:58 +02:00
|
|
|
if (states->current_token) {
|
|
|
|
eatwhitespace_c(&states->current_token);
|
|
|
|
return *states->current_token == 0;
|
|
|
|
}
|
|
|
|
return true;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
void skip_token(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
char quotechar = 0;
|
|
|
|
eatwhitespace_c(&states->current_token);
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
while (*states->current_token) {
|
2019-08-01 18:40:42 +02:00
|
|
|
wint_t wc;
|
2015-01-30 20:37:14 +01:00
|
|
|
size_t len;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
unsigned char utf8_character = (unsigned char)states->current_token[0];
|
|
|
|
if (~utf8_character & 0x80) {
|
2019-08-01 18:40:42 +02:00
|
|
|
wc = utf8_character;
|
2015-01-30 20:37:14 +01:00
|
|
|
++states->current_token;
|
|
|
|
}
|
|
|
|
else {
|
2019-08-01 18:40:42 +02:00
|
|
|
int ret = unicode_utf8_decode(&wc, states->current_token, &len);
|
2015-01-30 20:37:14 +01:00
|
|
|
if (ret == 0) {
|
|
|
|
states->current_token += len;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
|
|
|
|
}
|
|
|
|
}
|
2019-08-01 18:40:42 +02:00
|
|
|
if (iswspace(wc) && quotechar == 0) {
|
2011-03-07 08:02:35 +01:00
|
|
|
return;
|
2015-01-30 20:37:14 +01:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
switch (utf8_character) {
|
|
|
|
case '"':
|
|
|
|
case '\'':
|
|
|
|
if (utf8_character == quotechar)
|
|
|
|
return;
|
|
|
|
quotechar = utf8_character;
|
|
|
|
break;
|
|
|
|
case ESCAPE_CHAR:
|
|
|
|
++states->current_token;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-24 15:55:55 +01:00
|
|
|
char *parse_token(const char **str, char *lbuf, size_t buflen)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2014-12-22 14:21:24 +01:00
|
|
|
char *cursor = lbuf;
|
|
|
|
char quotechar = 0;
|
|
|
|
bool escape = false;
|
2020-07-26 12:59:49 +02:00
|
|
|
const char *ctoken = *str, *cstart;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2014-12-22 14:21:24 +01:00
|
|
|
if (!ctoken) {
|
|
|
|
return 0;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
eatwhitespace_c(&ctoken);
|
|
|
|
if (!*ctoken) {
|
2014-12-24 15:55:55 +01:00
|
|
|
if (buflen > 0) {
|
2014-12-22 16:28:17 +01:00
|
|
|
*cursor = 0;
|
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
return 0;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2020-07-26 12:59:49 +02:00
|
|
|
cstart = ctoken;
|
2014-12-24 15:55:55 +01:00
|
|
|
while (*ctoken) {
|
2019-08-01 18:40:42 +02:00
|
|
|
wint_t wc;
|
2014-12-22 14:21:24 +01:00
|
|
|
size_t len;
|
|
|
|
bool copy = false;
|
|
|
|
|
|
|
|
unsigned char utf8_character = *(unsigned char *)ctoken;
|
|
|
|
if (~utf8_character & 0x80) {
|
2019-08-01 18:40:42 +02:00
|
|
|
wc = utf8_character;
|
2014-12-22 14:21:24 +01:00
|
|
|
len = 1;
|
|
|
|
}
|
|
|
|
else {
|
2019-08-01 18:40:42 +02:00
|
|
|
int ret = unicode_utf8_decode(&wc, ctoken, &len);
|
2014-12-22 14:21:24 +01:00
|
|
|
if (ret != 0) {
|
2020-08-09 20:56:43 +02:00
|
|
|
log_info("falling back to ISO-8859-1: %s\n", cstart);
|
|
|
|
if (cursor - buflen < lbuf - 2) {
|
|
|
|
size_t inlen = 1;
|
|
|
|
len = 2;
|
|
|
|
unicode_latin1_to_utf8(cursor, &len, ctoken, &inlen);
|
|
|
|
cursor += len;
|
|
|
|
ctoken += inlen;
|
|
|
|
continue;
|
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (escape) {
|
|
|
|
copy = true;
|
|
|
|
escape = false;
|
|
|
|
}
|
2019-08-01 18:40:42 +02:00
|
|
|
else if (iswspace(wc)) {
|
2014-12-22 14:21:24 +01:00
|
|
|
if (quotechar == 0)
|
|
|
|
break;
|
|
|
|
copy = true;
|
|
|
|
}
|
|
|
|
else if (utf8_character == '"' || utf8_character == '\'') {
|
|
|
|
if (utf8_character == quotechar) {
|
|
|
|
++ctoken;
|
|
|
|
break;
|
|
|
|
}
|
2020-07-26 12:59:49 +02:00
|
|
|
else if (quotechar == 0 && cstart == ctoken) {
|
2014-12-22 14:21:24 +01:00
|
|
|
quotechar = utf8_character;
|
|
|
|
++ctoken;
|
|
|
|
}
|
|
|
|
else {
|
2017-10-29 18:03:44 +01:00
|
|
|
if (cursor - buflen < lbuf - len) {
|
|
|
|
*cursor++ = *ctoken++;
|
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (utf8_character == SPACE_REPLACEMENT) {
|
2017-10-29 18:03:44 +01:00
|
|
|
if (cursor - buflen < lbuf - len) {
|
|
|
|
*cursor++ = ' ';
|
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
++ctoken;
|
|
|
|
}
|
|
|
|
else if (utf8_character == ESCAPE_CHAR) {
|
|
|
|
escape = true;
|
|
|
|
++ctoken;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
copy = true;
|
|
|
|
}
|
|
|
|
if (copy) {
|
2016-06-07 21:47:09 +02:00
|
|
|
if (cursor - buflen < lbuf - len) {
|
2014-12-24 15:55:55 +01:00
|
|
|
memcpy(cursor, ctoken, len);
|
|
|
|
cursor += len;
|
|
|
|
}
|
2014-12-22 14:21:24 +01:00
|
|
|
ctoken += len;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2014-12-22 14:21:24 +01:00
|
|
|
*cursor = '\0';
|
2020-03-14 11:57:29 +01:00
|
|
|
unicode_utf8_trim(lbuf);
|
2014-12-22 14:21:24 +01:00
|
|
|
*str = ctoken;
|
|
|
|
return lbuf;
|
|
|
|
}
|
|
|
|
|
2014-12-29 07:44:00 +01:00
|
|
|
/* Shared output buffer behind parse_token_depr() and getstrtoken(); every
 * call to either of them overwrites the token returned by the previous one. */
static char pbuf[MAXTOKENSIZE]; /* STATIC_RESULT: used for return, not across calls */
|
2014-12-22 14:21:24 +01:00
|
|
|
const char *parse_token_depr(const char **str)
|
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
return parse_token(str, pbuf, MAXTOKENSIZE);
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2019-04-24 12:55:41 +02:00
|
|
|
char *getstrtoken(void)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2014-12-29 07:44:00 +01:00
|
|
|
return parse_token((const char **)&states->current_token, pbuf, MAXTOKENSIZE);
|
2014-12-22 14:21:24 +01:00
|
|
|
}
|
|
|
|
|
2019-04-24 12:55:41 +02:00
|
|
|
char *gettoken(char *lbuf, size_t bufsize)
|
2014-12-22 14:21:24 +01:00
|
|
|
{
|
|
|
|
return parse_token((const char **)&states->current_token, lbuf, bufsize);
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2014-12-18 07:09:22 +01:00
|
|
|
|
2014-12-22 16:31:10 +01:00
|
|
|
/*
 * Reads the next token and converts it to a decimal integer.
 *
 * Returns 0 when there is no token or it does not start with a number.
 * Values outside the range of int are clamped to INT_MIN / INT_MAX; the
 * previously used atoi() has undefined behavior for such input.
 */
int getint(void)
{
    char token[16];
    const char *s = gettoken(token, sizeof(token));
    long value;

    if (s == NULL) {
        return 0;
    }
    /* strtol saturates at LONG_MIN/LONG_MAX, which the checks below cover */
    value = strtol(s, NULL, 10);
    if (value > INT_MAX) {
        return INT_MAX;
    }
    if (value < INT_MIN) {
        return INT_MIN;
    }
    return (int)value;
}
|
|
|
|
|
2018-01-01 08:23:52 +01:00
|
|
|
/* Reads the next token as an integer via getint(); negative results are
 * reported as 0. */
int getuint(void)
{
    int value = getint();

    if (value < 0) {
        return 0;
    }
    return value;
}
|
|
|
|
|
2014-12-18 07:09:22 +01:00
|
|
|
/*
 * Reads the next token and converts it with atoi36().
 *
 * Returns 0 when there is no token, -1 when the conversion yields a
 * negative number, and the converted value otherwise.
 */
int getid(void)
{
    char token[8];
    const char *text = gettoken(token, sizeof token);
    int id;

    if (text == NULL) {
        return 0;
    }
    id = atoi36(text);
    return (id < 0) ? -1 : id;
}
|
2014-12-23 09:23:37 +01:00
|
|
|
|
|
|
|
/*
 * Converts the decimal number at the start of s to a non-negative value.
 *
 * s must not be NULL. Returns 0 unless the very first character is a
 * digit, so signs and leading whitespace are rejected. Numbers larger
 * than INT_MAX are clamped to INT_MAX; the previously used atoi() has
 * undefined behavior for such input.
 */
unsigned int atoip(const char *s)
{
    long n;

    assert(s);
    if (s[0] < '0' || s[0] > '9') {
        return 0;
    }
    /* the string starts with a digit, so the result cannot be negative;
     * on overflow strtol returns LONG_MAX, which is clamped below */
    n = strtol(s, NULL, 10);
    if (n > INT_MAX) {
        n = INT_MAX;
    }
    return (unsigned int)n;
}
|