server/src/util/parser.c

#include <platform.h>
#include "parser.h"
#include "unicode.h"
#include "log.h"

#include <assert.h>
#include <stdlib.h>
#include <wctype.h>
#include <memory.h>

/* An unescaped occurrence of this character in a token is emitted as a
 * plain space by parse_token(), so a token can contain spaces without
 * being quoted. */
#define SPACE_REPLACEMENT '~'
/* Character that makes the tokenizer take the following character
 * literally instead of interpreting it. */
#define ESCAPE_CHAR       '\\'
/* Size in bytes of the static result buffer in parse_token(), including
 * the terminating NUL. */
#define MAXTOKENSIZE      8192

/* One entry of the parser state stack; entries are chained through `next`. */
typedef struct parser_state {
  /* Read position inside the text being tokenized; advanced by
   * parser_end(), skip_token() and getstrtoken(). */
  const char *current_token;
  /* Buffer handed to init_tokens_str(); it is released with free() when
   * the state is re-initialised or popped. May be NULL. */
  char *current_cmd;
  /* State that was active before parser_pushstate() created this one. */
  struct parser_state *next;
} parser_state;

/* Currently active parser state (top of the stack). It starts out NULL and
 * is allocated by init_tokens_str() or parser_pushstate(). */
static parser_state *state;

/*
 * Moves *str_p forward over any leading whitespace, as judged by
 * iswxspace(). Bytes with the high bit set are decoded as UTF-8 first.
 *
 * Returns 0 normally. If a multi-byte sequence cannot be decoded, a warning
 * is logged, *str_p is left pointing at the offending byte and the decoder's
 * non-zero error code is returned.
 */
static int eatwhitespace_c(const char **str_p)
{
  const char *pos = *str_p;
  int status = 0;

  while (status == 0) {
    unsigned char lead = (unsigned char)*pos;

    if ((lead & 0x80) == 0) {
      /* 7-bit character: no decoding needed */
      if (!iswxspace(lead)) {
        break;
      }
      pos += 1;
    } else {
      ucs4_t decoded;
      size_t width;

      status = unicode_utf8_to_ucs4(&decoded, pos, &width);
      if (status != 0) {
        /* the loop condition ends the scan; pos stays on the bad byte */
        log_warning("illegal character sequence in UTF8 string: %s\n", pos);
      } else if (iswxspace((wint_t) decoded)) {
        pos += width;
      } else {
        break;
      }
    }
  }

  *str_p = pos;
  return status;
}

/*
 * Points the active parser state at a new string to tokenize.
 *
 * initstr: text that subsequent parser_end()/skip_token()/getstrtoken()
 *          calls will read from. It is not copied.
 * cmd:     buffer whose ownership passes to the parser; it is released with
 *          free() on the next init_tokens_str() or on parser_popstate().
 *          May be NULL.
 *
 * If no state exists yet, one is allocated. The process is aborted when that
 * allocation fails, because every other parser function dereferences the
 * state unconditionally.
 */
void init_tokens_str(const char *initstr, char *cmd)
{
  if (state == NULL) {
    state = malloc(sizeof *state);
    if (state == NULL) {
      abort();
    }
    /* bottom of the stack: parser_popstate() reads this link */
    state->next = NULL;
  } else {
    /* drop the buffer of the previous command; free(NULL) is a no-op */
    free(state->current_cmd);
  }
  state->current_cmd = cmd;
  state->current_token = initstr;
}

/*
 * Pushes a fresh, empty parser state on top of the current one.
 *
 * The new state has no command buffer and a NULL read position, so
 * init_tokens_str() has to be called before any token is read from it.
 * The previous state is restored by parser_popstate().
 *
 * The process is aborted when the allocation fails.
 */
void parser_pushstate(void)
{
  parser_state *new_state = malloc(sizeof *new_state);
  if (new_state == NULL) {
    abort();
  }
  new_state->current_cmd = NULL;
  new_state->current_token = NULL;
  new_state->next = state;
  state = new_state;
}

void parser_popstate(void)
{
  parser_state *new_state = state->next;
  if (state->current_cmd != NULL)
    free(state->current_cmd);
  free(state);
  state = new_state;
}

/*
 * Skips whitespace at the current read position and reports whether the end
 * of the input string has been reached.
 * Note that this advances the read position as a side effect.
 */
bool parser_end(void)
{
  eatwhitespace_c(&state->current_token);
  if (state->current_token[0] == '\0') {
    return true;
  }
  return false;
}

/*
 * Advances the active parser state past the next token without copying it.
 *
 * Leading whitespace is skipped first. The token then ends at (and
 * including) the first unquoted whitespace character, at (and including)
 * the closing quote of a quoted section, or at the end of the string.
 * A character preceded by ESCAPE_CHAR never terminates the token and never
 * opens or closes a quote.
 *
 * Quote handling matches parse_token(): once a quote is open, only the same
 * quote character closes it; the other quote character is ordinary text.
 *
 * An undecodable UTF-8 byte is logged and stepped over one byte at a time,
 * so the scan always makes progress.
 */
void skip_token(void)
{
  char quotechar = 0;
  bool escape = false;

  eatwhitespace_c(&state->current_token);

  while (*state->current_token) {
    ucs4_t ucs;
    size_t len;
    unsigned char utf8_character = (unsigned char)state->current_token[0];

    if (~utf8_character & 0x80) {
      ucs = utf8_character;
      len = 1;
    } else if (unicode_utf8_to_ucs4(&ucs, state->current_token, &len) != 0) {
      log_warning("illegal character sequence in UTF8 string: %s\n", state->current_token);
      /* neither ucs nor len is usable here: drop the bad byte and go on */
      ++state->current_token;
      escape = false;
      continue;
    }
    state->current_token += len;

    if (escape) {
      /* the whole (possibly multi-byte) character was escaped */
      escape = false;
      continue;
    }
    if (iswxspace((wint_t) ucs)) {
      if (quotechar == 0) {
        return;
      }
      continue;
    }
    switch (utf8_character) {
      case '"':
      case '\'':
        if (utf8_character == quotechar) {
          return;
        }
        if (quotechar == 0) {
          quotechar = (char)utf8_character;
        }
        break;
      case ESCAPE_CHAR:
        /* remembered instead of blindly skipping a byte, so that a trailing
         * escape character cannot move the cursor past the terminator */
        escape = true;
        break;
      default:
        break;
    }
  }
}

/*
 * Extracts the next token from *str and advances *str past it.
 *
 * str: address of the read position; must not point to NULL. On return it
 *      points just behind the consumed input.
 *
 * Returns a pointer to a static, NUL-terminated buffer holding the token
 * (empty string if there is none). The buffer is overwritten by the next
 * call, so the caller has to copy the result if it is needed longer.
 *
 * Rules applied while scanning, after leading whitespace has been skipped:
 *  - a character following ESCAPE_CHAR is copied literally;
 *  - unquoted whitespace ends the token (the whitespace is not consumed);
 *  - ' or " opens a quoted section; the same character closes it and ends
 *    the token, the other quote character is copied as text;
 *  - SPACE_REPLACEMENT is emitted as a single space;
 *  - everything else is copied unchanged.
 *
 * Scanning also stops at an undecodable UTF-8 sequence (a warning is logged
 * and *str is left on the offending byte) and when the next character no
 * longer fits into the buffer; the token is truncated in that case.
 */
const char *parse_token(const char **str)
{
  static char lbuf[MAXTOKENSIZE];       /* STATIC_RESULT: used for return, not across calls */
  char *const limit = lbuf + MAXTOKENSIZE - 1;  /* last byte is reserved for the NUL */
  char *cursor = lbuf;
  char quotechar = 0;
  bool escape = false;
  const char *ctoken = *str;

  assert(ctoken);

  eatwhitespace_c(&ctoken);
  while (*ctoken && cursor < limit) {
    ucs4_t ucs;
    size_t len;
    bool copy = false;

    unsigned char utf8_character = (unsigned char)*ctoken;
    if (~utf8_character & 0x80) {
      ucs = utf8_character;
      len = 1;
    } else {
      int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
      if (ret != 0) {
        log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
        break;
      }
    }
    if (escape) {
      copy = true;
      escape = false;
    } else if (iswxspace((wint_t) ucs)) {
      if (quotechar == 0)
        break;
      copy = true;
    } else if (utf8_character == '"' || utf8_character == '\'') {
      if (utf8_character == quotechar) {
        ++ctoken;
        break;
      } else if (quotechar == 0) {
        quotechar = (char)utf8_character;
        ++ctoken;
      } else {
        /* the other kind of quote inside a quoted section is plain text */
        *cursor++ = *ctoken++;
      }
    } else if (utf8_character == SPACE_REPLACEMENT) {
      *cursor++ = ' ';
      ++ctoken;
    } else if (utf8_character == ESCAPE_CHAR) {
      escape = true;
      ++ctoken;
    } else {
      copy = true;
    }
    if (copy) {
      /* The loop condition only guarantees one free byte, but a multi-byte
       * character needs len of them: stop rather than overrun lbuf. */
      if ((size_t)(limit - cursor) < len)
        break;
      memcpy(cursor, ctoken, len);
      cursor += len;
      ctoken += len;
    }
  }

  *cursor = '\0';
  *str = ctoken;
  return lbuf;
}

/*
 * Reads the next token from the active parser state and advances its read
 * position. The result lives in parse_token()'s static buffer and is only
 * valid until the next token is parsed.
 */
const char *getstrtoken(void)
{
  const char **position = &state->current_token;

  return parse_token(position);
}
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`#include <platform.h>`
			`#include "parser.h"`
			`#include "unicode.h"`
			`#include "log.h"`

			`#include <assert.h>`
use cmake for autoconf, slim down platform.h 2014-03-15 19:29:11 +01:00			`#include <stdlib.h>`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`#include <wctype.h>`
			`#include <memory.h>`

			`#define SPACE_REPLACEMENT '~'`
			`#define ESCAPE_CHAR '\\'`
			`#define MAXTOKENSIZE 8192`

			`typedef struct parser_state {`
			`const char *current_token;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`char *current_cmd;`
			`struct parser_state *next;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`} parser_state;`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`static parser_state *state;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`static int eatwhitespace_c(const char **str_p)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
			`int ret = 0;`
			`ucs4_t ucs;`
			`size_t len;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`const char str = str_p;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00
			`/* skip over potential whitespace */`
			`for (;;) {`
			`unsigned char utf8_character = (unsigned char)*str;`
			`if (~utf8_character & 0x80) {`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (!iswxspace(utf8_character))`
			`break;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`++str;`
			`} else {`
			`ret = unicode_utf8_to_ucs4(&ucs, str, &len);`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (ret != 0) {`
rename _log_warn to log_warning and get rid of the cumbersome #define 2012-05-17 00:52:37 +02:00			`log_warning("illegal character sequence in UTF8 string: %s\n", str);`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`break;`
			`}`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (!iswxspace((wint_t) ucs))`
			`break;`
			`str += len;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`}`
			`}`
			`*str_p = str;`
			`return ret;`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`void init_tokens_str(const char initstr, char cmd)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (state == NULL) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`state = malloc(sizeof(parser_state));`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (state->current_cmd)`
			`free(state->current_cmd);`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`state->current_cmd = cmd;`
			`state->current_token = initstr;`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`void parser_pushstate(void)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`parser_state *new_state = malloc(sizeof(parser_state));`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`new_state->current_cmd = NULL;`
			`new_state->current_token = NULL;`
			`new_state->next = state;`
			`state = new_state;`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`void parser_popstate(void)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`parser_state *new_state = state->next;`
			`if (state->current_cmd != NULL)`
			`free(state->current_cmd);`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`free(state);`
			`state = new_state;`
			`}`

remove custom boolean type use bool when we have C99, or our own typedef for bool 2012-06-24 07:41:07 +02:00			`bool parser_end(void)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
			`eatwhitespace_c(&state->current_token);`
			`return *state->current_token == 0;`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`void skip_token(void)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
			`char quotechar = 0;`
			`eatwhitespace_c(&state->current_token);`

			`while (*state->current_token) {`
			`ucs4_t ucs;`
			`size_t len;`

			`unsigned char utf8_character = (unsigned char)state->current_token[0];`
			`if (~utf8_character & 0x80) {`
			`ucs = utf8_character;`
			`++state->current_token;`
			`} else {`
			`int ret = unicode_utf8_to_ucs4(&ucs, state->current_token, &len);`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (ret == 0) {`
			`state->current_token += len;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`} else {`
rename _log_warn to log_warning and get rid of the cumbersome #define 2012-05-17 00:52:37 +02:00			`log_warning("illegal character sequence in UTF8 string: %s\n", state->current_token);`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`}`
			`}`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (iswxspace((wint_t) ucs) && quotechar == 0) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`return;`
			`} else {`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`switch (utf8_character) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`case '"':`
			`case '\'':`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (utf8_character == quotechar)`
			`return;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`quotechar = utf8_character;`
			`break;`
			`case ESCAPE_CHAR:`
			`++state->current_token;`
			`break;`
			`}`
			`}`
			`}`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`const char parse_token(const char *str)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`static char lbuf[MAXTOKENSIZE]; /* STATIC_RESULT: used for return, not across calls */`
			`char *cursor = lbuf;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`char quotechar = 0;`
remove custom boolean type use bool when we have C99, or our own typedef for bool 2012-06-24 07:41:07 +02:00			`bool escape = false;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`const char ctoken = str;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00
			`assert(ctoken);`

			`eatwhitespace_c(&ctoken);`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`while (*ctoken && cursor - lbuf < MAXTOKENSIZE - 1) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`ucs4_t ucs;`
			`size_t len;`
remove custom boolean type use bool when we have C99, or our own typedef for bool 2012-06-24 07:41:07 +02:00			`bool copy = false;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00
			`unsigned char utf8_character = (unsigned char )ctoken;`
			`if (~utf8_character & 0x80) {`
			`ucs = utf8_character;`
			`len = 1;`
			`} else {`
			`int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`if (ret != 0) {`
rename _log_warn to log_warning and get rid of the cumbersome #define 2012-05-17 00:52:37 +02:00			`log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`break;`
			`}`
			`}`
			`if (escape) {`
			`copy = true;`
			`escape = false;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (iswxspace((wint_t) ucs)) {`
			`if (quotechar == 0)`
			`break;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`copy = true;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (utf8_character == '"' \|\| utf8_character == '\'') {`
			`if (utf8_character == quotechar) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`++ctoken;`
			`break;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (quotechar == 0) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`quotechar = utf8_character;`
			`++ctoken;`
			`} else {`
			`cursor++ = ctoken++;`
			`}`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (utf8_character == SPACE_REPLACEMENT) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`*cursor++ = ' ';`
			`++ctoken;`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`} else if (utf8_character == ESCAPE_CHAR) {`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`escape = true;`
			`++ctoken;`
			`} else {`
			`copy = true;`
			`}`
			`if (copy) {`
			`memcpy(cursor, ctoken, len);`
indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`cursor += len;`
			`ctoken += len;`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`}`
			`}`

			`*cursor = '\0';`
			`*str = ctoken;`
			`return lbuf;`
			`}`

indentation rules, strictly applied 2011-03-07 08:02:35 +01:00			`const char *getstrtoken(void)`
undo more CRLF screwups. 2010-08-08 10:06:34 +02:00			`{`
			`return parse_token((const char **)&state->current_token);`
			`}`