server/src/common/util/parser.c

#include <config.h>
#include "parser.h"
#include "unicode.h"
#include "log.h"

#include <assert.h>
#include <wctype.h>
#include <memory.h>

/* Character that parse_token() emits as a single space (' ') in the token. */
#define SPACE_REPLACEMENT '~'
/* Backslash: the escape marker recognized by skip_token() and parse_token(). */
#define ESCAPE_CHAR       '\\'
/* Size in bytes of the static result buffer used by parse_token(). */
#define MAXTOKENSIZE      8192

/* One parser context. Contexts are chained through `next`, with the
 * file-level `state` pointer referring to the one currently in use. */
typedef struct parser_state {
  const xmlChar *current_token; /* read position from which the next token is taken */
  xmlChar * current_cmd; /* released with free() when the state is re-initialized or popped; may be NULL */
  struct parser_state * next; /* state that becomes current again after parser_popstate() */
} parser_state;

/* Currently active parser state; NULL until init_tokens_str() or
 * parser_pushstate() allocates one. */
static parser_state * state;

/**
 * Moves *str forward past any leading whitespace characters.
 *
 * Bytes up to 0x7F are tested directly; anything above is first decoded
 * with unicode_utf8_to_ucs4() and the resulting code point is tested.
 *
 * @param str  in/out: read position inside a NUL-terminated string.
 * @return 0 on success; otherwise the non-zero decoder result, in which
 *         case *str is left pointing at the sequence that failed to decode.
 */
static int
eatwhitespace_c(const xmlChar ** str)
{
  const xmlChar * pos = *str;
  int result = 0;

  for (;;) {
    wint_t codepoint;
    size_t nbytes;

    if (*pos <= 0x7F) {
      codepoint = *pos;
      nbytes = 1;
    } else {
      result = unicode_utf8_to_ucs4(&codepoint, pos, &nbytes);
      if (result!=0) {
        log_warning(("illegal character sequence in UTF8 string: %s\n", pos));
        break;
      }
    }
    /* the terminating NUL is not whitespace, so the scan stops there */
    if (!iswspace(codepoint)) break;
    pos += nbytes;
  }

  *str = pos;
  return result;
}

/**
 * Points the current parser state at a new string to tokenize.
 *
 * If no state exists yet, one is allocated. Otherwise the command buffer
 * held by the current state is released before the new one is stored.
 *
 * @param initstr  string that subsequent token functions read from.
 * @param cmd      stored as the state's current_cmd; it is passed to free()
 *                 by the next init_tokens_str() on this state or by
 *                 parser_popstate(), so it must be heap memory or NULL.
 */
void
init_tokens_str(const xmlChar * initstr, xmlChar * cmd)
{
  if (state==NULL) {
    state = malloc(sizeof(parser_state));
    /* malloc() hands back uninitialized memory: terminate the chain here,
     * otherwise parser_popstate() would install a garbage pointer as the
     * current state. */
    state->next = NULL;
  }
  else if (state->current_cmd) free(state->current_cmd);
  state->current_cmd = cmd;
  state->current_token = (const xmlChar *)initstr;
}

/**
 * Pushes a fresh, empty parser state (no command, no token position) on
 * top of the current one. The previous state stays reachable through
 * `next` and is restored by parser_popstate().
 */
void
parser_pushstate(void)
{
  parser_state * top = malloc(sizeof *top);

  top->next = state;
  top->current_token = NULL;
  top->current_cmd = NULL;
  state = top;
}

void
parser_popstate(void)
{
  parser_state * new_state = state->next;
  if (state->current_cmd!=NULL) free(state->current_cmd);
  free(state);
  state = new_state;
}

boolean
parser_end(void)
{
  eatwhitespace_c(&state->current_token);
  return *state->current_token == 0;
}

void
skip_token(void)
{
  char quotechar = 0;
  eatwhitespace_c(&state->current_token);

  while (*state->current_token) {
    wint_t ucs;
    size_t len;

    xmlChar utf8_character = state->current_token[0];
    if (utf8_character <= 0x7F) {
      ucs = utf8_character;
      ++state->current_token;
    } else {
      int ret = unicode_utf8_to_ucs4(&ucs, state->current_token, &len);
      if (ret==0) {
        state->current_token+=len;
      } else {
        log_warning(("illegal character sequence in UTF8 string: %s\n", state->current_token));
      }
    }
    if (iswspace(ucs) && quotechar==0) {
      return;
    } else {
      switch(utf8_character) {
        case '"':
        case '\'':
          if (utf8_character==quotechar) return;
          quotechar = utf8_character;
          break;
        case ESCAPE_CHAR:
          ++state->current_token;
          break;
      }
    }
  }
}

/**
 * Extracts the next token from *str and advances *str past it.
 *
 * Rules, applied per UTF-8 character after leading whitespace is skipped:
 *  - whitespace outside of quotes ends the token (and is not consumed);
 *  - '"' or '\'' opens a quoted section, the same character closes it and
 *    ends the token; the other quote character inside it is copied as-is;
 *  - SPACE_REPLACEMENT ('~') is emitted as a single space;
 *  - ESCAPE_CHAR (backslash) is dropped and the one character following it
 *    is copied verbatim;
 *  - an undecodable UTF-8 sequence ends the token; *str is left pointing
 *    at it.
 *
 * @param str  in/out: read position inside a NUL-terminated UTF-8 string;
 *             *str must not be NULL.
 * @return pointer to a static, NUL-terminated buffer holding the token.
 *         The buffer is overwritten by the next call. Tokens are cut off
 *         so that they fit in MAXTOKENSIZE-1 bytes; the remainder stays
 *         in the input.
 */
const xmlChar *
parse_token(const xmlChar ** str)
{
  static xmlChar lbuf[MAXTOKENSIZE];
  xmlChar * cursor = lbuf;
  char quotechar = 0;
  boolean escape = false;
  const xmlChar * ctoken = *str;

  assert(ctoken);

  eatwhitespace_c(&ctoken);
  while (*ctoken && cursor-lbuf < MAXTOKENSIZE-1) {
    wint_t ucs;
    size_t len;
    boolean copy = false;
    /* bytes still available before the slot reserved for the NUL */
    size_t room = (size_t)(MAXTOKENSIZE-1 - (cursor-lbuf));

    xmlChar utf8_character = *ctoken;
    if (utf8_character <= 0x7F) {
      ucs = utf8_character;
      len = 1;
    } else {
      int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
      if (ret!=0) {
        log_warning(("illegal character sequence in UTF8 string: %s\n", ctoken));
        break;
      }
    }
    /* The loop condition only guarantees one free byte; a multi-byte
     * character must fit completely or the memcpy below would run past
     * the end of lbuf. */
    if (len > room) break;

    if (escape) {
      copy = true;
      /* an escape covers exactly one character */
      escape = false;
    } else if (iswspace(ucs)) {
      if (quotechar==0) break;
      copy = true;
    } else if (utf8_character=='"' || utf8_character=='\'') {
      if (utf8_character==quotechar) {
        ++ctoken;
        break;
      } else if (quotechar==0) {
        quotechar = utf8_character;
        ++ctoken;
      } else {
        *cursor++ = *ctoken++;
      }
    } else if (utf8_character==SPACE_REPLACEMENT) {
      *cursor++ = ' ';
      ++ctoken;
    } else if (utf8_character==ESCAPE_CHAR) {
      escape = true;
      ++ctoken;
    } else {
      copy = true;
    }
    if (copy) {
      memcpy(cursor, ctoken, len);
      cursor+=len;
      ctoken+=len;
    }
  }

  *cursor = '\0';
  *str = ctoken;
  return lbuf;
}

/**
 * Reads the next token from the current parser state.
 *
 * @return the token as produced by parse_token(); the current read
 *         position is advanced past it.
 */
const xmlChar *
getstrtoken(void)
{
  const xmlChar * token = parse_token(&state->current_token);

  return token;
}
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`#include <config.h>`
			`#include "parser.h"`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`#include "unicode.h"`
			`#include "log.h"`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00
			`#include <assert.h>`
Unicode WIP: Mostly Jamfile fixes for linux and some unuse code disabled 2007-06-26 11:51:18 +02:00			`#include <wctype.h>`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`#include <memory.h>`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00
			`#define SPACE_REPLACEMENT '~'`
			`#define ESCAPE_CHAR '\\'`
			`#define MAXTOKENSIZE 8192`

			`typedef struct parser_state {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`const xmlChar *current_token;`
			`xmlChar * current_cmd;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`struct parser_state * next;`
			`} parser_state;`

			`static parser_state * state;`

WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`static int`
			`eatwhitespace_c(const xmlChar ** str)`
			`{`
			`int ret;`
			`wint_t ucs;`
			`size_t len;`

			`/* skip over potential whitespace */`
			`for (;;) {`
			`xmlChar utf8_character = (*str)[0];`
			`if (utf8_character <= 0x7F) {`
Unicode WIP: Mostly Jamfile fixes for linux and some unuse code disabled 2007-06-26 11:51:18 +02:00			`if (!iswspace(utf8_character)) break;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`++*str;`
			`} else {`
			`ret = unicode_utf8_to_ucs4(&ucs, *str, &len);`
			`if (ret!=0) {`
			`log_warning(("illegal character sequence in UTF8 string: %s\n", *str));`
			`return ret;`
			`}`
			`if (!iswspace(ucs)) break;`
			`*str+=len;`
			`}`
			`}`
			`return 0;`
			`}`

Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`void`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`init_tokens_str(const xmlChar * initstr, xmlChar * cmd)`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`{`
			`if (state==NULL) {`
			`state = malloc(sizeof(parser_state));`
			`}`
added an option to have certain races supply a description without a need to set u->display everywhere. This was costing 24 MB of memory just for the braineaters alone, a terrible waste. 2007-06-25 03:50:34 +02:00			`else if (state->current_cmd) free(state->current_cmd);`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`state->current_cmd = cmd;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`state->current_token = (const xmlChar *)initstr;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`}`

			`void`
			`parser_pushstate(void)`
			`{`
			`parser_state * new_state = malloc(sizeof(parser_state));`
			`new_state->current_cmd = NULL;`
			`new_state->current_token = NULL;`
			`new_state->next = state;`
			`state = new_state;`
			`}`

			`void`
			`parser_popstate(void)`
			`{`
			`parser_state * new_state = state->next;`
			`if (state->current_cmd!=NULL) free(state->current_cmd);`
			`free(state);`
			`state = new_state;`
			`}`

			`boolean`
			`parser_end(void)`
			`{`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`eatwhitespace_c(&state->current_token);`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`return *state->current_token == 0;`
			`}`

			`void`
			`skip_token(void)`
			`{`
			`char quotechar = 0;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`eatwhitespace_c(&state->current_token);`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00
			`while (*state->current_token) {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`wint_t ucs;`
			`size_t len;`

			`xmlChar utf8_character = state->current_token[0];`
			`if (utf8_character <= 0x7F) {`
			`ucs = utf8_character;`
			`++state->current_token;`
			`} else {`
			`int ret = unicode_utf8_to_ucs4(&ucs, state->current_token, &len);`
			`if (ret==0) {`
			`state->current_token+=len;`
			`} else {`
			`log_warning(("illegal character sequence in UTF8 string: %s\n", state->current_token));`
			`}`
			`}`
			`if (iswspace(ucs) && quotechar==0) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`return;`
			`} else {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`switch(utf8_character) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`case '"':`
			`case '\'':`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`if (utf8_character==quotechar) return;`
			`quotechar = utf8_character;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`break;`
			`case ESCAPE_CHAR:`
			`++state->current_token;`
			`break;`
			`}`
			`}`
			`}`
			`}`

WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`const xmlChar *`
			`parse_token(const xmlChar ** str)`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`{`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`static xmlChar lbuf[MAXTOKENSIZE];`
			`xmlChar * cursor = lbuf;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`char quotechar = 0;`
			`boolean escape = false;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`const xmlChar * ctoken = *str;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00
			`assert(ctoken);`

WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`eatwhitespace_c(&ctoken);`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`while (*ctoken && cursor-lbuf < MAXTOKENSIZE-1) {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`wint_t ucs;`
			`size_t len;`
			`boolean copy = false;`

			`xmlChar utf8_character = *ctoken;`
			`if (utf8_character <= 0x7F) {`
			`ucs = utf8_character;`
			`len = 1;`
			`} else {`
			`int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);`
			`if (ret!=0) {`
			`log_warning(("illegal character sequence in UTF8 string: %s\n", ctoken));`
			`break;`
			`}`
			`}`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`if (escape) {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`copy = true;`
			`} else if (iswspace(ucs)) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`if (quotechar==0) break;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`copy = true;`
			`} else if (utf8_character=='"' || utf8_character=='\'') {`
			`if (utf8_character==quotechar) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`++ctoken;`
			`break;`
			`} else if (quotechar==0) {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`quotechar = utf8_character;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`++ctoken;`
			`} else {`
			`*cursor++ = *ctoken++;`
			`}`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`} else if (utf8_character==SPACE_REPLACEMENT) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`*cursor++ = ' ';`
			`++ctoken;`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`} else if (utf8_character==ESCAPE_CHAR) {`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`escape = true;`
			`++ctoken;`
			`} else {`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`copy = true;`
			`}`
			`if (copy) {`
			`memcpy(cursor, ctoken, len);`
			`cursor+=len;`
			`ctoken+=len;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`}`
			`}`

			`*cursor = '\0';`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`*str = ctoken;`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`return lbuf;`
			`}`

WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`const xmlChar *`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`getstrtoken(void)`
			`{`
WIP, does not compile: Unicode (UTF8) conversion of absolutely everything. Mi'kmaq hieroglyphic writing FTW! 2007-06-26 11:32:28 +02:00			`return parse_token((const xmlChar**)&state->current_token);`
Cleanup: Removing parser-code from kernel/eressea.[hc] and putting it into separate parser files in util/ 2007-06-22 00:31:28 +02:00			`}`