2010-08-08 10:06:34 +02:00
|
|
|
#include <platform.h>
|
|
|
|
#include "filereader.h"
|
|
|
|
|
|
|
|
#include <util/log.h>
|
|
|
|
#include <util/unicode.h>
|
|
|
|
|
2017-01-10 18:05:48 +01:00
|
|
|
#include <stdbool.h>
|
2010-08-08 10:06:34 +02:00
|
|
|
#include <ctype.h>
|
|
|
|
#include <wctype.h>
|
|
|
|
|
|
|
|
/* Outside of quotes, this character starts a comment that runs to the end of the line. */
#define COMMENT_CHAR ';'
/* A line whose last non-whitespace character is this one is continued on the next line. */
#define CONTINUE_CHAR '\\'
/*
 * Size in bytes of each of the two line buffers below.
 * The expansion is parenthesized so that the macro evaluates to 65536 in
 * every expression context (e.g. `x % MAXLINE` or `x / MAXLINE`).
 */
#define MAXLINE (4096*16)

/* Raw input buffer: each physical line is read into it with fgets(). */
static char lbuf[MAXLINE];
/* Output buffer: holds the cleaned-up logical line that the getbuf functions return. */
static char fbuf[MAXLINE];
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
/*
 * Emit a warning-level log message saying that a string could not be
 * decoded as UTF-8.
 *
 * bp: NUL-terminated text supplied by the caller; it is appended to the
 *     message with a %s conversion.
 */
static void unicode_warning(const char *bp)
{
    log_warning("invalid sequence in UTF-8 string: %s\n", bp);
}
|
|
|
|
|
2016-11-26 16:21:41 +01:00
|
|
|
static int eatwhite(const char *ptr, size_t * total_size)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
int ret = 0;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
*total_size = 0;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
while (*ptr) {
|
|
|
|
ucs4_t ucs;
|
|
|
|
size_t size = 0;
|
|
|
|
ret = unicode_utf8_to_ucs4(&ucs, ptr, &size);
|
|
|
|
if (ret != 0)
|
|
|
|
break;
|
2017-01-10 18:05:48 +01:00
|
|
|
if (!iswspace((wint_t)ucs))
|
2015-01-30 20:37:14 +01:00
|
|
|
break;
|
|
|
|
*total_size += size;
|
|
|
|
ptr += size;
|
|
|
|
}
|
|
|
|
return ret;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
static const char *getbuf_latin1(FILE * F)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
bool cont = false;
|
|
|
|
char quote = 0;
|
|
|
|
bool comment = false;
|
|
|
|
char *cp = fbuf;
|
|
|
|
char *tail = lbuf + MAXLINE - 2;
|
2011-03-07 08:02:35 +01:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
tail[1] = '@'; /* if this gets overwritten by fgets then the line was very long. */
|
|
|
|
do {
|
|
|
|
const char *bp = fgets(lbuf, MAXLINE, F);
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (bp == NULL)
|
|
|
|
return NULL;
|
2017-01-10 18:05:48 +01:00
|
|
|
while (*bp && isspace(*(unsigned char *)bp))
|
2015-01-30 20:37:14 +01:00
|
|
|
++bp; /* eatwhite */
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
comment = (bool)(comment && cont);
|
|
|
|
quote = (bool)(quote && cont);
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (tail[1] == 0) {
|
|
|
|
/* we read he maximum number of bytes! */
|
|
|
|
if (tail[0] != '\n') {
|
|
|
|
/* it wasn't enough space to finish the line, eat the rest */
|
|
|
|
for (;;) {
|
|
|
|
tail[1] = '@';
|
|
|
|
bp = fgets(lbuf, MAXLINE, F);
|
|
|
|
if (bp == NULL)
|
|
|
|
return NULL;
|
|
|
|
if (tail[1]) {
|
|
|
|
/* read enough this time to end the line */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
comment = false;
|
|
|
|
cont = false;
|
|
|
|
bp = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
tail[1] = '@';
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
cont = false;
|
2015-01-30 20:37:14 +01:00
|
|
|
while (*bp && cp < fbuf + MAXLINE) {
|
|
|
|
int c = *(unsigned char *)bp;
|
2011-03-07 08:02:35 +01:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (c == '\n' || c == '\r') {
|
|
|
|
/* line breaks, shmine breaks */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (c == COMMENT_CHAR && !quote) {
|
|
|
|
/* comment begins. we need to keep going, to look for CONTINUE_CHAR */
|
|
|
|
comment = true;
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!comment && (c == '"' || c == '\'')) {
|
|
|
|
if (quote == c) {
|
|
|
|
quote = 0;
|
|
|
|
if (cp < fbuf + MAXLINE)
|
|
|
|
*cp++ = *bp;
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (!quote) {
|
|
|
|
quote = *bp++;
|
|
|
|
if (cp < fbuf + MAXLINE)
|
|
|
|
*cp++ = quote;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (iscntrl(c)) {
|
|
|
|
if (!comment && cp < fbuf + MAXLINE) {
|
2017-01-10 18:05:48 +01:00
|
|
|
*cp++ = isspace(c) ? ' ' : '?';
|
2015-01-30 20:37:14 +01:00
|
|
|
}
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
2017-01-10 18:05:48 +01:00
|
|
|
else if (isspace(c)) {
|
2015-01-30 20:37:14 +01:00
|
|
|
if (!quote) {
|
|
|
|
++bp;
|
2017-01-10 18:05:48 +01:00
|
|
|
while (*bp && isspace(*(unsigned char *)bp))
|
2015-01-30 20:37:14 +01:00
|
|
|
++bp; /* eatwhite */
|
|
|
|
if (!comment && *bp && *bp != COMMENT_CHAR && cp < fbuf + MAXLINE)
|
|
|
|
*(cp++) = ' ';
|
|
|
|
}
|
|
|
|
else if (!comment && cp + 1 <= fbuf + MAXLINE) {
|
|
|
|
*(cp++) = *(bp++);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
++bp;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (c == CONTINUE_CHAR) {
|
|
|
|
const char *end = ++bp;
|
2017-01-10 18:05:48 +01:00
|
|
|
while (*end && isspace(*(unsigned char *)end))
|
2015-01-30 20:37:14 +01:00
|
|
|
++end; /* eatwhite */
|
|
|
|
if (*end == '\0') {
|
|
|
|
bp = end;
|
|
|
|
cont = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (comment) {
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (comment) {
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (c < 0x80) {
|
|
|
|
if (cp + 1 <= fbuf + MAXLINE) {
|
|
|
|
*(cp++) = *(bp++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
char inbuf = (char)c;
|
|
|
|
size_t inbytes = 1;
|
|
|
|
size_t outbytes = MAXLINE - (cp - fbuf);
|
|
|
|
int ret = unicode_latin1_to_utf8(cp, &outbytes, &inbuf, &inbytes);
|
|
|
|
if (ret > 0)
|
|
|
|
cp += ret;
|
|
|
|
else {
|
|
|
|
log_error("input data was not iso-8859-1! assuming utf-8\n");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2015-01-30 20:37:14 +01:00
|
|
|
if (cp == fbuf + MAXLINE) {
|
|
|
|
--cp;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2015-01-30 20:37:14 +01:00
|
|
|
*cp = 0;
|
|
|
|
} while (cont || cp == fbuf);
|
|
|
|
return fbuf;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
static const char *getbuf_utf8(FILE * F)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
bool cont = false;
|
|
|
|
char quote = 0;
|
|
|
|
bool comment = false;
|
|
|
|
char *cp = fbuf;
|
|
|
|
char *tail = lbuf + MAXLINE - 2;
|
2011-03-07 08:02:35 +01:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
tail[1] = '@'; /* if this gets overwritten by fgets then the line was very long. */
|
|
|
|
do {
|
|
|
|
const char *bp = fgets(lbuf, MAXLINE, F);
|
|
|
|
size_t white;
|
|
|
|
if (bp == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
eatwhite(bp, &white); /* decoding errors will get caught later on, don't have to check */
|
|
|
|
bp += white;
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
comment = (bool)(comment && cont);
|
|
|
|
quote = (bool)(quote && cont);
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (tail[1] == 0) {
|
|
|
|
/* we read the maximum number of bytes! */
|
|
|
|
if (tail[0] != '\n') {
|
|
|
|
/* it wasn't enough space to finish the line, eat the rest */
|
|
|
|
for (;;) {
|
|
|
|
tail[1] = '@';
|
|
|
|
bp = fgets(lbuf, MAXLINE, F);
|
|
|
|
if (bp == NULL)
|
|
|
|
return NULL;
|
|
|
|
if (tail[1]) {
|
|
|
|
/* read enough this time to end the line */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
comment = false;
|
|
|
|
cont = false;
|
|
|
|
bp = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
tail[1] = '@';
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
cont = false;
|
2015-01-30 20:37:14 +01:00
|
|
|
while (*bp && cp < fbuf + MAXLINE) {
|
|
|
|
ucs4_t ucs;
|
|
|
|
size_t size;
|
|
|
|
int ret;
|
2011-03-07 08:02:35 +01:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (!quote) {
|
|
|
|
while (*bp == COMMENT_CHAR) {
|
|
|
|
/* comment begins. we need to keep going, to look for CONTINUE_CHAR */
|
|
|
|
comment = true;
|
|
|
|
++bp;
|
|
|
|
}
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (*bp == '\n' || *bp == '\r') {
|
|
|
|
/* line breaks, shmine breaks */
|
|
|
|
break;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (*bp == '"' || *bp == '\'') {
|
|
|
|
if (quote == *bp) {
|
|
|
|
quote = 0;
|
|
|
|
if (!comment && cp < fbuf + MAXLINE)
|
|
|
|
*cp++ = *bp;
|
|
|
|
++bp;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (!quote) {
|
|
|
|
quote = *bp++;
|
|
|
|
if (!comment && cp < fbuf + MAXLINE)
|
|
|
|
*cp++ = quote;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
ret = unicode_utf8_to_ucs4(&ucs, bp, &size);
|
2011-03-07 08:02:35 +01:00
|
|
|
|
2015-01-30 20:37:14 +01:00
|
|
|
if (ret != 0) {
|
|
|
|
unicode_warning(bp);
|
|
|
|
break;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
|
2017-01-10 18:05:48 +01:00
|
|
|
if (iswspace((wint_t)ucs)) {
|
2015-01-30 20:37:14 +01:00
|
|
|
if (!quote) {
|
|
|
|
bp += size;
|
|
|
|
ret = eatwhite(bp, &size);
|
|
|
|
bp += size;
|
|
|
|
if (!comment && *bp && *bp != COMMENT_CHAR && cp < fbuf + MAXLINE)
|
|
|
|
*(cp++) = ' ';
|
|
|
|
if (ret != 0) {
|
|
|
|
unicode_warning(bp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (!comment) {
|
|
|
|
if (cp + size <= fbuf + MAXLINE) {
|
|
|
|
while (size--) {
|
|
|
|
*(cp++) = *(bp++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
bp += size;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
bp += size;
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
2015-01-30 20:37:14 +01:00
|
|
|
else if (iswcntrl((wint_t)ucs)) {
|
|
|
|
if (!comment && cp < fbuf + MAXLINE) {
|
|
|
|
*cp++ = '?';
|
|
|
|
}
|
|
|
|
bp += size;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (*bp == CONTINUE_CHAR) {
|
|
|
|
const char *end;
|
|
|
|
eatwhite(bp + 1, &white);
|
|
|
|
end = bp + 1 + white;
|
|
|
|
if (*end == '\0') {
|
|
|
|
bp = end;
|
|
|
|
cont = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!comment && cp < fbuf + MAXLINE)
|
|
|
|
*cp++ = *bp++;
|
|
|
|
else
|
|
|
|
++bp;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (!comment && cp + size <= fbuf + MAXLINE) {
|
|
|
|
while (size--) {
|
|
|
|
*(cp++) = *(bp++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
bp += size;
|
|
|
|
}
|
|
|
|
}
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
}
|
2015-01-30 20:37:14 +01:00
|
|
|
if (cp == fbuf + MAXLINE) {
|
|
|
|
--cp;
|
|
|
|
}
|
|
|
|
*cp = 0;
|
|
|
|
} while (cont || cp == fbuf);
|
|
|
|
return fbuf;
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|
|
|
|
|
2011-03-07 08:02:35 +01:00
|
|
|
const char *getbuf(FILE * F, int encoding)
|
2010-08-08 10:06:34 +02:00
|
|
|
{
|
2015-01-30 20:37:14 +01:00
|
|
|
if (encoding == ENCODING_UTF8)
|
|
|
|
return getbuf_utf8(F);
|
|
|
|
return getbuf_latin1(F);
|
2010-08-08 10:06:34 +02:00
|
|
|
}
|