#include #include "filereader.h" #include #include #include #include #include #define COMMENT_CHAR ';' #define CONTINUE_CHAR '\\' #define MAXLINE 4096*16 static char lbuf[MAXLINE]; static char fbuf[MAXLINE]; static void unicode_warning(const char *bp) { log_warning("invalid sequence in UTF-8 string: %s\n", bp); } static int eatwhite(const char *ptr, size_t * total_size) { int ret = 0; *total_size = 0; while (*ptr) { ucs4_t ucs; size_t size = 0; ret = unicode_utf8_to_ucs4(&ucs, ptr, &size); if (ret != 0) break; if (!iswspace((wint_t)ucs)) break; *total_size += size; ptr += size; } return ret; } static const char *getbuf_utf8(FILE * F) { bool cont = false; char quote = 0; bool comment = false; char *cp = fbuf; char *tail = lbuf + MAXLINE - 2; tail[1] = '@'; /* if this gets overwritten by fgets then the line was very long. */ do { const char *bp = fgets(lbuf, MAXLINE, F); size_t white; if (bp == NULL) { return NULL; } eatwhite(bp, &white); /* decoding errors will get caught later on, don't have to check */ bp += white; comment = (comment && cont); quote = (quote && cont); if (tail[1] == 0) { /* we read the maximum number of bytes! */ if (tail[0] != '\n') { /* it wasn't enough space to finish the line, eat the rest */ for (;;) { tail[1] = '@'; bp = fgets(lbuf, MAXLINE, F); if (bp == NULL) return NULL; if (tail[1]) { /* read enough this time to end the line */ break; } } comment = false; cont = false; bp = NULL; continue; } else { tail[1] = '@'; } } cont = false; while (*bp && cp < fbuf + MAXLINE) { ucs4_t ucs; size_t size; int ret; if (!quote) { while (*bp == COMMENT_CHAR) { /* comment begins. we need to keep going, to look for CONTINUE_CHAR */ comment = true; ++bp; } } if (*bp == '\n' || *bp == '\r') { /* line breaks, shmine breaks */ break; } if (*bp == '"' || *bp == '\'') { if (quote == *bp) { quote = 0; if (!comment && cp < fbuf + MAXLINE) *cp++ = *bp; ++bp; continue; } else if (!quote) { quote = *bp++; if (!comment && cp < fbuf + MAXLINE) *cp++ = quote; continue; } } ret = unicode_utf8_to_ucs4(&ucs, bp, &size); if (ret != 0) { unicode_warning(bp); break; } if (iswspace((wint_t)ucs)) { if (!quote) { bp += size; ret = eatwhite(bp, &size); bp += size; if (!comment && *bp && *bp != COMMENT_CHAR && cp < fbuf + MAXLINE) *(cp++) = ' '; if (ret != 0) { unicode_warning(bp); break; } } else if (!comment) { if (cp + size <= fbuf + MAXLINE) { while (size--) { *(cp++) = *(bp++); } } else bp += size; } else { bp += size; } } else if (iswcntrl((wint_t)ucs)) { if (!comment && cp < fbuf + MAXLINE) { *cp++ = '?'; } bp += size; } else { if (*bp == CONTINUE_CHAR) { const char *handle_end; eatwhite(bp + 1, &white); handle_end = bp + 1 + white; if (*handle_end == '\0') { bp = handle_end; cont = true; continue; } if (!comment && cp < fbuf + MAXLINE) *cp++ = *bp++; else ++bp; } else { if (!comment && cp + size <= fbuf + MAXLINE) { while (size--) { *(cp++) = *(bp++); } } else { bp += size; } } } } if (cp == fbuf + MAXLINE) { --cp; } *cp = 0; } while (cont || cp == fbuf); return fbuf; } const char *getbuf(FILE * F) { return getbuf_utf8(F); }