SAX based XML parser

Dependents:   giken9_HTMLServer_Temp_Sample

Committer:
andrewbonney
Date:
Fri Apr 08 09:18:41 2011 +0000
Revision:
0:07919e3d6c56

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
andrewbonney 0:07919e3d6c56 1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
andrewbonney 0:07919e3d6c56 2 See the file COPYING for copying permission.
andrewbonney 0:07919e3d6c56 3 */
andrewbonney 0:07919e3d6c56 4
andrewbonney 0:07919e3d6c56 5 #include <stddef.h>
andrewbonney 0:07919e3d6c56 6
andrewbonney 0:07919e3d6c56 7 #ifdef COMPILED_FROM_DSP
andrewbonney 0:07919e3d6c56 8 #include "winconfig.h"
andrewbonney 0:07919e3d6c56 9 #elif defined(MACOS_CLASSIC)
andrewbonney 0:07919e3d6c56 10 #include "macconfig.h"
andrewbonney 0:07919e3d6c56 11 #elif defined(__amigaos4__)
andrewbonney 0:07919e3d6c56 12 #include "amigaconfig.h"
andrewbonney 0:07919e3d6c56 13 #elif defined(__WATCOMC__)
andrewbonney 0:07919e3d6c56 14 #include "watcomconfig.h"
andrewbonney 0:07919e3d6c56 15 #else
andrewbonney 0:07919e3d6c56 16 #ifdef HAVE_EXPAT_CONFIG_H
andrewbonney 0:07919e3d6c56 17 #include <expat_config.h>
andrewbonney 0:07919e3d6c56 18 #endif
andrewbonney 0:07919e3d6c56 19 #endif /* ndef COMPILED_FROM_DSP */
andrewbonney 0:07919e3d6c56 20
andrewbonney 0:07919e3d6c56 21 #include "expat_external.h"
andrewbonney 0:07919e3d6c56 22 #include "internal.h"
andrewbonney 0:07919e3d6c56 23 #include "xmltok.h"
andrewbonney 0:07919e3d6c56 24 #include "nametab.h"
andrewbonney 0:07919e3d6c56 25
/* Extra VTABLE1 entry for the ignore-section scanner.  It expands to
   ", PREFIX(ignoreSectionTok)" only when XML_DTD is defined and to nothing
   otherwise. */
#ifndef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#else
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#endif
andrewbonney 0:07919e3d6c56 31
/* VTABLE1: initializer fragment naming the tokenizer entry points produced
   by PREFIX(): a braced group of scanners (prologTok, contentTok,
   cdataSectionTok and, via IGNORE_SECTION_TOK_VTABLE, optionally
   ignoreSectionTok), a braced group of literal scanners (attributeValueTok,
   entityValueTok), then the remaining helpers.
   NOTE(review): the order presumably mirrors the member order of ENCODING
   in xmltok.h -- confirm before reordering. */
andrewbonney 0:07919e3d6c56 32 #define VTABLE1 \
andrewbonney 0:07919e3d6c56 33 { PREFIX(prologTok), PREFIX(contentTok), \
andrewbonney 0:07919e3d6c56 34 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
andrewbonney 0:07919e3d6c56 35 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
andrewbonney 0:07919e3d6c56 36 PREFIX(sameName), \
andrewbonney 0:07919e3d6c56 37 PREFIX(nameMatchesAscii), \
andrewbonney 0:07919e3d6c56 38 PREFIX(nameLength), \
andrewbonney 0:07919e3d6c56 39 PREFIX(skipS), \
andrewbonney 0:07919e3d6c56 40 PREFIX(getAtts), \
andrewbonney 0:07919e3d6c56 41 PREFIX(charRefNumber), \
andrewbonney 0:07919e3d6c56 42 PREFIX(predefinedEntityName), \
andrewbonney 0:07919e3d6c56 43 PREFIX(updatePosition), \
andrewbonney 0:07919e3d6c56 44 PREFIX(isPublicId)
andrewbonney 0:07919e3d6c56 45
/* VTABLE: VTABLE1 followed by the PREFIX()-generated toUtf8 and toUtf16
   converters. */
andrewbonney 0:07919e3d6c56 46 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
andrewbonney 0:07919e3d6c56 47
/* Look up the naming bit for the UCS-2 character with high byte `hi` and
   low byte `lo`.  pages[hi] selects a group of eight namingBitmap words,
   the top three bits of lo pick the word inside that group and the low
   five bits pick the bit.  Evaluates to nonzero when the bit is set.
   The mask is built from an unsigned 1 so that selecting bit 31 does not
   shift into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
andrewbonney 0:07919e3d6c56 50
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must point at unsigned bytes.  The mask uses an unsigned 1 so
   that bit 31 can be selected without shifting into the sign bit of an
   int.  Evaluates to nonzero when the naming bit is set.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
andrewbonney 0:07919e3d6c56 60
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  `byte` must point at unsigned bytes.  The mask uses an unsigned
   1 so that bit 31 can be selected without shifting into the sign bit of
   an int.  Evaluates to nonzero when the naming bit is set.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
andrewbonney 0:07919e3d6c56 73
/* Naming lookup for an n-byte UTF-8 sequence at p: dispatches to
   UTF8_GET_NAMING2 when n is 2 and to UTF8_GET_NAMING3 when n is 3.
   Any other n yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) != 2 \
   ? ((n) != 3 \
      ? 0 \
      : UTF8_GET_NAMING3(pages, (const unsigned char *)(p))) \
   : UTF8_GET_NAMING2(pages, (const unsigned char *)(p)))
andrewbonney 0:07919e3d6c56 80
andrewbonney 0:07919e3d6c56 81 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
andrewbonney 0:07919e3d6c56 82 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
andrewbonney 0:07919e3d6c56 83 with the additional restriction of not allowing the Unicode
andrewbonney 0:07919e3d6c56 84 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
andrewbonney 0:07919e3d6c56 85 Implementation details:
andrewbonney 0:07919e3d6c56 86 (A & 0x80) == 0 means A < 0x80
andrewbonney 0:07919e3d6c56 87 and
andrewbonney 0:07919e3d6c56 88 (A & 0xC0) == 0xC0 means A > 0xBF
andrewbonney 0:07919e3d6c56 89 */
andrewbonney 0:07919e3d6c56 90
/* Nonzero when the two unsigned bytes at p are not a valid 2-byte UTF-8
   sequence: the lead byte is below 0xC2, or the second byte is not in
   0x80..0xBF.  The lead byte is read as (p)[0] so that the argument may
   be any pointer expression (for example `buf + 1`). */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
andrewbonney 0:07919e3d6c56 93
/* Nonzero when the three unsigned bytes at p are not an acceptable 3-byte
   UTF-8 sequence.  Rejected: a third byte outside 0x80..0xBF, the
   sequences EF BF BE and EF BF BF, a second byte below 0xA0 after E0,
   a second byte above 0x9F after ED, and otherwise a second byte outside
   0x80..0xBF.  The lead byte is read as (p)[0] so that the argument may
   be any pointer expression. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
andrewbonney 0:07919e3d6c56 110
/* Nonzero when the four unsigned bytes at p are not an acceptable 4-byte
   UTF-8 sequence.  Rejected: a third or fourth byte outside 0x80..0xBF,
   a second byte below 0x90 after F0, a second byte above 0x8F after F4,
   and otherwise a second byte outside 0x80..0xBF.  The lead byte is read
   as (p)[0] so that the argument may be any pointer expression. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
andrewbonney 0:07919e3d6c56 123
andrewbonney 0:07919e3d6c56 124 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 125 isNever(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 126 {
andrewbonney 0:07919e3d6c56 127 return 0;
andrewbonney 0:07919e3d6c56 128 }
andrewbonney 0:07919e3d6c56 129
andrewbonney 0:07919e3d6c56 130 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 131 utf8_isName2(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 132 {
andrewbonney 0:07919e3d6c56 133 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
andrewbonney 0:07919e3d6c56 134 }
andrewbonney 0:07919e3d6c56 135
andrewbonney 0:07919e3d6c56 136 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 137 utf8_isName3(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 138 {
andrewbonney 0:07919e3d6c56 139 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
andrewbonney 0:07919e3d6c56 140 }
andrewbonney 0:07919e3d6c56 141
andrewbonney 0:07919e3d6c56 142 #define utf8_isName4 isNever
andrewbonney 0:07919e3d6c56 143
andrewbonney 0:07919e3d6c56 144 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 145 utf8_isNmstrt2(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 146 {
andrewbonney 0:07919e3d6c56 147 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
andrewbonney 0:07919e3d6c56 148 }
andrewbonney 0:07919e3d6c56 149
andrewbonney 0:07919e3d6c56 150 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 151 utf8_isNmstrt3(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 152 {
andrewbonney 0:07919e3d6c56 153 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
andrewbonney 0:07919e3d6c56 154 }
andrewbonney 0:07919e3d6c56 155
andrewbonney 0:07919e3d6c56 156 #define utf8_isNmstrt4 isNever
andrewbonney 0:07919e3d6c56 157
andrewbonney 0:07919e3d6c56 158 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 159 utf8_isInvalid2(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 160 {
andrewbonney 0:07919e3d6c56 161 return UTF8_INVALID2((const unsigned char *)p);
andrewbonney 0:07919e3d6c56 162 }
andrewbonney 0:07919e3d6c56 163
andrewbonney 0:07919e3d6c56 164 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 165 utf8_isInvalid3(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 166 {
andrewbonney 0:07919e3d6c56 167 return UTF8_INVALID3((const unsigned char *)p);
andrewbonney 0:07919e3d6c56 168 }
andrewbonney 0:07919e3d6c56 169
andrewbonney 0:07919e3d6c56 170 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 171 utf8_isInvalid4(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 172 {
andrewbonney 0:07919e3d6c56 173 return UTF8_INVALID4((const unsigned char *)p);
andrewbonney 0:07919e3d6c56 174 }
andrewbonney 0:07919e3d6c56 175
/* Descriptor for byte-oriented encodings.  It starts with the generic
   ENCODING header (AS_NORMAL_ENCODING casts an ENCODING pointer back to
   this struct), followed by a 256-entry table of per-byte type codes
   indexed by byte value (see SB_BYTE_TYPE) and function pointers that
   classify (isName*, isNmstrt*) and validate (isInvalid*) 2-, 3- and
   4-byte sequences.  When XML_MIN_SIZE is defined, five more pointers
   (byteType, isNameMin, isNmstrtMin, byteToAscii, charMatches) sit between
   the table and the isName* pointers; the BYTE_TYPE, BYTE_TO_ASCII,
   CHAR_MATCHES and *_MINBPC macros then dispatch through them.
   Initializers in this file rely on this member order. */
andrewbonney 0:07919e3d6c56 176 struct normal_encoding {
andrewbonney 0:07919e3d6c56 177 ENCODING enc;
andrewbonney 0:07919e3d6c56 178 unsigned char type[256];
andrewbonney 0:07919e3d6c56 179 #ifdef XML_MIN_SIZE
andrewbonney 0:07919e3d6c56 180 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 181 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 182 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 183 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 184 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
andrewbonney 0:07919e3d6c56 185 #endif /* XML_MIN_SIZE */
andrewbonney 0:07919e3d6c56 186 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 187 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 188 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 189 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 190 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 191 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 192 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 193 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 194 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
andrewbonney 0:07919e3d6c56 195 };
andrewbonney 0:07919e3d6c56 196
andrewbonney 0:07919e3d6c56 197 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
andrewbonney 0:07919e3d6c56 198
/* STANDARD_VTABLE(E): initializer fragment for the five XML_MIN_SIZE-only
   members of struct normal_encoding (byteType, isNameMin, isNmstrtMin,
   byteToAscii, charMatches), each name prefixed with E.  The expansion
   ends with a comma so that further initializers can follow directly.
   Without XML_MIN_SIZE those members do not exist and the macro expands
   to nothing. */
andrewbonney 0:07919e3d6c56 199 #ifdef XML_MIN_SIZE
andrewbonney 0:07919e3d6c56 200
andrewbonney 0:07919e3d6c56 201 #define STANDARD_VTABLE(E) \
andrewbonney 0:07919e3d6c56 202 E ## byteType, \
andrewbonney 0:07919e3d6c56 203 E ## isNameMin, \
andrewbonney 0:07919e3d6c56 204 E ## isNmstrtMin, \
andrewbonney 0:07919e3d6c56 205 E ## byteToAscii, \
andrewbonney 0:07919e3d6c56 206 E ## charMatches,
andrewbonney 0:07919e3d6c56 207
andrewbonney 0:07919e3d6c56 208 #else
andrewbonney 0:07919e3d6c56 209
andrewbonney 0:07919e3d6c56 210 #define STANDARD_VTABLE(E) /* as nothing */
andrewbonney 0:07919e3d6c56 211
andrewbonney 0:07919e3d6c56 212 #endif
andrewbonney 0:07919e3d6c56 213
/* NORMAL_VTABLE(E): initializer fragment for the nine multi-byte
   predicates of struct normal_encoding, in member order (isName2..4,
   isNmstrt2..4, isInvalid2..4), each name prefixed with E.  There is no
   trailing comma. */
andrewbonney 0:07919e3d6c56 214 #define NORMAL_VTABLE(E) \
andrewbonney 0:07919e3d6c56 215 E ## isName2, \
andrewbonney 0:07919e3d6c56 216 E ## isName3, \
andrewbonney 0:07919e3d6c56 217 E ## isName4, \
andrewbonney 0:07919e3d6c56 218 E ## isNmstrt2, \
andrewbonney 0:07919e3d6c56 219 E ## isNmstrt3, \
andrewbonney 0:07919e3d6c56 220 E ## isNmstrt4, \
andrewbonney 0:07919e3d6c56 221 E ## isInvalid2, \
andrewbonney 0:07919e3d6c56 222 E ## isInvalid3, \
andrewbonney 0:07919e3d6c56 223 E ## isInvalid4
andrewbonney 0:07919e3d6c56 224
andrewbonney 0:07919e3d6c56 225 static int FASTCALL checkCharRefNumber(int);
andrewbonney 0:07919e3d6c56 226
andrewbonney 0:07919e3d6c56 227 #include "xmltok_impl.h"
andrewbonney 0:07919e3d6c56 228 #include "ascii.h"
andrewbonney 0:07919e3d6c56 229
andrewbonney 0:07919e3d6c56 230 #ifdef XML_MIN_SIZE
andrewbonney 0:07919e3d6c56 231 #define sb_isNameMin isNever
andrewbonney 0:07919e3d6c56 232 #define sb_isNmstrtMin isNever
andrewbonney 0:07919e3d6c56 233 #endif
andrewbonney 0:07919e3d6c56 234
andrewbonney 0:07919e3d6c56 235 #ifdef XML_MIN_SIZE
andrewbonney 0:07919e3d6c56 236 #define MINBPC(enc) ((enc)->minBytesPerChar)
andrewbonney 0:07919e3d6c56 237 #else
andrewbonney 0:07919e3d6c56 238 /* minimum bytes per character */
andrewbonney 0:07919e3d6c56 239 #define MINBPC(enc) 1
andrewbonney 0:07919e3d6c56 240 #endif
andrewbonney 0:07919e3d6c56 241
/* Type code of the byte at p, read from the 256-entry type table of the
   normal_encoding that enc points into.  The byte is converted to
   unsigned char before indexing so that a negative plain char cannot
   produce a negative index.  AS_NORMAL_ENCODING is used so that the const
   qualifier on enc is kept. */
#define SB_BYTE_TYPE(enc, p) \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])
andrewbonney 0:07919e3d6c56 244
/* BYTE_TYPE(enc, p): type code of the byte at p.  In the default build it
   is a direct SB_BYTE_TYPE table lookup.  With XML_MIN_SIZE it calls the
   descriptor's byteType pointer, and sb_byteType is the table-lookup
   function provided for that pointer. */
#ifndef XML_MIN_SIZE
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#else
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  const int kind = SB_BYTE_TYPE(enc, p);
  return kind;
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#endif
andrewbonney 0:07919e3d6c56 256
/* BYTE_TO_ASCII(enc, p): value of the byte at p.  In the default build it
   is a plain dereference.  With XML_MIN_SIZE it calls the descriptor's
   byteToAscii pointer, and sb_byteToAscii is the dereferencing function
   provided for that pointer. */
#ifndef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) (*(p))
#else
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  const int value = *p;
  return value;
}
#endif
andrewbonney 0:07919e3d6c56 268
andrewbonney 0:07919e3d6c56 269 #define IS_NAME_CHAR(enc, p, n) \
andrewbonney 0:07919e3d6c56 270 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
andrewbonney 0:07919e3d6c56 271 #define IS_NMSTRT_CHAR(enc, p, n) \
andrewbonney 0:07919e3d6c56 272 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
andrewbonney 0:07919e3d6c56 273 #define IS_INVALID_CHAR(enc, p, n) \
andrewbonney 0:07919e3d6c56 274 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
andrewbonney 0:07919e3d6c56 275
andrewbonney 0:07919e3d6c56 276 #ifdef XML_MIN_SIZE
andrewbonney 0:07919e3d6c56 277 #define IS_NAME_CHAR_MINBPC(enc, p) \
andrewbonney 0:07919e3d6c56 278 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
andrewbonney 0:07919e3d6c56 279 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
andrewbonney 0:07919e3d6c56 280 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
andrewbonney 0:07919e3d6c56 281 #else
andrewbonney 0:07919e3d6c56 282 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
andrewbonney 0:07919e3d6c56 283 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
andrewbonney 0:07919e3d6c56 284 #endif
andrewbonney 0:07919e3d6c56 285
/* CHAR_MATCHES(enc, p, c): nonzero when the byte at p equals c.  In the
   default build it is an inline comparison; c is parenthesized so that an
   expression argument (for example a conditional) is compared as a whole.
   With XML_MIN_SIZE it calls the descriptor's charMatches pointer, and
   sb_charMatches is the comparing function provided for that pointer. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
andrewbonney 0:07919e3d6c56 298
andrewbonney 0:07919e3d6c56 299 #define PREFIX(ident) normal_ ## ident
andrewbonney 0:07919e3d6c56 300 #define XML_TOK_IMPL_C
andrewbonney 0:07919e3d6c56 301 #include "xmltok_impl.c"
andrewbonney 0:07919e3d6c56 302 #undef XML_TOK_IMPL_C
andrewbonney 0:07919e3d6c56 303
andrewbonney 0:07919e3d6c56 304 #undef MINBPC
andrewbonney 0:07919e3d6c56 305 #undef BYTE_TYPE
andrewbonney 0:07919e3d6c56 306 #undef BYTE_TO_ASCII
andrewbonney 0:07919e3d6c56 307 #undef CHAR_MATCHES
andrewbonney 0:07919e3d6c56 308 #undef IS_NAME_CHAR
andrewbonney 0:07919e3d6c56 309 #undef IS_NAME_CHAR_MINBPC
andrewbonney 0:07919e3d6c56 310 #undef IS_NMSTRT_CHAR
andrewbonney 0:07919e3d6c56 311 #undef IS_NMSTRT_CHAR_MINBPC
andrewbonney 0:07919e3d6c56 312 #undef IS_INVALID_CHAR
andrewbonney 0:07919e3d6c56 313
andrewbonney 0:07919e3d6c56 314 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
andrewbonney 0:07919e3d6c56 315 UTF8_cval1 = 0x00,
andrewbonney 0:07919e3d6c56 316 UTF8_cval2 = 0xc0,
andrewbonney 0:07919e3d6c56 317 UTF8_cval3 = 0xe0,
andrewbonney 0:07919e3d6c56 318 UTF8_cval4 = 0xf0
andrewbonney 0:07919e3d6c56 319 };
andrewbonney 0:07919e3d6c56 320
andrewbonney 0:07919e3d6c56 321 static void PTRCALL
andrewbonney 0:07919e3d6c56 322 utf8_toUtf8(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 323 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 324 char **toP, const char *toLim)
andrewbonney 0:07919e3d6c56 325 {
andrewbonney 0:07919e3d6c56 326 char *to;
andrewbonney 0:07919e3d6c56 327 const char *from;
andrewbonney 0:07919e3d6c56 328 if (fromLim - *fromP > toLim - *toP) {
andrewbonney 0:07919e3d6c56 329 /* Avoid copying partial characters. */
andrewbonney 0:07919e3d6c56 330 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
andrewbonney 0:07919e3d6c56 331 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
andrewbonney 0:07919e3d6c56 332 break;
andrewbonney 0:07919e3d6c56 333 }
andrewbonney 0:07919e3d6c56 334 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
andrewbonney 0:07919e3d6c56 335 *to = *from;
andrewbonney 0:07919e3d6c56 336 *fromP = from;
andrewbonney 0:07919e3d6c56 337 *toP = to;
andrewbonney 0:07919e3d6c56 338 }
andrewbonney 0:07919e3d6c56 339
andrewbonney 0:07919e3d6c56 340 static void PTRCALL
andrewbonney 0:07919e3d6c56 341 utf8_toUtf16(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 342 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 343 unsigned short **toP, const unsigned short *toLim)
andrewbonney 0:07919e3d6c56 344 {
andrewbonney 0:07919e3d6c56 345 unsigned short *to = *toP;
andrewbonney 0:07919e3d6c56 346 const char *from = *fromP;
andrewbonney 0:07919e3d6c56 347 while (from != fromLim && to != toLim) {
andrewbonney 0:07919e3d6c56 348 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
andrewbonney 0:07919e3d6c56 349 case BT_LEAD2:
andrewbonney 0:07919e3d6c56 350 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
andrewbonney 0:07919e3d6c56 351 from += 2;
andrewbonney 0:07919e3d6c56 352 break;
andrewbonney 0:07919e3d6c56 353 case BT_LEAD3:
andrewbonney 0:07919e3d6c56 354 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
andrewbonney 0:07919e3d6c56 355 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
andrewbonney 0:07919e3d6c56 356 from += 3;
andrewbonney 0:07919e3d6c56 357 break;
andrewbonney 0:07919e3d6c56 358 case BT_LEAD4:
andrewbonney 0:07919e3d6c56 359 {
andrewbonney 0:07919e3d6c56 360 unsigned long n;
andrewbonney 0:07919e3d6c56 361 if (to + 1 == toLim)
andrewbonney 0:07919e3d6c56 362 goto after;
andrewbonney 0:07919e3d6c56 363 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
andrewbonney 0:07919e3d6c56 364 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
andrewbonney 0:07919e3d6c56 365 n -= 0x10000;
andrewbonney 0:07919e3d6c56 366 to[0] = (unsigned short)((n >> 10) | 0xD800);
andrewbonney 0:07919e3d6c56 367 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
andrewbonney 0:07919e3d6c56 368 to += 2;
andrewbonney 0:07919e3d6c56 369 from += 4;
andrewbonney 0:07919e3d6c56 370 }
andrewbonney 0:07919e3d6c56 371 break;
andrewbonney 0:07919e3d6c56 372 default:
andrewbonney 0:07919e3d6c56 373 *to++ = *from++;
andrewbonney 0:07919e3d6c56 374 break;
andrewbonney 0:07919e3d6c56 375 }
andrewbonney 0:07919e3d6c56 376 }
andrewbonney 0:07919e3d6c56 377 after:
andrewbonney 0:07919e3d6c56 378 *fromP = from;
andrewbonney 0:07919e3d6c56 379 *toP = to;
andrewbonney 0:07919e3d6c56 380 }
andrewbonney 0:07919e3d6c56 381
/* UTF-8 encoding descriptor, compiled only when XML_NS is defined.  The
   byte-type table is asciitab.h followed by utf8tab.h, included without
   redefining BT_COLON (compare utf8_encoding).  The converters are
   utf8_toUtf8 and utf8_toUtf16; the predicates come from
   STANDARD_VTABLE(sb_) and NORMAL_VTABLE(utf8_). */
andrewbonney 0:07919e3d6c56 382 #ifdef XML_NS
andrewbonney 0:07919e3d6c56 383 static const struct normal_encoding utf8_encoding_ns = {
andrewbonney 0:07919e3d6c56 384 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 385 {
andrewbonney 0:07919e3d6c56 386 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 387 #include "utf8tab.h"
andrewbonney 0:07919e3d6c56 388 },
andrewbonney 0:07919e3d6c56 389 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
andrewbonney 0:07919e3d6c56 390 };
andrewbonney 0:07919e3d6c56 391 #endif
andrewbonney 0:07919e3d6c56 392
/* UTF-8 encoding descriptor.  Same layout as utf8_encoding_ns, except
   that BT_COLON is defined as BT_NMSTRT while asciitab.h is included.
   NOTE(review): presumably this makes the ':' table entry an ordinary
   name-start character -- confirm against asciitab.h. */
andrewbonney 0:07919e3d6c56 393 static const struct normal_encoding utf8_encoding = {
andrewbonney 0:07919e3d6c56 394 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 395 {
andrewbonney 0:07919e3d6c56 396 #define BT_COLON BT_NMSTRT
andrewbonney 0:07919e3d6c56 397 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 398 #undef BT_COLON
andrewbonney 0:07919e3d6c56 399 #include "utf8tab.h"
andrewbonney 0:07919e3d6c56 400 },
andrewbonney 0:07919e3d6c56 401 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
andrewbonney 0:07919e3d6c56 402 };
andrewbonney 0:07919e3d6c56 403
/* "Internal" UTF-8 encoding descriptor, compiled only when XML_NS is
   defined.  It is identical to utf8_encoding_ns except that the first
   half of the byte-type table comes from iasciitab.h instead of
   asciitab.h.
   NOTE(review): how iasciitab.h differs is not visible here -- see that
   header. */
andrewbonney 0:07919e3d6c56 404 #ifdef XML_NS
andrewbonney 0:07919e3d6c56 405
andrewbonney 0:07919e3d6c56 406 static const struct normal_encoding internal_utf8_encoding_ns = {
andrewbonney 0:07919e3d6c56 407 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 408 {
andrewbonney 0:07919e3d6c56 409 #include "iasciitab.h"
andrewbonney 0:07919e3d6c56 410 #include "utf8tab.h"
andrewbonney 0:07919e3d6c56 411 },
andrewbonney 0:07919e3d6c56 412 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
andrewbonney 0:07919e3d6c56 413 };
andrewbonney 0:07919e3d6c56 414
andrewbonney 0:07919e3d6c56 415 #endif
andrewbonney 0:07919e3d6c56 416
/* "Internal" UTF-8 encoding descriptor.  It is identical to utf8_encoding
   except that the first half of the byte-type table comes from
   iasciitab.h; BT_COLON is again defined as BT_NMSTRT while that header
   is included. */
andrewbonney 0:07919e3d6c56 417 static const struct normal_encoding internal_utf8_encoding = {
andrewbonney 0:07919e3d6c56 418 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 419 {
andrewbonney 0:07919e3d6c56 420 #define BT_COLON BT_NMSTRT
andrewbonney 0:07919e3d6c56 421 #include "iasciitab.h"
andrewbonney 0:07919e3d6c56 422 #undef BT_COLON
andrewbonney 0:07919e3d6c56 423 #include "utf8tab.h"
andrewbonney 0:07919e3d6c56 424 },
andrewbonney 0:07919e3d6c56 425 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
andrewbonney 0:07919e3d6c56 426 };
andrewbonney 0:07919e3d6c56 427
andrewbonney 0:07919e3d6c56 428 static void PTRCALL
andrewbonney 0:07919e3d6c56 429 latin1_toUtf8(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 430 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 431 char **toP, const char *toLim)
andrewbonney 0:07919e3d6c56 432 {
andrewbonney 0:07919e3d6c56 433 for (;;) {
andrewbonney 0:07919e3d6c56 434 unsigned char c;
andrewbonney 0:07919e3d6c56 435 if (*fromP == fromLim)
andrewbonney 0:07919e3d6c56 436 break;
andrewbonney 0:07919e3d6c56 437 c = (unsigned char)**fromP;
andrewbonney 0:07919e3d6c56 438 if (c & 0x80) {
andrewbonney 0:07919e3d6c56 439 if (toLim - *toP < 2)
andrewbonney 0:07919e3d6c56 440 break;
andrewbonney 0:07919e3d6c56 441 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
andrewbonney 0:07919e3d6c56 442 *(*toP)++ = (char)((c & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 443 (*fromP)++;
andrewbonney 0:07919e3d6c56 444 }
andrewbonney 0:07919e3d6c56 445 else {
andrewbonney 0:07919e3d6c56 446 if (*toP == toLim)
andrewbonney 0:07919e3d6c56 447 break;
andrewbonney 0:07919e3d6c56 448 *(*toP)++ = *(*fromP)++;
andrewbonney 0:07919e3d6c56 449 }
andrewbonney 0:07919e3d6c56 450 }
andrewbonney 0:07919e3d6c56 451 }
andrewbonney 0:07919e3d6c56 452
andrewbonney 0:07919e3d6c56 453 static void PTRCALL
andrewbonney 0:07919e3d6c56 454 latin1_toUtf16(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 455 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 456 unsigned short **toP, const unsigned short *toLim)
andrewbonney 0:07919e3d6c56 457 {
andrewbonney 0:07919e3d6c56 458 while (*fromP != fromLim && *toP != toLim)
andrewbonney 0:07919e3d6c56 459 *(*toP)++ = (unsigned char)*(*fromP)++;
andrewbonney 0:07919e3d6c56 460 }
andrewbonney 0:07919e3d6c56 461
/* Latin-1 encoding descriptor, compiled only when XML_NS is defined.  The
   byte-type table is asciitab.h followed by latin1tab.h; the converters
   are latin1_toUtf8 and latin1_toUtf16.  Only STANDARD_VTABLE(sb_) is
   given, so the isName*, isNmstrt* and isInvalid* pointers have no
   initializer and are therefore null. */
andrewbonney 0:07919e3d6c56 462 #ifdef XML_NS
andrewbonney 0:07919e3d6c56 463
andrewbonney 0:07919e3d6c56 464 static const struct normal_encoding latin1_encoding_ns = {
andrewbonney 0:07919e3d6c56 465 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
andrewbonney 0:07919e3d6c56 466 {
andrewbonney 0:07919e3d6c56 467 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 468 #include "latin1tab.h"
andrewbonney 0:07919e3d6c56 469 },
andrewbonney 0:07919e3d6c56 470 STANDARD_VTABLE(sb_)
andrewbonney 0:07919e3d6c56 471 };
andrewbonney 0:07919e3d6c56 472
andrewbonney 0:07919e3d6c56 473 #endif
andrewbonney 0:07919e3d6c56 474
/* Latin-1 encoding descriptor.  Same as latin1_encoding_ns, except that
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included.  The
   multi-byte predicate pointers have no initializer and are therefore
   null. */
andrewbonney 0:07919e3d6c56 475 static const struct normal_encoding latin1_encoding = {
andrewbonney 0:07919e3d6c56 476 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
andrewbonney 0:07919e3d6c56 477 {
andrewbonney 0:07919e3d6c56 478 #define BT_COLON BT_NMSTRT
andrewbonney 0:07919e3d6c56 479 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 480 #undef BT_COLON
andrewbonney 0:07919e3d6c56 481 #include "latin1tab.h"
andrewbonney 0:07919e3d6c56 482 },
andrewbonney 0:07919e3d6c56 483 STANDARD_VTABLE(sb_)
andrewbonney 0:07919e3d6c56 484 };
andrewbonney 0:07919e3d6c56 485
andrewbonney 0:07919e3d6c56 486 static void PTRCALL
andrewbonney 0:07919e3d6c56 487 ascii_toUtf8(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 488 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 489 char **toP, const char *toLim)
andrewbonney 0:07919e3d6c56 490 {
andrewbonney 0:07919e3d6c56 491 while (*fromP != fromLim && *toP != toLim)
andrewbonney 0:07919e3d6c56 492 *(*toP)++ = *(*fromP)++;
andrewbonney 0:07919e3d6c56 493 }
andrewbonney 0:07919e3d6c56 494
/* ASCII encoding descriptor, compiled only when XML_NS is defined.  Only
   asciitab.h is included into the byte-type table; the remaining entries
   have no initializer and are therefore 0, which the comment below
   identifies as BT_NONXML.  The converters are ascii_toUtf8 and
   latin1_toUtf16. */
andrewbonney 0:07919e3d6c56 495 #ifdef XML_NS
andrewbonney 0:07919e3d6c56 496
andrewbonney 0:07919e3d6c56 497 static const struct normal_encoding ascii_encoding_ns = {
andrewbonney 0:07919e3d6c56 498 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 499 {
andrewbonney 0:07919e3d6c56 500 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 501 /* BT_NONXML == 0 */
andrewbonney 0:07919e3d6c56 502 },
andrewbonney 0:07919e3d6c56 503 STANDARD_VTABLE(sb_)
andrewbonney 0:07919e3d6c56 504 };
andrewbonney 0:07919e3d6c56 505
andrewbonney 0:07919e3d6c56 506 #endif
andrewbonney 0:07919e3d6c56 507
/* ASCII encoding descriptor.  Same as ascii_encoding_ns, except that
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included.  Table
   entries not supplied by asciitab.h have no initializer and are
   therefore 0 (BT_NONXML per the comment below). */
andrewbonney 0:07919e3d6c56 508 static const struct normal_encoding ascii_encoding = {
andrewbonney 0:07919e3d6c56 509 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
andrewbonney 0:07919e3d6c56 510 {
andrewbonney 0:07919e3d6c56 511 #define BT_COLON BT_NMSTRT
andrewbonney 0:07919e3d6c56 512 #include "asciitab.h"
andrewbonney 0:07919e3d6c56 513 #undef BT_COLON
andrewbonney 0:07919e3d6c56 514 /* BT_NONXML == 0 */
andrewbonney 0:07919e3d6c56 515 },
andrewbonney 0:07919e3d6c56 516 STANDARD_VTABLE(sb_)
andrewbonney 0:07919e3d6c56 517 };
andrewbonney 0:07919e3d6c56 518
andrewbonney 0:07919e3d6c56 519 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 520 unicode_byte_type(char hi, char lo)
andrewbonney 0:07919e3d6c56 521 {
andrewbonney 0:07919e3d6c56 522 switch ((unsigned char)hi) {
andrewbonney 0:07919e3d6c56 523 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
andrewbonney 0:07919e3d6c56 524 return BT_LEAD4;
andrewbonney 0:07919e3d6c56 525 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
andrewbonney 0:07919e3d6c56 526 return BT_TRAIL;
andrewbonney 0:07919e3d6c56 527 case 0xFF:
andrewbonney 0:07919e3d6c56 528 switch ((unsigned char)lo) {
andrewbonney 0:07919e3d6c56 529 case 0xFF:
andrewbonney 0:07919e3d6c56 530 case 0xFE:
andrewbonney 0:07919e3d6c56 531 return BT_NONXML;
andrewbonney 0:07919e3d6c56 532 }
andrewbonney 0:07919e3d6c56 533 break;
andrewbonney 0:07919e3d6c56 534 }
andrewbonney 0:07919e3d6c56 535 return BT_NONASCII;
andrewbonney 0:07919e3d6c56 536 }
andrewbonney 0:07919e3d6c56 537
/* Generates E##toUtf8, which converts 2-byte code units (read through
   GET_LO and GET_HI) at *fromP into UTF-8 at *toP.  Units below 0x80
   produce one byte, units below 0x800 two bytes, units with high byte
   0xD8..0xDB are combined with the following unit into four bytes, and
   all other units produce three bytes.  When the output has no room for
   the next character, *fromP is set to the unit that was not converted
   and the function returns.  The second unit of a pair is read without
   comparing against fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *src = *fromP; \
  while (src != fromLim) { \
    const unsigned char low = GET_LO(src); \
    const unsigned char high = GET_HI(src); \
    if (high == 0 && low < 0x80) { \
      /* one output byte */ \
      if (*toP == toLim) { \
        *fromP = src; \
        return; \
      } \
      *(*toP)++ = low; \
    } \
    else if (high < 0x8) { \
      /* two output bytes */ \
      if (toLim - *toP < 2) { \
        *fromP = src; \
        return; \
      } \
      *(*toP)++ = ((low >> 6) | (high << 2) |  UTF8_cval2); \
      *(*toP)++ = ((low & 0x3f) | 0x80); \
    } \
    else if (high >= 0xD8 && high <= 0xDB) { \
      /* this unit plus the next one: four output bytes */ \
      int plane; \
      unsigned char low2; \
      if (toLim - *toP < 4) { \
        *fromP = src; \
        return; \
      } \
      plane = (((high & 0x3) << 2) | ((low >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((low >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      src += 2; \
      low2 = GET_LO(src); \
      *(*toP)++ = (((low & 0x3) << 4) \
                   | ((GET_HI(src) & 0x3) << 2) \
                   | (low2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((low2 & 0x3f) | 0x80); \
    } \
    else { \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      if (toLim - *toP < 3)  { \
        *fromP = src; \
        return; \
      } \
      *(*toP)++ = ((high >> 4) | UTF8_cval3); \
      *(*toP)++ = (((high & 0xf) << 2) | (low >> 6) | 0x80); \
      *(*toP)++ = ((low & 0x3f) | 0x80); \
    } \
    src += 2; \
  } \
  *fromP = src; \
}
andrewbonney 0:07919e3d6c56 600
/* DEFINE_UTF16_TO_UTF16(E) expands to the definition of E##toUtf16, which
   copies 2-byte units from the byte range [*fromP, fromLim) into the
   unsigned short range [*toP, toLim).  GET_HI and GET_LO must be defined
   at the point of expansion; they pick the high and low byte of each input
   unit and thereby fix the input byte order.  On return *fromP and *toP
   have been advanced past the units consumed and produced.  The enc
   argument is not used by the generated function.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Shrink the input to a whole number of 2-byte units.  With an odd */ \
  /* byte count the loop below would step over fromLim (the test is */ \
  /* "!=") and read past the end of the input. */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
andrewbonney 0:07919e3d6c56 614
/* Little-endian accessors for one 2-byte unit: ptr[0] is the low byte and
   ptr[1] is the high byte.  SET2 stores the low 8 bits of ch in ptr[0]
   and ch >> 8 in ptr[1]. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Expand the UTF-16 converters for the little2_ prefix while the
   little-endian accessors are in effect. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian accessors for one 2-byte unit: ptr[0] is the high byte and
   ptr[1] is the low byte. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Expand the UTF-16 converters for the big2_ prefix while the big-endian
   accessors are in effect. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
andrewbonney 0:07919e3d6c56 638
/* Classification helpers for one little-endian 2-byte unit at p
   (p[0] = low byte, p[1] = high byte).

   LITTLE2_BYTE_TYPE       byte type: looked up in the encoding's type[]
                           table when the high byte is 0, otherwise
                           computed by unicode_byte_type(hi, lo).
   LITTLE2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES    non-zero when the unit is exactly the
                           character c (high byte 0, low byte == c).
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                           look the unit up in namePages / nmstrtPages
                           via UCS2_GET_NAMING(pages, hi, lo).

   Every use of a macro argument is parenthesized so that compound
   expressions (e.g. "ptr + 2" or "x ? a : b") expand correctly.
*/
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
andrewbonney 0:07919e3d6c56 649
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the LITTLE2_* classification macros are wrapped in
   real functions, and VTABLE is redefined to VTABLE1 followed by the
   little2_ converters. */

/* Function form of LITTLE2_BYTE_TYPE for the 2-byte unit at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII: the low byte of the unit at p
   when its high byte is 0, otherwise -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

/* Function form of LITTLE2_CHAR_MATCHES: non-zero when the unit at p is
   the character c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC (namePages lookup). */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC (nmstrtPages lookup). */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Non-XML_MIN_SIZE build: bind the generic tokenizer macros to their
   little-endian 2-byte versions, then include xmltok_impl.c under
   PREFIX(ident) == little2_ident (presumably this is what names the
   generated tokenizer functions -- confirm in xmltok_impl.c). */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
/* The n-byte name checks are constant 0 for this encoding; only the
   MINBPC variants do a table lookup. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the bindings again so a later section can rebind them.
   IS_INVALID_CHAR is not defined in this section; its #undef has no
   effect here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
andrewbonney 0:07919e3d6c56 714
#ifdef XML_NS

/* Little-endian 2-byte encoding, XML_NS variant: the type table is built
   from asciitab.h and latin1tab.h with BT_COLON left as defined
   elsewhere. */
static const struct normal_encoding little2_encoding_ns = {
  /* After VTABLE come 2, 0 and a flag that is 1 only when BYTEORDER is
     1234 (presumably minBytesPerChar, isUtf8, isUtf16 -- confirm against
     the ENCODING struct declaration). */
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

/* Little-endian 2-byte encoding, non-namespace variant: BT_COLON is
   mapped to BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

/* The internal_ variants are compiled unless BYTEORDER is 4321.  They use
   iasciitab.h instead of asciitab.h and always set the last flag to 1. */
#if BYTEORDER != 4321

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
andrewbonney 0:07919e3d6c56 778
andrewbonney 0:07919e3d6c56 779
/* Classification helpers for one big-endian 2-byte unit at p
   (p[0] = high byte, p[1] = low byte).

   BIG2_BYTE_TYPE       byte type: looked up in the encoding's type[]
                        table when the high byte is 0, otherwise computed
                        by unicode_byte_type(hi, lo).
   BIG2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES    non-zero when the unit is exactly the character c
                        (high byte 0, low byte == c).
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                        look the unit up in namePages / nmstrtPages via
                        UCS2_GET_NAMING(pages, hi, lo).

   Every use of a macro argument is parenthesized so that compound
   expressions (e.g. "ptr + 2" or "x ? a : b") expand correctly.
*/
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
andrewbonney 0:07919e3d6c56 790
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the BIG2_* classification macros are wrapped in
   real functions, and VTABLE is redefined to VTABLE1 followed by the
   big2_ converters. */

/* Function form of BIG2_BYTE_TYPE for the 2-byte unit at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII: the low byte of the unit at p when
   its high byte is 0, otherwise -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

/* Function form of BIG2_CHAR_MATCHES: non-zero when the unit at p is the
   character c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC (namePages lookup). */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC (nmstrtPages lookup). */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Non-XML_MIN_SIZE build: bind the generic tokenizer macros to their
   big-endian 2-byte versions, then include xmltok_impl.c under
   PREFIX(ident) == big2_ident (presumably this is what names the
   generated tokenizer functions -- confirm in xmltok_impl.c). */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
/* The n-byte name checks are constant 0 for this encoding; only the
   MINBPC variants do a table lookup. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the bindings again.  IS_INVALID_CHAR is not defined in this
   section; its #undef has no effect here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
andrewbonney 0:07919e3d6c56 855
#ifdef XML_NS

/* Big-endian 2-byte encoding, XML_NS variant: the type table is built
   from asciitab.h and latin1tab.h with BT_COLON left as defined
   elsewhere. */
static const struct normal_encoding big2_encoding_ns = {
  /* After VTABLE come 2, 0 and a flag that is 1 only when BYTEORDER is
     4321 (presumably minBytesPerChar, isUtf8, isUtf16 -- confirm against
     the ENCODING struct declaration). */
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* Big-endian 2-byte encoding, non-namespace variant: BT_COLON is mapped
   to BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

/* The internal_ variants are compiled unless BYTEORDER is 1234.  They use
   iasciitab.h instead of asciitab.h and always set the last flag to 1. */
#if BYTEORDER != 1234

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* No tokenizer instantiation follows in this part of the file. */
#undef PREFIX
andrewbonney 0:07919e3d6c56 921
andrewbonney 0:07919e3d6c56 922 static int FASTCALL
andrewbonney 0:07919e3d6c56 923 streqci(const char *s1, const char *s2)
andrewbonney 0:07919e3d6c56 924 {
andrewbonney 0:07919e3d6c56 925 for (;;) {
andrewbonney 0:07919e3d6c56 926 char c1 = *s1++;
andrewbonney 0:07919e3d6c56 927 char c2 = *s2++;
andrewbonney 0:07919e3d6c56 928 if (ASCII_a <= c1 && c1 <= ASCII_z)
andrewbonney 0:07919e3d6c56 929 c1 += ASCII_A - ASCII_a;
andrewbonney 0:07919e3d6c56 930 if (ASCII_a <= c2 && c2 <= ASCII_z)
andrewbonney 0:07919e3d6c56 931 c2 += ASCII_A - ASCII_a;
andrewbonney 0:07919e3d6c56 932 if (c1 != c2)
andrewbonney 0:07919e3d6c56 933 return 0;
andrewbonney 0:07919e3d6c56 934 if (!c1)
andrewbonney 0:07919e3d6c56 935 break;
andrewbonney 0:07919e3d6c56 936 }
andrewbonney 0:07919e3d6c56 937 return 1;
andrewbonney 0:07919e3d6c56 938 }
andrewbonney 0:07919e3d6c56 939
andrewbonney 0:07919e3d6c56 940 static void PTRCALL
andrewbonney 0:07919e3d6c56 941 initUpdatePosition(const ENCODING *enc, const char *ptr,
andrewbonney 0:07919e3d6c56 942 const char *end, POSITION *pos)
andrewbonney 0:07919e3d6c56 943 {
andrewbonney 0:07919e3d6c56 944 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
andrewbonney 0:07919e3d6c56 945 }
andrewbonney 0:07919e3d6c56 946
andrewbonney 0:07919e3d6c56 947 static int
andrewbonney 0:07919e3d6c56 948 toAscii(const ENCODING *enc, const char *ptr, const char *end)
andrewbonney 0:07919e3d6c56 949 {
andrewbonney 0:07919e3d6c56 950 char buf[1];
andrewbonney 0:07919e3d6c56 951 char *p = buf;
andrewbonney 0:07919e3d6c56 952 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
andrewbonney 0:07919e3d6c56 953 if (p == buf)
andrewbonney 0:07919e3d6c56 954 return -1;
andrewbonney 0:07919e3d6c56 955 else
andrewbonney 0:07919e3d6c56 956 return buf[0];
andrewbonney 0:07919e3d6c56 957 }
andrewbonney 0:07919e3d6c56 958
andrewbonney 0:07919e3d6c56 959 static int FASTCALL
andrewbonney 0:07919e3d6c56 960 isSpace(int c)
andrewbonney 0:07919e3d6c56 961 {
andrewbonney 0:07919e3d6c56 962 switch (c) {
andrewbonney 0:07919e3d6c56 963 case 0x20:
andrewbonney 0:07919e3d6c56 964 case 0xD:
andrewbonney 0:07919e3d6c56 965 case 0xA:
andrewbonney 0:07919e3d6c56 966 case 0x9:
andrewbonney 0:07919e3d6c56 967 return 1;
andrewbonney 0:07919e3d6c56 968 }
andrewbonney 0:07919e3d6c56 969 return 0;
andrewbonney 0:07919e3d6c56 970 }
andrewbonney 0:07919e3d6c56 971
andrewbonney 0:07919e3d6c56 972 /* Return 1 if there's just optional white space or there's an S
andrewbonney 0:07919e3d6c56 973 followed by name=val.
andrewbonney 0:07919e3d6c56 974 */
andrewbonney 0:07919e3d6c56 975 static int
andrewbonney 0:07919e3d6c56 976 parsePseudoAttribute(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 977 const char *ptr,
andrewbonney 0:07919e3d6c56 978 const char *end,
andrewbonney 0:07919e3d6c56 979 const char **namePtr,
andrewbonney 0:07919e3d6c56 980 const char **nameEndPtr,
andrewbonney 0:07919e3d6c56 981 const char **valPtr,
andrewbonney 0:07919e3d6c56 982 const char **nextTokPtr)
andrewbonney 0:07919e3d6c56 983 {
andrewbonney 0:07919e3d6c56 984 int c;
andrewbonney 0:07919e3d6c56 985 char open;
andrewbonney 0:07919e3d6c56 986 if (ptr == end) {
andrewbonney 0:07919e3d6c56 987 *namePtr = NULL;
andrewbonney 0:07919e3d6c56 988 return 1;
andrewbonney 0:07919e3d6c56 989 }
andrewbonney 0:07919e3d6c56 990 if (!isSpace(toAscii(enc, ptr, end))) {
andrewbonney 0:07919e3d6c56 991 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 992 return 0;
andrewbonney 0:07919e3d6c56 993 }
andrewbonney 0:07919e3d6c56 994 do {
andrewbonney 0:07919e3d6c56 995 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 996 } while (isSpace(toAscii(enc, ptr, end)));
andrewbonney 0:07919e3d6c56 997 if (ptr == end) {
andrewbonney 0:07919e3d6c56 998 *namePtr = NULL;
andrewbonney 0:07919e3d6c56 999 return 1;
andrewbonney 0:07919e3d6c56 1000 }
andrewbonney 0:07919e3d6c56 1001 *namePtr = ptr;
andrewbonney 0:07919e3d6c56 1002 for (;;) {
andrewbonney 0:07919e3d6c56 1003 c = toAscii(enc, ptr, end);
andrewbonney 0:07919e3d6c56 1004 if (c == -1) {
andrewbonney 0:07919e3d6c56 1005 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 1006 return 0;
andrewbonney 0:07919e3d6c56 1007 }
andrewbonney 0:07919e3d6c56 1008 if (c == ASCII_EQUALS) {
andrewbonney 0:07919e3d6c56 1009 *nameEndPtr = ptr;
andrewbonney 0:07919e3d6c56 1010 break;
andrewbonney 0:07919e3d6c56 1011 }
andrewbonney 0:07919e3d6c56 1012 if (isSpace(c)) {
andrewbonney 0:07919e3d6c56 1013 *nameEndPtr = ptr;
andrewbonney 0:07919e3d6c56 1014 do {
andrewbonney 0:07919e3d6c56 1015 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1016 } while (isSpace(c = toAscii(enc, ptr, end)));
andrewbonney 0:07919e3d6c56 1017 if (c != ASCII_EQUALS) {
andrewbonney 0:07919e3d6c56 1018 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 1019 return 0;
andrewbonney 0:07919e3d6c56 1020 }
andrewbonney 0:07919e3d6c56 1021 break;
andrewbonney 0:07919e3d6c56 1022 }
andrewbonney 0:07919e3d6c56 1023 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1024 }
andrewbonney 0:07919e3d6c56 1025 if (ptr == *namePtr) {
andrewbonney 0:07919e3d6c56 1026 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 1027 return 0;
andrewbonney 0:07919e3d6c56 1028 }
andrewbonney 0:07919e3d6c56 1029 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1030 c = toAscii(enc, ptr, end);
andrewbonney 0:07919e3d6c56 1031 while (isSpace(c)) {
andrewbonney 0:07919e3d6c56 1032 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1033 c = toAscii(enc, ptr, end);
andrewbonney 0:07919e3d6c56 1034 }
andrewbonney 0:07919e3d6c56 1035 if (c != ASCII_QUOT && c != ASCII_APOS) {
andrewbonney 0:07919e3d6c56 1036 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 1037 return 0;
andrewbonney 0:07919e3d6c56 1038 }
andrewbonney 0:07919e3d6c56 1039 open = (char)c;
andrewbonney 0:07919e3d6c56 1040 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1041 *valPtr = ptr;
andrewbonney 0:07919e3d6c56 1042 for (;; ptr += enc->minBytesPerChar) {
andrewbonney 0:07919e3d6c56 1043 c = toAscii(enc, ptr, end);
andrewbonney 0:07919e3d6c56 1044 if (c == open)
andrewbonney 0:07919e3d6c56 1045 break;
andrewbonney 0:07919e3d6c56 1046 if (!(ASCII_a <= c && c <= ASCII_z)
andrewbonney 0:07919e3d6c56 1047 && !(ASCII_A <= c && c <= ASCII_Z)
andrewbonney 0:07919e3d6c56 1048 && !(ASCII_0 <= c && c <= ASCII_9)
andrewbonney 0:07919e3d6c56 1049 && c != ASCII_PERIOD
andrewbonney 0:07919e3d6c56 1050 && c != ASCII_MINUS
andrewbonney 0:07919e3d6c56 1051 && c != ASCII_UNDERSCORE) {
andrewbonney 0:07919e3d6c56 1052 *nextTokPtr = ptr;
andrewbonney 0:07919e3d6c56 1053 return 0;
andrewbonney 0:07919e3d6c56 1054 }
andrewbonney 0:07919e3d6c56 1055 }
andrewbonney 0:07919e3d6c56 1056 *nextTokPtr = ptr + enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1057 return 1;
andrewbonney 0:07919e3d6c56 1058 }
andrewbonney 0:07919e3d6c56 1059
/* NUL-terminated keyword strings spelled with the ASCII_* character
   constants rather than string literals. */

/* "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s, '\0'
};

/* "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o, '\0'
};
andrewbonney 0:07919e3d6c56 1080
andrewbonney 0:07919e3d6c56 1081 static int
andrewbonney 0:07919e3d6c56 1082 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
andrewbonney 0:07919e3d6c56 1083 const char *,
andrewbonney 0:07919e3d6c56 1084 const char *),
andrewbonney 0:07919e3d6c56 1085 int isGeneralTextEntity,
andrewbonney 0:07919e3d6c56 1086 const ENCODING *enc,
andrewbonney 0:07919e3d6c56 1087 const char *ptr,
andrewbonney 0:07919e3d6c56 1088 const char *end,
andrewbonney 0:07919e3d6c56 1089 const char **badPtr,
andrewbonney 0:07919e3d6c56 1090 const char **versionPtr,
andrewbonney 0:07919e3d6c56 1091 const char **versionEndPtr,
andrewbonney 0:07919e3d6c56 1092 const char **encodingName,
andrewbonney 0:07919e3d6c56 1093 const ENCODING **encoding,
andrewbonney 0:07919e3d6c56 1094 int *standalone)
andrewbonney 0:07919e3d6c56 1095 {
andrewbonney 0:07919e3d6c56 1096 const char *val = NULL;
andrewbonney 0:07919e3d6c56 1097 const char *name = NULL;
andrewbonney 0:07919e3d6c56 1098 const char *nameEnd = NULL;
andrewbonney 0:07919e3d6c56 1099 ptr += 5 * enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1100 end -= 2 * enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1101 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
andrewbonney 0:07919e3d6c56 1102 || !name) {
andrewbonney 0:07919e3d6c56 1103 *badPtr = ptr;
andrewbonney 0:07919e3d6c56 1104 return 0;
andrewbonney 0:07919e3d6c56 1105 }
andrewbonney 0:07919e3d6c56 1106 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
andrewbonney 0:07919e3d6c56 1107 if (!isGeneralTextEntity) {
andrewbonney 0:07919e3d6c56 1108 *badPtr = name;
andrewbonney 0:07919e3d6c56 1109 return 0;
andrewbonney 0:07919e3d6c56 1110 }
andrewbonney 0:07919e3d6c56 1111 }
andrewbonney 0:07919e3d6c56 1112 else {
andrewbonney 0:07919e3d6c56 1113 if (versionPtr)
andrewbonney 0:07919e3d6c56 1114 *versionPtr = val;
andrewbonney 0:07919e3d6c56 1115 if (versionEndPtr)
andrewbonney 0:07919e3d6c56 1116 *versionEndPtr = ptr;
andrewbonney 0:07919e3d6c56 1117 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
andrewbonney 0:07919e3d6c56 1118 *badPtr = ptr;
andrewbonney 0:07919e3d6c56 1119 return 0;
andrewbonney 0:07919e3d6c56 1120 }
andrewbonney 0:07919e3d6c56 1121 if (!name) {
andrewbonney 0:07919e3d6c56 1122 if (isGeneralTextEntity) {
andrewbonney 0:07919e3d6c56 1123 /* a TextDecl must have an EncodingDecl */
andrewbonney 0:07919e3d6c56 1124 *badPtr = ptr;
andrewbonney 0:07919e3d6c56 1125 return 0;
andrewbonney 0:07919e3d6c56 1126 }
andrewbonney 0:07919e3d6c56 1127 return 1;
andrewbonney 0:07919e3d6c56 1128 }
andrewbonney 0:07919e3d6c56 1129 }
andrewbonney 0:07919e3d6c56 1130 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
andrewbonney 0:07919e3d6c56 1131 int c = toAscii(enc, val, end);
andrewbonney 0:07919e3d6c56 1132 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
andrewbonney 0:07919e3d6c56 1133 *badPtr = val;
andrewbonney 0:07919e3d6c56 1134 return 0;
andrewbonney 0:07919e3d6c56 1135 }
andrewbonney 0:07919e3d6c56 1136 if (encodingName)
andrewbonney 0:07919e3d6c56 1137 *encodingName = val;
andrewbonney 0:07919e3d6c56 1138 if (encoding)
andrewbonney 0:07919e3d6c56 1139 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
andrewbonney 0:07919e3d6c56 1140 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
andrewbonney 0:07919e3d6c56 1141 *badPtr = ptr;
andrewbonney 0:07919e3d6c56 1142 return 0;
andrewbonney 0:07919e3d6c56 1143 }
andrewbonney 0:07919e3d6c56 1144 if (!name)
andrewbonney 0:07919e3d6c56 1145 return 1;
andrewbonney 0:07919e3d6c56 1146 }
andrewbonney 0:07919e3d6c56 1147 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
andrewbonney 0:07919e3d6c56 1148 || isGeneralTextEntity) {
andrewbonney 0:07919e3d6c56 1149 *badPtr = name;
andrewbonney 0:07919e3d6c56 1150 return 0;
andrewbonney 0:07919e3d6c56 1151 }
andrewbonney 0:07919e3d6c56 1152 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
andrewbonney 0:07919e3d6c56 1153 if (standalone)
andrewbonney 0:07919e3d6c56 1154 *standalone = 1;
andrewbonney 0:07919e3d6c56 1155 }
andrewbonney 0:07919e3d6c56 1156 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
andrewbonney 0:07919e3d6c56 1157 if (standalone)
andrewbonney 0:07919e3d6c56 1158 *standalone = 0;
andrewbonney 0:07919e3d6c56 1159 }
andrewbonney 0:07919e3d6c56 1160 else {
andrewbonney 0:07919e3d6c56 1161 *badPtr = val;
andrewbonney 0:07919e3d6c56 1162 return 0;
andrewbonney 0:07919e3d6c56 1163 }
andrewbonney 0:07919e3d6c56 1164 while (isSpace(toAscii(enc, ptr, end)))
andrewbonney 0:07919e3d6c56 1165 ptr += enc->minBytesPerChar;
andrewbonney 0:07919e3d6c56 1166 if (ptr != end) {
andrewbonney 0:07919e3d6c56 1167 *badPtr = ptr;
andrewbonney 0:07919e3d6c56 1168 return 0;
andrewbonney 0:07919e3d6c56 1169 }
andrewbonney 0:07919e3d6c56 1170 return 1;
andrewbonney 0:07919e3d6c56 1171 }
andrewbonney 0:07919e3d6c56 1172
andrewbonney 0:07919e3d6c56 1173 static int FASTCALL
andrewbonney 0:07919e3d6c56 1174 checkCharRefNumber(int result)
andrewbonney 0:07919e3d6c56 1175 {
andrewbonney 0:07919e3d6c56 1176 switch (result >> 8) {
andrewbonney 0:07919e3d6c56 1177 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
andrewbonney 0:07919e3d6c56 1178 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
andrewbonney 0:07919e3d6c56 1179 return -1;
andrewbonney 0:07919e3d6c56 1180 case 0:
andrewbonney 0:07919e3d6c56 1181 if (latin1_encoding.type[result] == BT_NONXML)
andrewbonney 0:07919e3d6c56 1182 return -1;
andrewbonney 0:07919e3d6c56 1183 break;
andrewbonney 0:07919e3d6c56 1184 case 0xFF:
andrewbonney 0:07919e3d6c56 1185 if (result == 0xFFFE || result == 0xFFFF)
andrewbonney 0:07919e3d6c56 1186 return -1;
andrewbonney 0:07919e3d6c56 1187 break;
andrewbonney 0:07919e3d6c56 1188 }
andrewbonney 0:07919e3d6c56 1189 return result;
andrewbonney 0:07919e3d6c56 1190 }
andrewbonney 0:07919e3d6c56 1191
andrewbonney 0:07919e3d6c56 1192 int FASTCALL
andrewbonney 0:07919e3d6c56 1193 XmlUtf8Encode(int c, char *buf)
andrewbonney 0:07919e3d6c56 1194 {
andrewbonney 0:07919e3d6c56 1195 enum {
andrewbonney 0:07919e3d6c56 1196 /* minN is minimum legal resulting value for N byte sequence */
andrewbonney 0:07919e3d6c56 1197 min2 = 0x80,
andrewbonney 0:07919e3d6c56 1198 min3 = 0x800,
andrewbonney 0:07919e3d6c56 1199 min4 = 0x10000
andrewbonney 0:07919e3d6c56 1200 };
andrewbonney 0:07919e3d6c56 1201
andrewbonney 0:07919e3d6c56 1202 if (c < 0)
andrewbonney 0:07919e3d6c56 1203 return 0;
andrewbonney 0:07919e3d6c56 1204 if (c < min2) {
andrewbonney 0:07919e3d6c56 1205 buf[0] = (char)(c | UTF8_cval1);
andrewbonney 0:07919e3d6c56 1206 return 1;
andrewbonney 0:07919e3d6c56 1207 }
andrewbonney 0:07919e3d6c56 1208 if (c < min3) {
andrewbonney 0:07919e3d6c56 1209 buf[0] = (char)((c >> 6) | UTF8_cval2);
andrewbonney 0:07919e3d6c56 1210 buf[1] = (char)((c & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1211 return 2;
andrewbonney 0:07919e3d6c56 1212 }
andrewbonney 0:07919e3d6c56 1213 if (c < min4) {
andrewbonney 0:07919e3d6c56 1214 buf[0] = (char)((c >> 12) | UTF8_cval3);
andrewbonney 0:07919e3d6c56 1215 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1216 buf[2] = (char)((c & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1217 return 3;
andrewbonney 0:07919e3d6c56 1218 }
andrewbonney 0:07919e3d6c56 1219 if (c < 0x110000) {
andrewbonney 0:07919e3d6c56 1220 buf[0] = (char)((c >> 18) | UTF8_cval4);
andrewbonney 0:07919e3d6c56 1221 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1222 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1223 buf[3] = (char)((c & 0x3f) | 0x80);
andrewbonney 0:07919e3d6c56 1224 return 4;
andrewbonney 0:07919e3d6c56 1225 }
andrewbonney 0:07919e3d6c56 1226 return 0;
andrewbonney 0:07919e3d6c56 1227 }
andrewbonney 0:07919e3d6c56 1228
andrewbonney 0:07919e3d6c56 1229 int FASTCALL
andrewbonney 0:07919e3d6c56 1230 XmlUtf16Encode(int charNum, unsigned short *buf)
andrewbonney 0:07919e3d6c56 1231 {
andrewbonney 0:07919e3d6c56 1232 if (charNum < 0)
andrewbonney 0:07919e3d6c56 1233 return 0;
andrewbonney 0:07919e3d6c56 1234 if (charNum < 0x10000) {
andrewbonney 0:07919e3d6c56 1235 buf[0] = (unsigned short)charNum;
andrewbonney 0:07919e3d6c56 1236 return 1;
andrewbonney 0:07919e3d6c56 1237 }
andrewbonney 0:07919e3d6c56 1238 if (charNum < 0x110000) {
andrewbonney 0:07919e3d6c56 1239 charNum -= 0x10000;
andrewbonney 0:07919e3d6c56 1240 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
andrewbonney 0:07919e3d6c56 1241 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
andrewbonney 0:07919e3d6c56 1242 return 2;
andrewbonney 0:07919e3d6c56 1243 }
andrewbonney 0:07919e3d6c56 1244 return 0;
andrewbonney 0:07919e3d6c56 1245 }
andrewbonney 0:07919e3d6c56 1246
andrewbonney 0:07919e3d6c56 1247 struct unknown_encoding {
andrewbonney 0:07919e3d6c56 1248 struct normal_encoding normal;
andrewbonney 0:07919e3d6c56 1249 CONVERTER convert;
andrewbonney 0:07919e3d6c56 1250 void *userData;
andrewbonney 0:07919e3d6c56 1251 unsigned short utf16[256];
andrewbonney 0:07919e3d6c56 1252 char utf8[256][4];
andrewbonney 0:07919e3d6c56 1253 };
andrewbonney 0:07919e3d6c56 1254
andrewbonney 0:07919e3d6c56 1255 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
andrewbonney 0:07919e3d6c56 1256
andrewbonney 0:07919e3d6c56 1257 int
andrewbonney 0:07919e3d6c56 1258 XmlSizeOfUnknownEncoding(void)
andrewbonney 0:07919e3d6c56 1259 {
andrewbonney 0:07919e3d6c56 1260 return sizeof(struct unknown_encoding);
andrewbonney 0:07919e3d6c56 1261 }
andrewbonney 0:07919e3d6c56 1262
andrewbonney 0:07919e3d6c56 1263 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 1264 unknown_isName(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 1265 {
andrewbonney 0:07919e3d6c56 1266 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
andrewbonney 0:07919e3d6c56 1267 int c = uenc->convert(uenc->userData, p);
andrewbonney 0:07919e3d6c56 1268 if (c & ~0xFFFF)
andrewbonney 0:07919e3d6c56 1269 return 0;
andrewbonney 0:07919e3d6c56 1270 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
andrewbonney 0:07919e3d6c56 1271 }
andrewbonney 0:07919e3d6c56 1272
andrewbonney 0:07919e3d6c56 1273 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 1274 unknown_isNmstrt(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 1275 {
andrewbonney 0:07919e3d6c56 1276 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
andrewbonney 0:07919e3d6c56 1277 int c = uenc->convert(uenc->userData, p);
andrewbonney 0:07919e3d6c56 1278 if (c & ~0xFFFF)
andrewbonney 0:07919e3d6c56 1279 return 0;
andrewbonney 0:07919e3d6c56 1280 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
andrewbonney 0:07919e3d6c56 1281 }
andrewbonney 0:07919e3d6c56 1282
andrewbonney 0:07919e3d6c56 1283 static int PTRFASTCALL
andrewbonney 0:07919e3d6c56 1284 unknown_isInvalid(const ENCODING *enc, const char *p)
andrewbonney 0:07919e3d6c56 1285 {
andrewbonney 0:07919e3d6c56 1286 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
andrewbonney 0:07919e3d6c56 1287 int c = uenc->convert(uenc->userData, p);
andrewbonney 0:07919e3d6c56 1288 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
andrewbonney 0:07919e3d6c56 1289 }
andrewbonney 0:07919e3d6c56 1290
andrewbonney 0:07919e3d6c56 1291 static void PTRCALL
andrewbonney 0:07919e3d6c56 1292 unknown_toUtf8(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 1293 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 1294 char **toP, const char *toLim)
andrewbonney 0:07919e3d6c56 1295 {
andrewbonney 0:07919e3d6c56 1296 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
andrewbonney 0:07919e3d6c56 1297 char buf[XML_UTF8_ENCODE_MAX];
andrewbonney 0:07919e3d6c56 1298 for (;;) {
andrewbonney 0:07919e3d6c56 1299 const char *utf8;
andrewbonney 0:07919e3d6c56 1300 int n;
andrewbonney 0:07919e3d6c56 1301 if (*fromP == fromLim)
andrewbonney 0:07919e3d6c56 1302 break;
andrewbonney 0:07919e3d6c56 1303 utf8 = uenc->utf8[(unsigned char)**fromP];
andrewbonney 0:07919e3d6c56 1304 n = *utf8++;
andrewbonney 0:07919e3d6c56 1305 if (n == 0) {
andrewbonney 0:07919e3d6c56 1306 int c = uenc->convert(uenc->userData, *fromP);
andrewbonney 0:07919e3d6c56 1307 n = XmlUtf8Encode(c, buf);
andrewbonney 0:07919e3d6c56 1308 if (n > toLim - *toP)
andrewbonney 0:07919e3d6c56 1309 break;
andrewbonney 0:07919e3d6c56 1310 utf8 = buf;
andrewbonney 0:07919e3d6c56 1311 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
andrewbonney 0:07919e3d6c56 1312 - (BT_LEAD2 - 2));
andrewbonney 0:07919e3d6c56 1313 }
andrewbonney 0:07919e3d6c56 1314 else {
andrewbonney 0:07919e3d6c56 1315 if (n > toLim - *toP)
andrewbonney 0:07919e3d6c56 1316 break;
andrewbonney 0:07919e3d6c56 1317 (*fromP)++;
andrewbonney 0:07919e3d6c56 1318 }
andrewbonney 0:07919e3d6c56 1319 do {
andrewbonney 0:07919e3d6c56 1320 *(*toP)++ = *utf8++;
andrewbonney 0:07919e3d6c56 1321 } while (--n != 0);
andrewbonney 0:07919e3d6c56 1322 }
andrewbonney 0:07919e3d6c56 1323 }
andrewbonney 0:07919e3d6c56 1324
andrewbonney 0:07919e3d6c56 1325 static void PTRCALL
andrewbonney 0:07919e3d6c56 1326 unknown_toUtf16(const ENCODING *enc,
andrewbonney 0:07919e3d6c56 1327 const char **fromP, const char *fromLim,
andrewbonney 0:07919e3d6c56 1328 unsigned short **toP, const unsigned short *toLim)
andrewbonney 0:07919e3d6c56 1329 {
andrewbonney 0:07919e3d6c56 1330 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
andrewbonney 0:07919e3d6c56 1331 while (*fromP != fromLim && *toP != toLim) {
andrewbonney 0:07919e3d6c56 1332 unsigned short c = uenc->utf16[(unsigned char)**fromP];
andrewbonney 0:07919e3d6c56 1333 if (c == 0) {
andrewbonney 0:07919e3d6c56 1334 c = (unsigned short)
andrewbonney 0:07919e3d6c56 1335 uenc->convert(uenc->userData, *fromP);
andrewbonney 0:07919e3d6c56 1336 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
andrewbonney 0:07919e3d6c56 1337 - (BT_LEAD2 - 2));
andrewbonney 0:07919e3d6c56 1338 }
andrewbonney 0:07919e3d6c56 1339 else
andrewbonney 0:07919e3d6c56 1340 (*fromP)++;
andrewbonney 0:07919e3d6c56 1341 *(*toP)++ = c;
andrewbonney 0:07919e3d6c56 1342 }
andrewbonney 0:07919e3d6c56 1343 }
andrewbonney 0:07919e3d6c56 1344
andrewbonney 0:07919e3d6c56 1345 ENCODING *
andrewbonney 0:07919e3d6c56 1346 XmlInitUnknownEncoding(void *mem,
andrewbonney 0:07919e3d6c56 1347 int *table,
andrewbonney 0:07919e3d6c56 1348 CONVERTER convert,
andrewbonney 0:07919e3d6c56 1349 void *userData)
andrewbonney 0:07919e3d6c56 1350 {
andrewbonney 0:07919e3d6c56 1351 int i;
andrewbonney 0:07919e3d6c56 1352 struct unknown_encoding *e = (struct unknown_encoding *)mem;
andrewbonney 0:07919e3d6c56 1353 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
andrewbonney 0:07919e3d6c56 1354 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
andrewbonney 0:07919e3d6c56 1355 for (i = 0; i < 128; i++)
andrewbonney 0:07919e3d6c56 1356 if (latin1_encoding.type[i] != BT_OTHER
andrewbonney 0:07919e3d6c56 1357 && latin1_encoding.type[i] != BT_NONXML
andrewbonney 0:07919e3d6c56 1358 && table[i] != i)
andrewbonney 0:07919e3d6c56 1359 return 0;
andrewbonney 0:07919e3d6c56 1360 for (i = 0; i < 256; i++) {
andrewbonney 0:07919e3d6c56 1361 int c = table[i];
andrewbonney 0:07919e3d6c56 1362 if (c == -1) {
andrewbonney 0:07919e3d6c56 1363 e->normal.type[i] = BT_MALFORM;
andrewbonney 0:07919e3d6c56 1364 /* This shouldn't really get used. */
andrewbonney 0:07919e3d6c56 1365 e->utf16[i] = 0xFFFF;
andrewbonney 0:07919e3d6c56 1366 e->utf8[i][0] = 1;
andrewbonney 0:07919e3d6c56 1367 e->utf8[i][1] = 0;
andrewbonney 0:07919e3d6c56 1368 }
andrewbonney 0:07919e3d6c56 1369 else if (c < 0) {
andrewbonney 0:07919e3d6c56 1370 if (c < -4)
andrewbonney 0:07919e3d6c56 1371 return 0;
andrewbonney 0:07919e3d6c56 1372 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
andrewbonney 0:07919e3d6c56 1373 e->utf8[i][0] = 0;
andrewbonney 0:07919e3d6c56 1374 e->utf16[i] = 0;
andrewbonney 0:07919e3d6c56 1375 }
andrewbonney 0:07919e3d6c56 1376 else if (c < 0x80) {
andrewbonney 0:07919e3d6c56 1377 if (latin1_encoding.type[c] != BT_OTHER
andrewbonney 0:07919e3d6c56 1378 && latin1_encoding.type[c] != BT_NONXML
andrewbonney 0:07919e3d6c56 1379 && c != i)
andrewbonney 0:07919e3d6c56 1380 return 0;
andrewbonney 0:07919e3d6c56 1381 e->normal.type[i] = latin1_encoding.type[c];
andrewbonney 0:07919e3d6c56 1382 e->utf8[i][0] = 1;
andrewbonney 0:07919e3d6c56 1383 e->utf8[i][1] = (char)c;
andrewbonney 0:07919e3d6c56 1384 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
andrewbonney 0:07919e3d6c56 1385 }
andrewbonney 0:07919e3d6c56 1386 else if (checkCharRefNumber(c) < 0) {
andrewbonney 0:07919e3d6c56 1387 e->normal.type[i] = BT_NONXML;
andrewbonney 0:07919e3d6c56 1388 /* This shouldn't really get used. */
andrewbonney 0:07919e3d6c56 1389 e->utf16[i] = 0xFFFF;
andrewbonney 0:07919e3d6c56 1390 e->utf8[i][0] = 1;
andrewbonney 0:07919e3d6c56 1391 e->utf8[i][1] = 0;
andrewbonney 0:07919e3d6c56 1392 }
andrewbonney 0:07919e3d6c56 1393 else {
andrewbonney 0:07919e3d6c56 1394 if (c > 0xFFFF)
andrewbonney 0:07919e3d6c56 1395 return 0;
andrewbonney 0:07919e3d6c56 1396 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
andrewbonney 0:07919e3d6c56 1397 e->normal.type[i] = BT_NMSTRT;
andrewbonney 0:07919e3d6c56 1398 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
andrewbonney 0:07919e3d6c56 1399 e->normal.type[i] = BT_NAME;
andrewbonney 0:07919e3d6c56 1400 else
andrewbonney 0:07919e3d6c56 1401 e->normal.type[i] = BT_OTHER;
andrewbonney 0:07919e3d6c56 1402 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
andrewbonney 0:07919e3d6c56 1403 e->utf16[i] = (unsigned short)c;
andrewbonney 0:07919e3d6c56 1404 }
andrewbonney 0:07919e3d6c56 1405 }
andrewbonney 0:07919e3d6c56 1406 e->userData = userData;
andrewbonney 0:07919e3d6c56 1407 e->convert = convert;
andrewbonney 0:07919e3d6c56 1408 if (convert) {
andrewbonney 0:07919e3d6c56 1409 e->normal.isName2 = unknown_isName;
andrewbonney 0:07919e3d6c56 1410 e->normal.isName3 = unknown_isName;
andrewbonney 0:07919e3d6c56 1411 e->normal.isName4 = unknown_isName;
andrewbonney 0:07919e3d6c56 1412 e->normal.isNmstrt2 = unknown_isNmstrt;
andrewbonney 0:07919e3d6c56 1413 e->normal.isNmstrt3 = unknown_isNmstrt;
andrewbonney 0:07919e3d6c56 1414 e->normal.isNmstrt4 = unknown_isNmstrt;
andrewbonney 0:07919e3d6c56 1415 e->normal.isInvalid2 = unknown_isInvalid;
andrewbonney 0:07919e3d6c56 1416 e->normal.isInvalid3 = unknown_isInvalid;
andrewbonney 0:07919e3d6c56 1417 e->normal.isInvalid4 = unknown_isInvalid;
andrewbonney 0:07919e3d6c56 1418 }
andrewbonney 0:07919e3d6c56 1419 e->normal.enc.utf8Convert = unknown_toUtf8;
andrewbonney 0:07919e3d6c56 1420 e->normal.enc.utf16Convert = unknown_toUtf16;
andrewbonney 0:07919e3d6c56 1421 return &(e->normal.enc);
andrewbonney 0:07919e3d6c56 1422 }
andrewbonney 0:07919e3d6c56 1423
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC    = -1,  /* name not recognised by getEncodingIndex */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6    /* returned by getEncodingIndex for a null name */
};
andrewbonney 0:07919e3d6c56 1437
andrewbonney 0:07919e3d6c56 1438 static const char KW_ISO_8859_1[] = {
andrewbonney 0:07919e3d6c56 1439 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
andrewbonney 0:07919e3d6c56 1440 ASCII_MINUS, ASCII_1, '\0'
andrewbonney 0:07919e3d6c56 1441 };
andrewbonney 0:07919e3d6c56 1442 static const char KW_US_ASCII[] = {
andrewbonney 0:07919e3d6c56 1443 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
andrewbonney 0:07919e3d6c56 1444 '\0'
andrewbonney 0:07919e3d6c56 1445 };
andrewbonney 0:07919e3d6c56 1446 static const char KW_UTF_8[] = {
andrewbonney 0:07919e3d6c56 1447 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
andrewbonney 0:07919e3d6c56 1448 };
andrewbonney 0:07919e3d6c56 1449 static const char KW_UTF_16[] = {
andrewbonney 0:07919e3d6c56 1450 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
andrewbonney 0:07919e3d6c56 1451 };
andrewbonney 0:07919e3d6c56 1452 static const char KW_UTF_16BE[] = {
andrewbonney 0:07919e3d6c56 1453 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
andrewbonney 0:07919e3d6c56 1454 '\0'
andrewbonney 0:07919e3d6c56 1455 };
andrewbonney 0:07919e3d6c56 1456 static const char KW_UTF_16LE[] = {
andrewbonney 0:07919e3d6c56 1457 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
andrewbonney 0:07919e3d6c56 1458 '\0'
andrewbonney 0:07919e3d6c56 1459 };
andrewbonney 0:07919e3d6c56 1460
andrewbonney 0:07919e3d6c56 1461 static int FASTCALL
andrewbonney 0:07919e3d6c56 1462 getEncodingIndex(const char *name)
andrewbonney 0:07919e3d6c56 1463 {
andrewbonney 0:07919e3d6c56 1464 static const char * const encodingNames[] = {
andrewbonney 0:07919e3d6c56 1465 KW_ISO_8859_1,
andrewbonney 0:07919e3d6c56 1466 KW_US_ASCII,
andrewbonney 0:07919e3d6c56 1467 KW_UTF_8,
andrewbonney 0:07919e3d6c56 1468 KW_UTF_16,
andrewbonney 0:07919e3d6c56 1469 KW_UTF_16BE,
andrewbonney 0:07919e3d6c56 1470 KW_UTF_16LE,
andrewbonney 0:07919e3d6c56 1471 };
andrewbonney 0:07919e3d6c56 1472 int i;
andrewbonney 0:07919e3d6c56 1473 if (name == NULL)
andrewbonney 0:07919e3d6c56 1474 return NO_ENC;
andrewbonney 0:07919e3d6c56 1475 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
andrewbonney 0:07919e3d6c56 1476 if (streqci(name, encodingNames[i]))
andrewbonney 0:07919e3d6c56 1477 return i;
andrewbonney 0:07919e3d6c56 1478 return UNKNOWN_ENC;
andrewbonney 0:07919e3d6c56 1479 }
andrewbonney 0:07919e3d6c56 1480
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index of an INIT_ENCODING. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i; the argument is parenthesized so that an
   expression argument is converted as a whole. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
andrewbonney 0:07919e3d6c56 1487
andrewbonney 0:07919e3d6c56 1488 /* This is what detects the encoding. encodingTable maps from
andrewbonney 0:07919e3d6c56 1489 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
andrewbonney 0:07919e3d6c56 1490 the external (protocol) specified encoding; state is
andrewbonney 0:07919e3d6c56 1491 XML_CONTENT_STATE if we're parsing an external text entity, and
andrewbonney 0:07919e3d6c56 1492 XML_PROLOG_STATE otherwise.
andrewbonney 0:07919e3d6c56 1493 */
andrewbonney 0:07919e3d6c56 1494
andrewbonney 0:07919e3d6c56 1495
andrewbonney 0:07919e3d6c56 1496 static int
andrewbonney 0:07919e3d6c56 1497 initScan(const ENCODING * const *encodingTable,
andrewbonney 0:07919e3d6c56 1498 const INIT_ENCODING *enc,
andrewbonney 0:07919e3d6c56 1499 int state,
andrewbonney 0:07919e3d6c56 1500 const char *ptr,
andrewbonney 0:07919e3d6c56 1501 const char *end,
andrewbonney 0:07919e3d6c56 1502 const char **nextTokPtr)
andrewbonney 0:07919e3d6c56 1503 {
andrewbonney 0:07919e3d6c56 1504 const ENCODING **encPtr;
andrewbonney 0:07919e3d6c56 1505
andrewbonney 0:07919e3d6c56 1506 if (ptr == end)
andrewbonney 0:07919e3d6c56 1507 return XML_TOK_NONE;
andrewbonney 0:07919e3d6c56 1508 encPtr = enc->encPtr;
andrewbonney 0:07919e3d6c56 1509 if (ptr + 1 == end) {
andrewbonney 0:07919e3d6c56 1510 /* only a single byte available for auto-detection */
andrewbonney 0:07919e3d6c56 1511 #ifndef XML_DTD /* FIXME */
andrewbonney 0:07919e3d6c56 1512 /* a well-formed document entity must have more than one byte */
andrewbonney 0:07919e3d6c56 1513 if (state != XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1514 return XML_TOK_PARTIAL;
andrewbonney 0:07919e3d6c56 1515 #endif
andrewbonney 0:07919e3d6c56 1516 /* so we're parsing an external text entity... */
andrewbonney 0:07919e3d6c56 1517 /* if UTF-16 was externally specified, then we need at least 2 bytes */
andrewbonney 0:07919e3d6c56 1518 switch (INIT_ENC_INDEX(enc)) {
andrewbonney 0:07919e3d6c56 1519 case UTF_16_ENC:
andrewbonney 0:07919e3d6c56 1520 case UTF_16LE_ENC:
andrewbonney 0:07919e3d6c56 1521 case UTF_16BE_ENC:
andrewbonney 0:07919e3d6c56 1522 return XML_TOK_PARTIAL;
andrewbonney 0:07919e3d6c56 1523 }
andrewbonney 0:07919e3d6c56 1524 switch ((unsigned char)*ptr) {
andrewbonney 0:07919e3d6c56 1525 case 0xFE:
andrewbonney 0:07919e3d6c56 1526 case 0xFF:
andrewbonney 0:07919e3d6c56 1527 case 0xEF: /* possibly first byte of UTF-8 BOM */
andrewbonney 0:07919e3d6c56 1528 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
andrewbonney 0:07919e3d6c56 1529 && state == XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1530 break;
andrewbonney 0:07919e3d6c56 1531 /* fall through */
andrewbonney 0:07919e3d6c56 1532 case 0x00:
andrewbonney 0:07919e3d6c56 1533 case 0x3C:
andrewbonney 0:07919e3d6c56 1534 return XML_TOK_PARTIAL;
andrewbonney 0:07919e3d6c56 1535 }
andrewbonney 0:07919e3d6c56 1536 }
andrewbonney 0:07919e3d6c56 1537 else {
andrewbonney 0:07919e3d6c56 1538 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
andrewbonney 0:07919e3d6c56 1539 case 0xFEFF:
andrewbonney 0:07919e3d6c56 1540 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
andrewbonney 0:07919e3d6c56 1541 && state == XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1542 break;
andrewbonney 0:07919e3d6c56 1543 *nextTokPtr = ptr + 2;
andrewbonney 0:07919e3d6c56 1544 *encPtr = encodingTable[UTF_16BE_ENC];
andrewbonney 0:07919e3d6c56 1545 return XML_TOK_BOM;
andrewbonney 0:07919e3d6c56 1546 /* 00 3C is handled in the default case */
andrewbonney 0:07919e3d6c56 1547 case 0x3C00:
andrewbonney 0:07919e3d6c56 1548 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
andrewbonney 0:07919e3d6c56 1549 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
andrewbonney 0:07919e3d6c56 1550 && state == XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1551 break;
andrewbonney 0:07919e3d6c56 1552 *encPtr = encodingTable[UTF_16LE_ENC];
andrewbonney 0:07919e3d6c56 1553 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
andrewbonney 0:07919e3d6c56 1554 case 0xFFFE:
andrewbonney 0:07919e3d6c56 1555 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
andrewbonney 0:07919e3d6c56 1556 && state == XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1557 break;
andrewbonney 0:07919e3d6c56 1558 *nextTokPtr = ptr + 2;
andrewbonney 0:07919e3d6c56 1559 *encPtr = encodingTable[UTF_16LE_ENC];
andrewbonney 0:07919e3d6c56 1560 return XML_TOK_BOM;
andrewbonney 0:07919e3d6c56 1561 case 0xEFBB:
andrewbonney 0:07919e3d6c56 1562 /* Maybe a UTF-8 BOM (EF BB BF) */
andrewbonney 0:07919e3d6c56 1563 /* If there's an explicitly specified (external) encoding
andrewbonney 0:07919e3d6c56 1564 of ISO-8859-1 or some flavour of UTF-16
andrewbonney 0:07919e3d6c56 1565 and this is an external text entity,
andrewbonney 0:07919e3d6c56 1566 don't look for the BOM,
andrewbonney 0:07919e3d6c56 1567 because it might be a legal data.
andrewbonney 0:07919e3d6c56 1568 */
andrewbonney 0:07919e3d6c56 1569 if (state == XML_CONTENT_STATE) {
andrewbonney 0:07919e3d6c56 1570 int e = INIT_ENC_INDEX(enc);
andrewbonney 0:07919e3d6c56 1571 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
andrewbonney 0:07919e3d6c56 1572 || e == UTF_16LE_ENC || e == UTF_16_ENC)
andrewbonney 0:07919e3d6c56 1573 break;
andrewbonney 0:07919e3d6c56 1574 }
andrewbonney 0:07919e3d6c56 1575 if (ptr + 2 == end)
andrewbonney 0:07919e3d6c56 1576 return XML_TOK_PARTIAL;
andrewbonney 0:07919e3d6c56 1577 if ((unsigned char)ptr[2] == 0xBF) {
andrewbonney 0:07919e3d6c56 1578 *nextTokPtr = ptr + 3;
andrewbonney 0:07919e3d6c56 1579 *encPtr = encodingTable[UTF_8_ENC];
andrewbonney 0:07919e3d6c56 1580 return XML_TOK_BOM;
andrewbonney 0:07919e3d6c56 1581 }
andrewbonney 0:07919e3d6c56 1582 break;
andrewbonney 0:07919e3d6c56 1583 default:
andrewbonney 0:07919e3d6c56 1584 if (ptr[0] == '\0') {
andrewbonney 0:07919e3d6c56 1585 /* 0 isn't a legal data character. Furthermore a document
andrewbonney 0:07919e3d6c56 1586 entity can only start with ASCII characters. So the only
andrewbonney 0:07919e3d6c56 1587 way this can fail to be big-endian UTF-16 if it it's an
andrewbonney 0:07919e3d6c56 1588 external parsed general entity that's labelled as
andrewbonney 0:07919e3d6c56 1589 UTF-16LE.
andrewbonney 0:07919e3d6c56 1590 */
andrewbonney 0:07919e3d6c56 1591 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
andrewbonney 0:07919e3d6c56 1592 break;
andrewbonney 0:07919e3d6c56 1593 *encPtr = encodingTable[UTF_16BE_ENC];
andrewbonney 0:07919e3d6c56 1594 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
andrewbonney 0:07919e3d6c56 1595 }
andrewbonney 0:07919e3d6c56 1596 else if (ptr[1] == '\0') {
andrewbonney 0:07919e3d6c56 1597 /* We could recover here in the case:
andrewbonney 0:07919e3d6c56 1598 - parsing an external entity
andrewbonney 0:07919e3d6c56 1599 - second byte is 0
andrewbonney 0:07919e3d6c56 1600 - no externally specified encoding
andrewbonney 0:07919e3d6c56 1601 - no encoding declaration
andrewbonney 0:07919e3d6c56 1602 by assuming UTF-16LE. But we don't, because this would mean when
andrewbonney 0:07919e3d6c56 1603 presented just with a single byte, we couldn't reliably determine
andrewbonney 0:07919e3d6c56 1604 whether we needed further bytes.
andrewbonney 0:07919e3d6c56 1605 */
andrewbonney 0:07919e3d6c56 1606 if (state == XML_CONTENT_STATE)
andrewbonney 0:07919e3d6c56 1607 break;
andrewbonney 0:07919e3d6c56 1608 *encPtr = encodingTable[UTF_16LE_ENC];
andrewbonney 0:07919e3d6c56 1609 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
andrewbonney 0:07919e3d6c56 1610 }
andrewbonney 0:07919e3d6c56 1611 break;
andrewbonney 0:07919e3d6c56 1612 }
andrewbonney 0:07919e3d6c56 1613 }
andrewbonney 0:07919e3d6c56 1614 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
andrewbonney 0:07919e3d6c56 1615 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
andrewbonney 0:07919e3d6c56 1616 }
andrewbonney 0:07919e3d6c56 1617
andrewbonney 0:07919e3d6c56 1618
andrewbonney 0:07919e3d6c56 1619 #define NS(x) x
andrewbonney 0:07919e3d6c56 1620 #define ns(x) x
andrewbonney 0:07919e3d6c56 1621 #define XML_TOK_NS_C
andrewbonney 0:07919e3d6c56 1622 #include "xmltok_ns.c"
andrewbonney 0:07919e3d6c56 1623 #undef XML_TOK_NS_C
andrewbonney 0:07919e3d6c56 1624 #undef NS
andrewbonney 0:07919e3d6c56 1625 #undef ns
andrewbonney 0:07919e3d6c56 1626
andrewbonney 0:07919e3d6c56 1627 #ifdef XML_NS
andrewbonney 0:07919e3d6c56 1628
andrewbonney 0:07919e3d6c56 1629 #define NS(x) x ## NS
andrewbonney 0:07919e3d6c56 1630 #define ns(x) x ## _ns
andrewbonney 0:07919e3d6c56 1631
andrewbonney 0:07919e3d6c56 1632 #define XML_TOK_NS_C
andrewbonney 0:07919e3d6c56 1633 #include "xmltok_ns.c"
andrewbonney 0:07919e3d6c56 1634 #undef XML_TOK_NS_C
andrewbonney 0:07919e3d6c56 1635
andrewbonney 0:07919e3d6c56 1636 #undef NS
andrewbonney 0:07919e3d6c56 1637 #undef ns
andrewbonney 0:07919e3d6c56 1638
andrewbonney 0:07919e3d6c56 1639 ENCODING *
andrewbonney 0:07919e3d6c56 1640 XmlInitUnknownEncodingNS(void *mem,
andrewbonney 0:07919e3d6c56 1641 int *table,
andrewbonney 0:07919e3d6c56 1642 CONVERTER convert,
andrewbonney 0:07919e3d6c56 1643 void *userData)
andrewbonney 0:07919e3d6c56 1644 {
andrewbonney 0:07919e3d6c56 1645 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
andrewbonney 0:07919e3d6c56 1646 if (enc)
andrewbonney 0:07919e3d6c56 1647 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
andrewbonney 0:07919e3d6c56 1648 return enc;
andrewbonney 0:07919e3d6c56 1649 }
andrewbonney 0:07919e3d6c56 1650
andrewbonney 0:07919e3d6c56 1651 #endif /* XML_NS */