bosko lekovic
/
EthToCom_11
nova proba
dxml.cpp@28:49bafc8bb056, 2021-01-18 (annotated)
- Committer:
- Bosko Lekovic
- Date:
- Mon Jan 18 14:46:46 2021 +0100
- Revision:
- 28:49bafc8bb056
- Parent:
- 4:7abcf4543282
novije
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
bosko001 | 4:7abcf4543282 | 1 | |
bosko001 | 4:7abcf4543282 | 2 | // DXML.CPP korigovan 11.04.2020 !!! |
bosko001 | 4:7abcf4543282 | 3 | |
bosko001 | 2:45b351b4fc2a | 4 | #define dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 5 | |
bosko001 | 2:45b351b4fc2a | 6 | #include <stdlib.h> |
bosko001 | 2:45b351b4fc2a | 7 | #include <stdio.h> |
bosko001 | 2:45b351b4fc2a | 8 | #include <stdarg.h> |
bosko001 | 2:45b351b4fc2a | 9 | #include <string.h> |
bosko001 | 2:45b351b4fc2a | 10 | #include <ctype.h> |
bosko001 | 2:45b351b4fc2a | 11 | //#include <unistd.h> |
bosko001 | 2:45b351b4fc2a | 12 | //#include <sys/types.h> |
bosko001 | 2:45b351b4fc2a | 13 | #ifndef dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 14 | #include <sys/mman.h> |
bosko001 | 2:45b351b4fc2a | 15 | #endif // dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 16 | //#include <stat.h> |
bosko001 | 2:45b351b4fc2a | 17 | #include "dxml.h" |
bosko001 | 2:45b351b4fc2a | 18 | |
bosko001 | 2:45b351b4fc2a | 19 | #define dxml_WS "\t\r\n " // whitespace |
bosko001 | 2:45b351b4fc2a | 20 | #define dxml_ERRL 128 // maximum error string length |
bosko001 | 2:45b351b4fc2a | 21 | |
// Local replacement for strdup(): returns a freshly malloc'ed copy of the
// NUL-terminated string `literal`, or NULL if the allocation fails.
// The caller owns the returned buffer and must free() it.
char* strdup(const char* literal)
{
    size_t size = strlen(literal) + 1;  // bytes to copy, terminator included
    char* copy = (char*)malloc(size);

    if (! copy) return NULL;            // out of memory: report instead of crashing
    memcpy(copy, literal, size);        // size is exact, so a plain copy is enough
    return copy;
}
bosko001 | 2:45b351b4fc2a | 29 | |
// Parser-private view of the document root. The first member is the public
// `struct dxml`, which is why code in this file walks up `parent` links to the
// top-most tag and then casts that dxml_t to dxml_root_t to reach these fields.
typedef struct dxml_root *dxml_root_t;
struct dxml_root {       // additional data for the root tag
    struct dxml xml;     // is a super-struct built on top of dxml struct
    dxml_t cur;          // current xml tree insertion point
    char *m;             // original xml string
    size_t len;          // length of allocated memory for mmap, -1 for malloc
    char *u;             // UTF-8 conversion of string if original was UTF-16
    char *s;             // start of work area
    char *e;             // end of work area
    char **ent;          // general entities (ampersand sequences),
                         // stored as alternating name/value strings
    char ***attr;        // default attributes: one NULL-terminated list per
                         // tag; entry [0] is the tag name, followed by
                         // (name, value, cdata-marker) triples
    char ***pi;          // processing instructions: one list per target,
                         // entry [0] is the target name
    short standalone;    // non-zero if <?xml standalone="yes"?>
    char err[dxml_ERRL]; // error string
};

// Shared sentinel meaning "empty list". dxml_free_attr() compares against this
// address so that it is never passed to free().
char *dxml_NIL[] = { NULL }; // empty, null terminated array of strings
bosko001 | 2:45b351b4fc2a | 47 | |
bosko001 | 2:45b351b4fc2a | 48 | // returns the first child tag with the given name or NULL if not found |
bosko001 | 2:45b351b4fc2a | 49 | dxml_t dxml_child(dxml_t xml, const char *name) |
bosko001 | 2:45b351b4fc2a | 50 | { |
bosko001 | 2:45b351b4fc2a | 51 | xml = (xml) ? xml->child : NULL; |
bosko001 | 2:45b351b4fc2a | 52 | while (xml && strcmp(name, xml->name)) xml = xml->sibling; |
bosko001 | 2:45b351b4fc2a | 53 | return xml; |
bosko001 | 2:45b351b4fc2a | 54 | } |
bosko001 | 2:45b351b4fc2a | 55 | |
bosko001 | 2:45b351b4fc2a | 56 | // returns the Nth tag with the same name in the same subsection or NULL if not |
bosko001 | 2:45b351b4fc2a | 57 | // found |
bosko001 | 2:45b351b4fc2a | 58 | dxml_t dxml_idx(dxml_t xml, int idx) |
bosko001 | 2:45b351b4fc2a | 59 | { |
bosko001 | 2:45b351b4fc2a | 60 | for (; xml && idx; idx--) xml = xml->next; |
bosko001 | 2:45b351b4fc2a | 61 | return xml; |
bosko001 | 2:45b351b4fc2a | 62 | } |
bosko001 | 2:45b351b4fc2a | 63 | |
bosko001 | 2:45b351b4fc2a | 64 | // returns the value of the requested tag attribute or NULL if not found |
bosko001 | 2:45b351b4fc2a | 65 | const char *dxml_attr(dxml_t xml, const char *attr) |
bosko001 | 2:45b351b4fc2a | 66 | { |
bosko001 | 2:45b351b4fc2a | 67 | int i = 0, j = 1; |
bosko001 | 2:45b351b4fc2a | 68 | dxml_root_t root = (dxml_root_t)xml; |
bosko001 | 2:45b351b4fc2a | 69 | |
bosko001 | 2:45b351b4fc2a | 70 | if (! xml || ! xml->attr) return NULL; |
bosko001 | 2:45b351b4fc2a | 71 | while (xml->attr[i] && strcmp(attr, xml->attr[i])) i += 2; |
bosko001 | 2:45b351b4fc2a | 72 | if (xml->attr[i]) return xml->attr[i + 1]; // found attribute |
bosko001 | 2:45b351b4fc2a | 73 | |
bosko001 | 2:45b351b4fc2a | 74 | while (root->xml.parent) root = (dxml_root_t)root->xml.parent; // root tag |
bosko001 | 2:45b351b4fc2a | 75 | for (i = 0; root->attr[i] && strcmp(xml->name, root->attr[i][0]); i++); |
bosko001 | 2:45b351b4fc2a | 76 | if (! root->attr[i]) return NULL; // no matching default attributes |
bosko001 | 2:45b351b4fc2a | 77 | while (root->attr[i][j] && strcmp(attr, root->attr[i][j])) j += 3; |
bosko001 | 2:45b351b4fc2a | 78 | return (root->attr[i][j]) ? root->attr[i][j + 1] : NULL; // found default |
bosko001 | 2:45b351b4fc2a | 79 | } |
bosko001 | 2:45b351b4fc2a | 80 | |
bosko001 | 2:45b351b4fc2a | 81 | // same as dxml_get but takes an already initialized va_list |
bosko001 | 2:45b351b4fc2a | 82 | dxml_t dxml_vget(dxml_t xml, va_list ap) |
bosko001 | 2:45b351b4fc2a | 83 | { |
bosko001 | 2:45b351b4fc2a | 84 | char *name = va_arg(ap, char *); |
bosko001 | 2:45b351b4fc2a | 85 | int idx = -1; |
bosko001 | 2:45b351b4fc2a | 86 | |
bosko001 | 2:45b351b4fc2a | 87 | if (name && *name) { |
bosko001 | 4:7abcf4543282 | 88 | idx = va_arg(ap, int); |
bosko001 | 2:45b351b4fc2a | 89 | xml = dxml_child(xml, name); |
bosko001 | 2:45b351b4fc2a | 90 | } |
bosko001 | 2:45b351b4fc2a | 91 | return (idx < 0) ? xml : dxml_vget(dxml_idx(xml, idx), ap); |
bosko001 | 2:45b351b4fc2a | 92 | } |
bosko001 | 2:45b351b4fc2a | 93 | |
bosko001 | 2:45b351b4fc2a | 94 | // Traverses the xml tree to retrieve a specific subtag. Takes a variable |
bosko001 | 2:45b351b4fc2a | 95 | // length list of tag names and indexes. The argument list must be terminated |
bosko001 | 4:7abcf4543282 | 96 | // by either an index of -1 or an empty string tag name. Example: |
bosko001 | 2:45b351b4fc2a | 97 | // title = dxml_get(library, "shelf", 0, "book", 2, "title", -1); |
bosko001 | 2:45b351b4fc2a | 98 | // This retrieves the title of the 3rd book on the 1st shelf of library. |
bosko001 | 2:45b351b4fc2a | 99 | // Returns NULL if not found. |
bosko001 | 2:45b351b4fc2a | 100 | dxml_t dxml_get(dxml_t xml, ...) |
bosko001 | 2:45b351b4fc2a | 101 | { |
bosko001 | 2:45b351b4fc2a | 102 | va_list ap; |
bosko001 | 2:45b351b4fc2a | 103 | dxml_t r; |
bosko001 | 2:45b351b4fc2a | 104 | |
bosko001 | 2:45b351b4fc2a | 105 | va_start(ap, xml); |
bosko001 | 2:45b351b4fc2a | 106 | r = dxml_vget(xml, ap); |
bosko001 | 2:45b351b4fc2a | 107 | va_end(ap); |
bosko001 | 2:45b351b4fc2a | 108 | return r; |
bosko001 | 2:45b351b4fc2a | 109 | } |
bosko001 | 2:45b351b4fc2a | 110 | |
bosko001 | 2:45b351b4fc2a | 111 | // returns a null terminated array of processing instructions for the given |
bosko001 | 2:45b351b4fc2a | 112 | // target |
bosko001 | 2:45b351b4fc2a | 113 | const char **dxml_pi(dxml_t xml, const char *target) |
bosko001 | 2:45b351b4fc2a | 114 | { |
bosko001 | 2:45b351b4fc2a | 115 | dxml_root_t root = (dxml_root_t)xml; |
bosko001 | 2:45b351b4fc2a | 116 | int i = 0; |
bosko001 | 2:45b351b4fc2a | 117 | |
bosko001 | 2:45b351b4fc2a | 118 | if (! root) return (const char **)dxml_NIL; |
bosko001 | 2:45b351b4fc2a | 119 | while (root->xml.parent) root = (dxml_root_t)root->xml.parent; // root tag |
bosko001 | 2:45b351b4fc2a | 120 | while (root->pi[i] && strcmp(target, root->pi[i][0])) i++; // find target |
bosko001 | 2:45b351b4fc2a | 121 | return (const char **)((root->pi[i]) ? root->pi[i] + 1 : dxml_NIL); |
bosko001 | 2:45b351b4fc2a | 122 | } |
bosko001 | 2:45b351b4fc2a | 123 | |
bosko001 | 2:45b351b4fc2a | 124 | // set an error string and return root |
bosko001 | 2:45b351b4fc2a | 125 | dxml_t dxml_err(dxml_root_t root, char *s, const char *err, ...) |
bosko001 | 2:45b351b4fc2a | 126 | { |
bosko001 | 2:45b351b4fc2a | 127 | va_list ap; |
bosko001 | 2:45b351b4fc2a | 128 | int line = 1; |
bosko001 | 2:45b351b4fc2a | 129 | char *t, fmt[dxml_ERRL]; |
bosko001 | 4:7abcf4543282 | 130 | |
bosko001 | 2:45b351b4fc2a | 131 | for (t = root->s; t < s; t++) if (*t == '\n') line++; |
bosko001 | 2:45b351b4fc2a | 132 | snprintf(fmt, dxml_ERRL, "[error near line %d]: %s", line, err); |
bosko001 | 2:45b351b4fc2a | 133 | |
bosko001 | 2:45b351b4fc2a | 134 | va_start(ap, err); |
bosko001 | 2:45b351b4fc2a | 135 | vsnprintf(root->err, dxml_ERRL, fmt, ap); |
bosko001 | 2:45b351b4fc2a | 136 | va_end(ap); |
bosko001 | 2:45b351b4fc2a | 137 | |
bosko001 | 2:45b351b4fc2a | 138 | return &root->xml; |
bosko001 | 2:45b351b4fc2a | 139 | } |
bosko001 | 2:45b351b4fc2a | 140 | |
// Decodes entity and character references in s and normalizes line endings.
//
// s    writable NUL-terminated string; it is edited in place
// ent  NULL-terminated array of alternating entity names and values; each
//      name is matched as a prefix of the text following '&' or '%'
//      (the names used by this parser end in ';')
// t    decoding mode: '&' general entities, '%' parameter entities,
//      'c' cdata sections (no reference decoding), ' ' attribute
//      normalization, '*' non-cdata attribute normalization (whitespace runs
//      are additionally collapsed and leading/trailing spaces removed)
//
// Returns s when the result fits in the original buffer; otherwise returns a
// malloc'ed string that the caller must free. Replacement text is rescanned,
// so entities nested inside entity values are expanded as well. If memory
// cannot be obtained for an expansion, decoding stops early and the text
// decoded so far is returned with the remainder left as it was.
char *dxml_decode(char *s, char **ent, char t)
{
    char *e, *r = s, *m = s, *grown;
    long b, c, d, l;

    // normalize line endings: "\r\n" and a lone "\r" both become "\n".
    // The scan never advances past the terminating NUL, even when the string
    // ends in '\r'.
    for (; *s; s++) {
        if (*s != '\r') continue;
        *s = '\n';
        if (s[1] == '\n') memmove(s + 1, s + 2, strlen(s + 2) + 1);
    }

    for (s = r; ; ) {
        // skip ahead to the next character that may need decoding
        // (<ctype.h> functions require an unsigned char value)
        while (*s && *s != '&' && (*s != '%' || t != '%') &&
               !isspace((unsigned char)*s)) s++;

        if (! *s) break;
        else if (t != 'c' && ! strncmp(s, "&#", 2)) { // character reference
            if (s[2] == 'x') c = strtol(s + 3, &e, 16); // base 16
            else c = strtol(s + 2, &e, 10); // base 10

            // not a usable reference: zero, negative, beyond the Unicode
            // range, or not terminated by ';' -- leave the text untouched
            if (c <= 0 || c > 0x10FFFF || *e != ';') { s++; continue; }

            if (c < 0x80) *(s++) = (char)c; // US-ASCII subset
            else { // multi-byte UTF-8 sequence
                for (b = 0, d = c; d; d /= 2) b++; // number of bits in c
                b = (b - 2) / 5; // number of bytes in payload
                *(s++) = (char)((0xFF << (7 - b)) | (c >> (6 * b))); // head
                while (b) *(s++) = (char)(0x80 | ((c >> (6 * --b)) & 0x3F));
            }

            // close the gap left by the (always longer) textual reference
            memmove(s, strchr(s, ';') + 1, strlen(strchr(s, ';')));
        }
        else if ((*s == '&' && (t == '&' || t == ' ' || t == '*')) ||
                 (*s == '%' && t == '%')) { // entity reference
            for (b = 0; ent[b] && strncmp(s + 1, ent[b], strlen(ent[b]));
                 b += 2); // find entity in entity list

            e = (ent[b]) ? strchr(s, ';') : NULL; // end of the reference
            if (e) { // found a match
                b++; // ent[b] is now the replacement text
                c = (long)strlen(ent[b]);
                if (c - 1 > e - s) { // replacement is longer than "&name;"
                    d = s - r;
                    l = d + c + (long)strlen(e); // new length
                    grown = (char*)((r == m) ? malloc(l) : realloc(r, l));
                    if (! grown) break; // out of memory: stop decoding here
                    if (r == m) strcpy(grown, r); // first move off the input
                    r = grown;
                    s = r + d; // fix up pointers
                    e = strchr(s, ';');
                }

                memmove(s + c, e + 1, strlen(e)); // shift rest of string
                strncpy(s, ent[b], c); // copy in replacement text (no NUL)
            }
            else s++; // not a known entity
        }
        else if ((t == ' ' || t == '*') && isspace((unsigned char)*s))
            *(s++) = ' ';
        else s++; // no decoding needed
    }

    if (t == '*') { // normalize spaces for non-cdata attributes
        for (s = r; *s; ) {
            // drop a run of spaces, then step over one word
            if ((l = strspn(s, " "))) memmove(s, s + l, strlen(s + l) + 1);
            while (*s && *s != ' ') s++;
            if (*s) s++; // keep one separating space; never step past the NUL
        }
        if (s > r && s[-1] == ' ') s[-1] = '\0'; // trim any trailing space
    }
    return r;
}
bosko001 | 2:45b351b4fc2a | 208 | |
bosko001 | 2:45b351b4fc2a | 209 | // called when parser finds start of new tag |
bosko001 | 2:45b351b4fc2a | 210 | void dxml_open_tag(dxml_root_t root, char *name, char **attr) |
bosko001 | 2:45b351b4fc2a | 211 | { |
bosko001 | 2:45b351b4fc2a | 212 | dxml_t xml = root->cur; |
bosko001 | 4:7abcf4543282 | 213 | |
bosko001 | 2:45b351b4fc2a | 214 | if (xml->name) xml = dxml_add_child(xml, name, strlen(xml->txt)); |
bosko001 | 2:45b351b4fc2a | 215 | else xml->name = name; // first open tag |
bosko001 | 2:45b351b4fc2a | 216 | |
bosko001 | 2:45b351b4fc2a | 217 | xml->attr = attr; |
bosko001 | 2:45b351b4fc2a | 218 | root->cur = xml; // update tag insertion point |
bosko001 | 2:45b351b4fc2a | 219 | } |
bosko001 | 2:45b351b4fc2a | 220 | |
bosko001 | 2:45b351b4fc2a | 221 | // called when parser finds character content between open and closing tag |
bosko001 | 2:45b351b4fc2a | 222 | void dxml_char_content(dxml_root_t root, char *s, size_t len, char t) |
bosko001 | 2:45b351b4fc2a | 223 | { |
bosko001 | 2:45b351b4fc2a | 224 | dxml_t xml = root->cur; |
bosko001 | 2:45b351b4fc2a | 225 | char *m = s; |
bosko001 | 2:45b351b4fc2a | 226 | size_t l; |
bosko001 | 2:45b351b4fc2a | 227 | |
bosko001 | 2:45b351b4fc2a | 228 | if (! xml || ! xml->name || ! len) return; // sanity check |
bosko001 | 2:45b351b4fc2a | 229 | |
bosko001 | 2:45b351b4fc2a | 230 | s[len] = '\0'; // null terminate text (calling functions anticipate this) |
bosko001 | 2:45b351b4fc2a | 231 | len = strlen(s = dxml_decode(s, root->ent, t)) + 1; |
bosko001 | 2:45b351b4fc2a | 232 | |
bosko001 | 2:45b351b4fc2a | 233 | if (! *(xml->txt)) xml->txt = s; // initial character content |
bosko001 | 2:45b351b4fc2a | 234 | else { // allocate our own memory and make a copy |
bosko001 | 2:45b351b4fc2a | 235 | xml->txt = (char*)((xml->flags & dxml_TXTM) // allocate some space |
bosko001 | 2:45b351b4fc2a | 236 | ? realloc(xml->txt, (l = strlen(xml->txt)) + len) |
bosko001 | 2:45b351b4fc2a | 237 | : strcpy((char*)malloc((l = strlen(xml->txt)) + len), xml->txt)); |
bosko001 | 2:45b351b4fc2a | 238 | strcpy(xml->txt + l, s); // add new char content |
bosko001 | 2:45b351b4fc2a | 239 | if (s != m) free(s); // free s if it was malloced by dxml_decode() |
bosko001 | 2:45b351b4fc2a | 240 | } |
bosko001 | 2:45b351b4fc2a | 241 | |
bosko001 | 2:45b351b4fc2a | 242 | if (xml->txt != m) dxml_set_flag(xml, dxml_TXTM); |
bosko001 | 2:45b351b4fc2a | 243 | } |
bosko001 | 2:45b351b4fc2a | 244 | |
bosko001 | 2:45b351b4fc2a | 245 | // called when parser finds closing tag |
bosko001 | 2:45b351b4fc2a | 246 | dxml_t dxml_close_tag(dxml_root_t root, char *name, char *s) |
bosko001 | 2:45b351b4fc2a | 247 | { |
bosko001 | 2:45b351b4fc2a | 248 | if (! root->cur || ! root->cur->name || strcmp(name, root->cur->name)) |
bosko001 | 2:45b351b4fc2a | 249 | return dxml_err(root, s, "unexpected closing tag </%s>", name); |
bosko001 | 2:45b351b4fc2a | 250 | |
bosko001 | 2:45b351b4fc2a | 251 | root->cur = root->cur->parent; |
bosko001 | 2:45b351b4fc2a | 252 | return NULL; |
bosko001 | 2:45b351b4fc2a | 253 | } |
bosko001 | 2:45b351b4fc2a | 254 | |
// Checks that expanding the text s can never lead back to entity `name`.
// Every '&' in s is examined: if the text after it starts with name the
// reference is circular; if it starts with a known entity from ent
// (alternating names and values), that entity's value is checked recursively.
// Returns non-zero when no circular reference is found, zero otherwise.
int dxml_ent_ok(char *name, char *s, char **ent)
{
    size_t name_len = strlen(name);
    char *amp;
    int k;

    for (amp = strchr(s, '&'); amp; amp = strchr(amp + 1, '&')) {
        if (strncmp(amp + 1, name, name_len) == 0) return 0; // circular ref.

        // look the referenced entity up and follow its replacement text
        k = 0;
        while (ent[k] && strncmp(ent[k], amp + 1, strlen(ent[k])) != 0) k += 2;
        if (ent[k] && ! dxml_ent_ok(name, ent[k + 1], ent)) return 0;
    }
    return 1; // ran out of references without meeting name
}
bosko001 | 2:45b351b4fc2a | 269 | |
// called when the parser finds a processing instruction
//
// s points at the text between "<?" and "?>" and len is its length; s[len] is
// overwritten with NUL. The text is split in place into a target (up to the
// first whitespace) and the instruction body that follows it.
//
// The "xml" target is not stored: it is only inspected for standalone "yes",
// which sets root->standalone. Any other instruction is appended to the
// root->pi entry for its target. Each entry is laid out as
//     [0] target, [1..j-1] instruction bodies, [j] NULL, [j+1] position string
// where the position string holds one character per instruction: '>' if
// root->xml.name was already set when the instruction was seen, '<' if not.
// The stored pointers refer into the caller's buffer; nothing is copied.
void dxml_proc_inst(dxml_root_t root, char *s, size_t len)
{
    int i = 0, j = 1;
    char *target = s;

    s[len] = '\0'; // null terminate instruction
    if (*(s += strcspn(s, dxml_WS))) {
        *s = '\0'; // null terminate target
        s += strspn(s + 1, dxml_WS) + 1; // skip whitespace after target
    }
    // if no whitespace was found, s is left on the terminator (empty body)

    if (! strcmp(target, "xml")) { // <?xml ... ?>
        // skip whitespace, '=' and quotes after "standalone", then expect "yes"
        if ((s = strstr(s, "standalone")) && ! strncmp(s + strspn(s + 10,
            dxml_WS "='\"") + 10, "yes", 3)) root->standalone = 1;
        return;
    }

    // an empty list is replaced by a freshly allocated one before growing it
    // NOTE(review): presumably the initial list is the shared dxml_NIL, which
    // must not be passed to realloc -- confirm against dxml_new()
    if (! root->pi[0]) *(root->pi = (char***)malloc(sizeof(char **))) = NULL; //first pi

    while (root->pi[i] && strcmp(target, root->pi[i][0])) i++; // find target
    if (! root->pi[i]) { // new target
        root->pi = (char***)realloc(root->pi, sizeof(char **) * (i + 2));
        root->pi[i] = (char**)malloc(sizeof(char *) * 3);
        root->pi[i][0] = target;
        root->pi[i][1] = (char *)(root->pi[i + 1] = NULL); // terminate pi list
        root->pi[i][2] = strdup(""); // empty document position list
    }

    while (root->pi[i][j]) j++; // find end of instruction list for this target
    // make room for one more body; the statements below rely on this order:
    // first move and grow the position string (old slot j+1 -> new slot j+2),
    // then write the new terminator, then store the body in slot j
    root->pi[i] = (char**)realloc(root->pi[i], sizeof(char *) * (j + 3));
    root->pi[i][j + 2] = (char*)realloc(root->pi[i][j + 1], j + 1);
    // offset j - 1 is the old terminator of the position string
    strcpy(root->pi[i][j + 2] + j - 1, (root->xml.name) ? ">" : "<");
    root->pi[i][j + 1] = NULL; // null terminate pi list for this target
    root->pi[i][j] = s; // set instruction
}
bosko001 | 2:45b351b4fc2a | 306 | |
bosko001 | 2:45b351b4fc2a | 307 | // called when the parser finds an internal doctype subset |
bosko001 | 2:45b351b4fc2a | 308 | short dxml_internal_dtd(dxml_root_t root, char *s, size_t len) |
bosko001 | 2:45b351b4fc2a | 309 | { |
bosko001 | 2:45b351b4fc2a | 310 | char q, *c, *t, *n = NULL, *v, **ent, **pe; |
bosko001 | 2:45b351b4fc2a | 311 | int i, j; |
bosko001 | 4:7abcf4543282 | 312 | |
bosko001 | 2:45b351b4fc2a | 313 | pe = (char**)memcpy(malloc(sizeof(dxml_NIL)), dxml_NIL, sizeof(dxml_NIL)); |
bosko001 | 2:45b351b4fc2a | 314 | |
bosko001 | 2:45b351b4fc2a | 315 | for (s[len] = '\0'; s; ) { |
bosko001 | 2:45b351b4fc2a | 316 | while (*s && *s != '<' && *s != '%') s++; // find next declaration |
bosko001 | 2:45b351b4fc2a | 317 | |
bosko001 | 2:45b351b4fc2a | 318 | if (! *s) break; |
bosko001 | 2:45b351b4fc2a | 319 | else if (! strncmp(s, "<!ENTITY", 8)) { // parse entity definitions |
bosko001 | 2:45b351b4fc2a | 320 | c = s += strspn(s + 8, dxml_WS) + 8; // skip white space separator |
bosko001 | 2:45b351b4fc2a | 321 | n = s + strspn(s, dxml_WS "%"); // find name |
bosko001 | 2:45b351b4fc2a | 322 | *(s = n + strcspn(n, dxml_WS)) = ';'; // append ; to name |
bosko001 | 2:45b351b4fc2a | 323 | |
bosko001 | 2:45b351b4fc2a | 324 | v = s + strspn(s + 1, dxml_WS) + 1; // find value |
bosko001 | 2:45b351b4fc2a | 325 | if ((q = *(v++)) != '"' && q != '\'') { // skip externals |
bosko001 | 2:45b351b4fc2a | 326 | s = strchr(s, '>'); |
bosko001 | 2:45b351b4fc2a | 327 | continue; |
bosko001 | 2:45b351b4fc2a | 328 | } |
bosko001 | 2:45b351b4fc2a | 329 | |
bosko001 | 2:45b351b4fc2a | 330 | for (i = 0, ent = (*c == '%') ? pe : root->ent; ent[i]; i++); |
bosko001 | 2:45b351b4fc2a | 331 | ent = (char**)realloc(ent, (i + 3) * sizeof(char *)); // space for next ent |
bosko001 | 2:45b351b4fc2a | 332 | if (*c == '%') pe = ent; |
bosko001 | 2:45b351b4fc2a | 333 | else root->ent = ent; |
bosko001 | 2:45b351b4fc2a | 334 | |
bosko001 | 2:45b351b4fc2a | 335 | *(++s) = '\0'; // null terminate name |
bosko001 | 2:45b351b4fc2a | 336 | if ((s = strchr(v, q))) *(s++) = '\0'; // null terminate value |
bosko001 | 2:45b351b4fc2a | 337 | ent[i + 1] = dxml_decode(v, pe, '%'); // set value |
bosko001 | 2:45b351b4fc2a | 338 | ent[i + 2] = NULL; // null terminate entity list |
bosko001 | 2:45b351b4fc2a | 339 | if (! dxml_ent_ok(n, ent[i + 1], ent)) { // circular reference |
bosko001 | 2:45b351b4fc2a | 340 | if (ent[i + 1] != v) free(ent[i + 1]); |
bosko001 | 2:45b351b4fc2a | 341 | dxml_err(root, v, "circular entity declaration &%s", n); |
bosko001 | 2:45b351b4fc2a | 342 | break; |
bosko001 | 2:45b351b4fc2a | 343 | } |
bosko001 | 2:45b351b4fc2a | 344 | else ent[i] = n; // set entity name |
bosko001 | 2:45b351b4fc2a | 345 | } |
bosko001 | 2:45b351b4fc2a | 346 | else if (! strncmp(s, "<!ATTLIST", 9)) { // parse default attributes |
bosko001 | 2:45b351b4fc2a | 347 | t = s + strspn(s + 9, dxml_WS) + 9; // skip whitespace separator |
bosko001 | 2:45b351b4fc2a | 348 | if (! *t) { dxml_err(root, t, "unclosed <!ATTLIST"); break; } |
bosko001 | 2:45b351b4fc2a | 349 | if (*(s = t + strcspn(t, dxml_WS ">")) == '>') continue; |
bosko001 | 2:45b351b4fc2a | 350 | else *s = '\0'; // null terminate tag name |
bosko001 | 2:45b351b4fc2a | 351 | for (i = 0; root->attr[i] && strcmp(n, root->attr[i][0]); i++); |
bosko001 | 2:45b351b4fc2a | 352 | |
bosko001 | 2:45b351b4fc2a | 353 | while (*(n = ++s + strspn(s, dxml_WS)) && *n != '>') { |
bosko001 | 2:45b351b4fc2a | 354 | if (*(s = n + strcspn(n, dxml_WS))) *s = '\0'; // attr name |
bosko001 | 2:45b351b4fc2a | 355 | else { dxml_err(root, t, "malformed <!ATTLIST"); break; } |
bosko001 | 2:45b351b4fc2a | 356 | |
bosko001 | 2:45b351b4fc2a | 357 | s += strspn(s + 1, dxml_WS) + 1; // find next token |
bosko001 | 2:45b351b4fc2a | 358 | c = strdup((strncmp(s, "CDATA", 5)) ? "*" : " "); // is it cdata? |
bosko001 | 2:45b351b4fc2a | 359 | if (! strncmp(s, "NOTATION", 8)) |
bosko001 | 2:45b351b4fc2a | 360 | s += strspn(s + 8, dxml_WS) + 8; |
bosko001 | 2:45b351b4fc2a | 361 | s = (*s == '(') ? strchr(s, ')') : s + strcspn(s, dxml_WS); |
bosko001 | 2:45b351b4fc2a | 362 | if (! s) { dxml_err(root, t, "malformed <!ATTLIST"); break; } |
bosko001 | 2:45b351b4fc2a | 363 | |
bosko001 | 2:45b351b4fc2a | 364 | s += strspn(s, dxml_WS ")"); // skip white space separator |
bosko001 | 2:45b351b4fc2a | 365 | if (! strncmp(s, "#FIXED", 6)) |
bosko001 | 2:45b351b4fc2a | 366 | s += strspn(s + 6, dxml_WS) + 6; |
bosko001 | 2:45b351b4fc2a | 367 | if (*s == '#') { // no default value |
bosko001 | 2:45b351b4fc2a | 368 | s += strcspn(s, dxml_WS ">") - 1; |
bosko001 | 2:45b351b4fc2a | 369 | if (*c == ' ') continue; // cdata is default, nothing to do |
bosko001 | 2:45b351b4fc2a | 370 | v = NULL; |
bosko001 | 2:45b351b4fc2a | 371 | } |
bosko001 | 2:45b351b4fc2a | 372 | else if ((*s == '"' || *s == '\'') && // default value |
bosko001 | 2:45b351b4fc2a | 373 | (s = strchr(v = s + 1, *s))) *s = '\0'; |
bosko001 | 2:45b351b4fc2a | 374 | else { dxml_err(root, t, "malformed <!ATTLIST"); break; } |
bosko001 | 2:45b351b4fc2a | 375 | |
bosko001 | 2:45b351b4fc2a | 376 | if (! root->attr[i]) { // new tag name |
bosko001 | 2:45b351b4fc2a | 377 | root->attr = (char***)((! i) ? malloc(2 * sizeof(char **)) |
bosko001 | 2:45b351b4fc2a | 378 | : realloc(root->attr, |
bosko001 | 2:45b351b4fc2a | 379 | (i + 2) * sizeof(char **))); |
bosko001 | 2:45b351b4fc2a | 380 | root->attr[i] = (char**)malloc(2 * sizeof(char *)); |
bosko001 | 2:45b351b4fc2a | 381 | root->attr[i][0] = t; // set tag name |
bosko001 | 2:45b351b4fc2a | 382 | root->attr[i][1] = (char *)(root->attr[i + 1] = NULL); |
bosko001 | 2:45b351b4fc2a | 383 | } |
bosko001 | 2:45b351b4fc2a | 384 | |
bosko001 | 2:45b351b4fc2a | 385 | for (j = 1; root->attr[i][j]; j += 3); // find end of list |
bosko001 | 2:45b351b4fc2a | 386 | root->attr[i] = (char**)realloc(root->attr[i], |
bosko001 | 2:45b351b4fc2a | 387 | (j + 4) * sizeof(char *)); |
bosko001 | 2:45b351b4fc2a | 388 | |
bosko001 | 2:45b351b4fc2a | 389 | root->attr[i][j + 3] = NULL; // null terminate list |
bosko001 | 2:45b351b4fc2a | 390 | root->attr[i][j + 2] = c; // is it cdata? |
bosko001 | 2:45b351b4fc2a | 391 | root->attr[i][j + 1] = (v) ? dxml_decode(v, root->ent, *c) |
bosko001 | 2:45b351b4fc2a | 392 | : NULL; |
bosko001 | 4:7abcf4543282 | 393 | root->attr[i][j] = n; // attribute name |
bosko001 | 2:45b351b4fc2a | 394 | } |
bosko001 | 2:45b351b4fc2a | 395 | } |
bosko001 | 2:45b351b4fc2a | 396 | else if (! strncmp(s, "<!--", 4)) s = strstr(s + 4, "-->"); // comments |
bosko001 | 2:45b351b4fc2a | 397 | else if (! strncmp(s, "<?", 2)) { // processing instructions |
bosko001 | 2:45b351b4fc2a | 398 | if ((s = strstr(c = s + 2, "?>"))) |
bosko001 | 2:45b351b4fc2a | 399 | dxml_proc_inst(root, c, s++ - c); |
bosko001 | 2:45b351b4fc2a | 400 | } |
bosko001 | 2:45b351b4fc2a | 401 | else if (*s == '<') s = strchr(s, '>'); // skip other declarations |
bosko001 | 2:45b351b4fc2a | 402 | else if (*(s++) == '%' && ! root->standalone) break; |
bosko001 | 2:45b351b4fc2a | 403 | } |
bosko001 | 2:45b351b4fc2a | 404 | |
bosko001 | 2:45b351b4fc2a | 405 | free(pe); |
bosko001 | 2:45b351b4fc2a | 406 | return ! *root->err; |
bosko001 | 2:45b351b4fc2a | 407 | } |
bosko001 | 2:45b351b4fc2a | 408 | |
bosko001 | 2:45b351b4fc2a | 409 | // Converts a UTF-16 string to UTF-8. Returns a new string that must be freed |
bosko001 | 2:45b351b4fc2a | 410 | // or NULL if no conversion was needed. |
bosko001 | 2:45b351b4fc2a | 411 | char *dxml_str2utf8(char **s, size_t *len) |
bosko001 | 2:45b351b4fc2a | 412 | { |
bosko001 | 2:45b351b4fc2a | 413 | char *u; |
bosko001 | 2:45b351b4fc2a | 414 | size_t l = 0, sl, max = *len; |
bosko001 | 2:45b351b4fc2a | 415 | long c, d; |
bosko001 | 2:45b351b4fc2a | 416 | int b, be = (**s == '\xFE') ? 1 : (**s == '\xFF') ? 0 : -1; |
bosko001 | 2:45b351b4fc2a | 417 | |
bosko001 | 2:45b351b4fc2a | 418 | if (be == -1) return NULL; // not UTF-16 |
bosko001 | 2:45b351b4fc2a | 419 | |
bosko001 | 2:45b351b4fc2a | 420 | u = (char*)malloc(max); |
bosko001 | 2:45b351b4fc2a | 421 | for (sl = 2; sl < *len - 1; sl += 2) { |
bosko001 | 2:45b351b4fc2a | 422 | c = (be) ? (((*s)[sl] & 0xFF) << 8) | ((*s)[sl + 1] & 0xFF) //UTF-16BE |
bosko001 | 2:45b351b4fc2a | 423 | : (((*s)[sl + 1] & 0xFF) << 8) | ((*s)[sl] & 0xFF); //UTF-16LE |
bosko001 | 2:45b351b4fc2a | 424 | if (c >= 0xD800 && c <= 0xDFFF && (sl += 2) < *len - 1) { // high-half |
bosko001 | 2:45b351b4fc2a | 425 | d = (be) ? (((*s)[sl] & 0xFF) << 8) | ((*s)[sl + 1] & 0xFF) |
bosko001 | 2:45b351b4fc2a | 426 | : (((*s)[sl + 1] & 0xFF) << 8) | ((*s)[sl] & 0xFF); |
bosko001 | 2:45b351b4fc2a | 427 | c = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000; |
bosko001 | 2:45b351b4fc2a | 428 | } |
bosko001 | 2:45b351b4fc2a | 429 | |
bosko001 | 2:45b351b4fc2a | 430 | while (l + 6 > max) u = (char*)realloc(u, max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 431 | if (c < 0x80) u[l++] = c; // US-ASCII subset |
bosko001 | 2:45b351b4fc2a | 432 | else { // multi-byte UTF-8 sequence |
bosko001 | 2:45b351b4fc2a | 433 | for (b = 0, d = c; d; d /= 2) b++; // bits in c |
bosko001 | 2:45b351b4fc2a | 434 | b = (b - 2) / 5; // bytes in payload |
bosko001 | 2:45b351b4fc2a | 435 | u[l++] = (0xFF << (7 - b)) | (c >> (6 * b)); // head |
bosko001 | 2:45b351b4fc2a | 436 | while (b) u[l++] = 0x80 | ((c >> (6 * --b)) & 0x3F); // payload |
bosko001 | 2:45b351b4fc2a | 437 | } |
bosko001 | 2:45b351b4fc2a | 438 | } |
bosko001 | 2:45b351b4fc2a | 439 | return *s = (char*)realloc(u, *len = l); |
bosko001 | 2:45b351b4fc2a | 440 | } |
bosko001 | 2:45b351b4fc2a | 441 | |
bosko001 | 2:45b351b4fc2a | 442 | // frees a tag attribute list |
bosko001 | 2:45b351b4fc2a | 443 | void dxml_free_attr(char **attr) { |
bosko001 | 2:45b351b4fc2a | 444 | int i = 0; |
bosko001 | 2:45b351b4fc2a | 445 | char *m; |
bosko001 | 4:7abcf4543282 | 446 | |
bosko001 | 2:45b351b4fc2a | 447 | if (! attr || attr == dxml_NIL) return; // nothing to free |
bosko001 | 2:45b351b4fc2a | 448 | while (attr[i]) i += 2; // find end of attribute list |
bosko001 | 2:45b351b4fc2a | 449 | m = attr[i + 1]; // list of which names and values are malloced |
bosko001 | 2:45b351b4fc2a | 450 | for (i = 0; m[i]; i++) { |
bosko001 | 2:45b351b4fc2a | 451 | if (m[i] & dxml_NAMEM) free(attr[i * 2]); |
bosko001 | 2:45b351b4fc2a | 452 | if (m[i] & dxml_TXTM) free(attr[(i * 2) + 1]); |
bosko001 | 2:45b351b4fc2a | 453 | } |
bosko001 | 2:45b351b4fc2a | 454 | free(m); |
bosko001 | 2:45b351b4fc2a | 455 | free(attr); |
bosko001 | 2:45b351b4fc2a | 456 | } |
bosko001 | 2:45b351b4fc2a | 457 | |
bosko001 | 2:45b351b4fc2a | 458 | // parse the given xml string and return an dxml structure |
bosko001 | 2:45b351b4fc2a | 459 | dxml_t dxml_parse_str(char *s, size_t len) |
bosko001 | 2:45b351b4fc2a | 460 | { |
bosko001 | 2:45b351b4fc2a | 461 | dxml_root_t root = (dxml_root_t)dxml_new(NULL); |
bosko001 | 2:45b351b4fc2a | 462 | char q, e, *d, **attr, **a = NULL; // initialize a to avoid compile warning |
bosko001 | 2:45b351b4fc2a | 463 | int l, i, j; |
bosko001 | 2:45b351b4fc2a | 464 | |
bosko001 | 2:45b351b4fc2a | 465 | root->m = s; |
bosko001 | 2:45b351b4fc2a | 466 | if (! len) return dxml_err(root, NULL, "root tag missing"); |
bosko001 | 2:45b351b4fc2a | 467 | root->u = dxml_str2utf8(&s, &len); // convert utf-16 to utf-8 |
bosko001 | 2:45b351b4fc2a | 468 | root->e = (root->s = s) + len; // record start and end of work area |
bosko001 | 4:7abcf4543282 | 469 | |
bosko001 | 2:45b351b4fc2a | 470 | e = s[len - 1]; // save end char |
bosko001 | 2:45b351b4fc2a | 471 | s[len - 1] = '\0'; // turn end char into null terminator |
bosko001 | 2:45b351b4fc2a | 472 | |
bosko001 | 2:45b351b4fc2a | 473 | while (*s && *s != '<') s++; // find first tag |
bosko001 | 2:45b351b4fc2a | 474 | if (! *s) return dxml_err(root, s, "root tag missing"); |
bosko001 | 2:45b351b4fc2a | 475 | |
bosko001 | 2:45b351b4fc2a | 476 | for (; ; ) { |
bosko001 | 2:45b351b4fc2a | 477 | attr = (char **)dxml_NIL; |
bosko001 | 2:45b351b4fc2a | 478 | d = ++s; |
bosko001 | 4:7abcf4543282 | 479 | |
bosko001 | 2:45b351b4fc2a | 480 | if (isalpha(*s) || *s == '_' || *s == ':' || *s < '\0') { // new tag |
bosko001 | 2:45b351b4fc2a | 481 | if (! root->cur) |
bosko001 | 2:45b351b4fc2a | 482 | return dxml_err(root, d, "markup outside of root element"); |
bosko001 | 2:45b351b4fc2a | 483 | |
bosko001 | 2:45b351b4fc2a | 484 | s += strcspn(s, dxml_WS "/>"); |
bosko001 | 2:45b351b4fc2a | 485 | while (isspace(*s)) *(s++) = '\0'; // null terminate tag name |
bosko001 | 4:7abcf4543282 | 486 | |
bosko001 | 2:45b351b4fc2a | 487 | if (*s && *s != '/' && *s != '>') // find tag in default attr list |
bosko001 | 2:45b351b4fc2a | 488 | for (i = 0; (a = root->attr[i]) && strcmp(a[0], d); i++); |
bosko001 | 2:45b351b4fc2a | 489 | |
bosko001 | 2:45b351b4fc2a | 490 | for (l = 0; *s && *s != '/' && *s != '>'; l += 2) { // new attrib |
bosko001 | 2:45b351b4fc2a | 491 | attr = (char**)((l) ? realloc(attr, (l + 4) * sizeof(char *)) |
bosko001 | 2:45b351b4fc2a | 492 | : malloc(4 * sizeof(char *))); // allocate space |
bosko001 | 2:45b351b4fc2a | 493 | attr[l + 3] = (char*)((l) ? realloc(attr[l + 1], (l / 2) + 2) |
bosko001 | 2:45b351b4fc2a | 494 | : malloc(2)); // mem for list of maloced vals |
bosko001 | 2:45b351b4fc2a | 495 | strcpy(attr[l + 3] + (l / 2), " "); // value is not malloced |
bosko001 | 2:45b351b4fc2a | 496 | attr[l + 2] = NULL; // null terminate list |
bosko001 | 2:45b351b4fc2a | 497 | attr[l + 1] = ""; // temporary attribute value |
bosko001 | 2:45b351b4fc2a | 498 | attr[l] = s; // set attribute name |
bosko001 | 2:45b351b4fc2a | 499 | |
bosko001 | 2:45b351b4fc2a | 500 | s += strcspn(s, dxml_WS "=/>"); |
bosko001 | 4:7abcf4543282 | 501 | if (*s == '=' || isspace(*s)) { |
bosko001 | 2:45b351b4fc2a | 502 | *(s++) = '\0'; // null terminate tag attribute name |
bosko001 | 2:45b351b4fc2a | 503 | q = *(s += strspn(s, dxml_WS "=")); |
bosko001 | 2:45b351b4fc2a | 504 | if (q == '"' || q == '\'') { // attribute value |
bosko001 | 2:45b351b4fc2a | 505 | attr[l + 1] = ++s; |
bosko001 | 2:45b351b4fc2a | 506 | while (*s && *s != q) s++; |
bosko001 | 2:45b351b4fc2a | 507 | if (*s) *(s++) = '\0'; // null terminate attribute val |
bosko001 | 2:45b351b4fc2a | 508 | else { |
bosko001 | 2:45b351b4fc2a | 509 | dxml_free_attr(attr); |
bosko001 | 2:45b351b4fc2a | 510 | return dxml_err(root, d, "missing %c", q); |
bosko001 | 2:45b351b4fc2a | 511 | } |
bosko001 | 2:45b351b4fc2a | 512 | |
bosko001 | 2:45b351b4fc2a | 513 | for (j = 1; a && a[j] && strcmp(a[j], attr[l]); j +=3); |
bosko001 | 2:45b351b4fc2a | 514 | attr[l + 1] = dxml_decode(attr[l + 1], root->ent, (a |
bosko001 | 2:45b351b4fc2a | 515 | && a[j]) ? *a[j + 2] : ' '); |
bosko001 | 2:45b351b4fc2a | 516 | if (attr[l + 1] < d || attr[l + 1] > s) |
bosko001 | 2:45b351b4fc2a | 517 | attr[l + 3][l / 2] = dxml_TXTM; // value malloced |
bosko001 | 2:45b351b4fc2a | 518 | } |
bosko001 | 2:45b351b4fc2a | 519 | } |
bosko001 | 2:45b351b4fc2a | 520 | while (isspace(*s)) s++; |
bosko001 | 2:45b351b4fc2a | 521 | } |
bosko001 | 2:45b351b4fc2a | 522 | |
bosko001 | 2:45b351b4fc2a | 523 | if (*s == '/') { // self closing tag |
bosko001 | 2:45b351b4fc2a | 524 | *(s++) = '\0'; |
bosko001 | 2:45b351b4fc2a | 525 | if ((*s && *s != '>') || (! *s && e != '>')) { |
bosko001 | 2:45b351b4fc2a | 526 | if (l) dxml_free_attr(attr); |
bosko001 | 2:45b351b4fc2a | 527 | return dxml_err(root, d, "missing >"); |
bosko001 | 2:45b351b4fc2a | 528 | } |
bosko001 | 2:45b351b4fc2a | 529 | dxml_open_tag(root, d, attr); |
bosko001 | 2:45b351b4fc2a | 530 | dxml_close_tag(root, d, s); |
bosko001 | 2:45b351b4fc2a | 531 | } |
bosko001 | 2:45b351b4fc2a | 532 | else if ((q = *s) == '>' || (! *s && e == '>')) { // open tag |
bosko001 | 2:45b351b4fc2a | 533 | *s = '\0'; // temporarily null terminate tag name |
bosko001 | 2:45b351b4fc2a | 534 | dxml_open_tag(root, d, attr); |
bosko001 | 2:45b351b4fc2a | 535 | *s = q; |
bosko001 | 2:45b351b4fc2a | 536 | } |
bosko001 | 2:45b351b4fc2a | 537 | else { |
bosko001 | 2:45b351b4fc2a | 538 | if (l) dxml_free_attr(attr); |
bosko001 | 4:7abcf4543282 | 539 | return dxml_err(root, d, "missing >"); |
bosko001 | 2:45b351b4fc2a | 540 | } |
bosko001 | 2:45b351b4fc2a | 541 | } |
bosko001 | 2:45b351b4fc2a | 542 | else if (*s == '/') { // close tag |
bosko001 | 2:45b351b4fc2a | 543 | s += strcspn(d = s + 1, dxml_WS ">") + 1; |
bosko001 | 2:45b351b4fc2a | 544 | if (! (q = *s) && e != '>') return dxml_err(root, d, "missing >"); |
bosko001 | 2:45b351b4fc2a | 545 | *s = '\0'; // temporarily null terminate tag name |
bosko001 | 2:45b351b4fc2a | 546 | if (dxml_close_tag(root, d, s)) return &root->xml; |
bosko001 | 2:45b351b4fc2a | 547 | if (isspace(*s = q)) s += strspn(s, dxml_WS); |
bosko001 | 2:45b351b4fc2a | 548 | } |
bosko001 | 2:45b351b4fc2a | 549 | else if (! strncmp(s, "!--", 3)) { // xml comment |
bosko001 | 2:45b351b4fc2a | 550 | if (! (s = strstr(s + 3, "--")) || (*(s += 2) != '>' && *s) || |
bosko001 | 2:45b351b4fc2a | 551 | (! *s && e != '>')) return dxml_err(root, d, "unclosed <!--"); |
bosko001 | 2:45b351b4fc2a | 552 | } |
bosko001 | 2:45b351b4fc2a | 553 | else if (! strncmp(s, "![CDATA[", 8)) { // cdata |
bosko001 | 2:45b351b4fc2a | 554 | if ((s = strstr(s, "]]>"))) |
bosko001 | 2:45b351b4fc2a | 555 | dxml_char_content(root, d + 8, (s += 2) - d - 10, 'c'); |
bosko001 | 2:45b351b4fc2a | 556 | else return dxml_err(root, d, "unclosed <![CDATA["); |
bosko001 | 2:45b351b4fc2a | 557 | } |
bosko001 | 2:45b351b4fc2a | 558 | else if (! strncmp(s, "!DOCTYPE", 8)) { // dtd |
bosko001 | 4:7abcf4543282 | 559 | for (l = 0; *s && ((! l && *s != '>') || (l && (*s != ']' || |
bosko001 | 2:45b351b4fc2a | 560 | *(s + strspn(s + 1, dxml_WS) + 1) != '>'))); |
bosko001 | 2:45b351b4fc2a | 561 | l = (*s == '[') ? 1 : l) s += strcspn(s + 1, "[]>") + 1; |
bosko001 | 2:45b351b4fc2a | 562 | if (! *s && e != '>') |
bosko001 | 2:45b351b4fc2a | 563 | return dxml_err(root, d, "unclosed <!DOCTYPE"); |
bosko001 | 2:45b351b4fc2a | 564 | d = (l) ? strchr(d, '[') + 1 : d; |
bosko001 | 2:45b351b4fc2a | 565 | if (l && ! dxml_internal_dtd(root, d, s++ - d)) return &root->xml; |
bosko001 | 2:45b351b4fc2a | 566 | } |
bosko001 | 2:45b351b4fc2a | 567 | else if (*s == '?') { // <?...?> processing instructions |
bosko001 | 2:45b351b4fc2a | 568 | do { s = strchr(s, '?'); } while (s && *(++s) && *s != '>'); |
bosko001 | 4:7abcf4543282 | 569 | if (! s || (! *s && e != '>')) |
bosko001 | 2:45b351b4fc2a | 570 | return dxml_err(root, d, "unclosed <?"); |
bosko001 | 2:45b351b4fc2a | 571 | else dxml_proc_inst(root, d + 1, s - d - 2); |
bosko001 | 2:45b351b4fc2a | 572 | } |
bosko001 | 2:45b351b4fc2a | 573 | else return dxml_err(root, d, "unexpected <"); |
bosko001 | 4:7abcf4543282 | 574 | |
bosko001 | 2:45b351b4fc2a | 575 | if (! s || ! *s) break; |
bosko001 | 2:45b351b4fc2a | 576 | *s = '\0'; |
bosko001 | 2:45b351b4fc2a | 577 | d = ++s; |
bosko001 | 2:45b351b4fc2a | 578 | if (*s && *s != '<') { // tag character content |
bosko001 | 2:45b351b4fc2a | 579 | while (*s && *s != '<') s++; |
bosko001 | 2:45b351b4fc2a | 580 | if (*s) dxml_char_content(root, d, s - d, '&'); |
bosko001 | 2:45b351b4fc2a | 581 | else break; |
bosko001 | 2:45b351b4fc2a | 582 | } |
bosko001 | 2:45b351b4fc2a | 583 | else if (! *s) break; |
bosko001 | 2:45b351b4fc2a | 584 | } |
bosko001 | 2:45b351b4fc2a | 585 | |
bosko001 | 2:45b351b4fc2a | 586 | if (! root->cur) return &root->xml; |
bosko001 | 2:45b351b4fc2a | 587 | else if (! root->cur->name) return dxml_err(root, d, "root tag missing"); |
bosko001 | 2:45b351b4fc2a | 588 | else return dxml_err(root, d, "unclosed tag <%s>", root->cur->name); |
bosko001 | 2:45b351b4fc2a | 589 | } |
bosko001 | 2:45b351b4fc2a | 590 | |
bosko001 | 2:45b351b4fc2a | 591 | // Wrapper for dxml_parse_str() that accepts a file stream. Reads the entire |
bosko001 | 2:45b351b4fc2a | 592 | // stream into memory and then parses it. For xml files, use dxml_parse_file() |
bosko001 | 2:45b351b4fc2a | 593 | // or dxml_parse_fd() |
bosko001 | 2:45b351b4fc2a | 594 | dxml_t dxml_parse_fp(FILE *fp) |
bosko001 | 2:45b351b4fc2a | 595 | { |
bosko001 | 2:45b351b4fc2a | 596 | dxml_root_t root; |
bosko001 | 2:45b351b4fc2a | 597 | size_t l, len = 0; |
bosko001 | 2:45b351b4fc2a | 598 | char *s; |
bosko001 | 2:45b351b4fc2a | 599 | |
bosko001 | 2:45b351b4fc2a | 600 | if (! (s = (char*)malloc(dxml_BUFSIZE))) return NULL; |
bosko001 | 2:45b351b4fc2a | 601 | do { |
bosko001 | 2:45b351b4fc2a | 602 | len += (l = fread((s + len), 1, dxml_BUFSIZE, fp)); |
bosko001 | 2:45b351b4fc2a | 603 | if (l == dxml_BUFSIZE) s = (char*)realloc(s, len + dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 604 | } while (s && l == dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 605 | |
bosko001 | 2:45b351b4fc2a | 606 | if (! s) return NULL; |
bosko001 | 2:45b351b4fc2a | 607 | root = (dxml_root_t)dxml_parse_str(s, len); |
bosko001 | 2:45b351b4fc2a | 608 | root->len = -1; // so we know to free s in dxml_free() |
bosko001 | 2:45b351b4fc2a | 609 | return &root->xml; |
bosko001 | 2:45b351b4fc2a | 610 | } |
bosko001 | 2:45b351b4fc2a | 611 | // |
bosko001 | 2:45b351b4fc2a | 612 | //// A wrapper for dxml_parse_str() that accepts a file descriptor. First |
bosko001 | 2:45b351b4fc2a | 613 | //// attempts to mem map the file. Failing that, reads the file into memory. |
bosko001 | 2:45b351b4fc2a | 614 | //// Returns NULL on failure. |
bosko001 | 2:45b351b4fc2a | 615 | //dxml_t dxml_parse_fd(int fd) |
bosko001 | 2:45b351b4fc2a | 616 | //{ |
bosko001 | 2:45b351b4fc2a | 617 | // dxml_root_t root; |
bosko001 | 2:45b351b4fc2a | 618 | // struct stat st; |
bosko001 | 2:45b351b4fc2a | 619 | // size_t l; |
bosko001 | 2:45b351b4fc2a | 620 | // void *m; |
bosko001 | 2:45b351b4fc2a | 621 | // |
bosko001 | 2:45b351b4fc2a | 622 | // if (fd < 0) return NULL; |
bosko001 | 2:45b351b4fc2a | 623 | // fstat(fd, &st); |
bosko001 | 2:45b351b4fc2a | 624 | // |
bosko001 | 2:45b351b4fc2a | 625 | //#ifndef dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 626 | // l = (st.st_size + sysconf(_SC_PAGESIZE) - 1) & ~(sysconf(_SC_PAGESIZE) -1); |
bosko001 | 2:45b351b4fc2a | 627 | // if ((m = mmap(NULL, l, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) != |
bosko001 | 2:45b351b4fc2a | 628 | // MAP_FAILED) { |
bosko001 | 2:45b351b4fc2a | 629 | // madvise(m, l, MADV_SEQUENTIAL); // optimize for sequential access |
bosko001 | 2:45b351b4fc2a | 630 | // root = (dxml_root_t)dxml_parse_str(m, st.st_size); |
bosko001 | 2:45b351b4fc2a | 631 | // madvise(m, root->len = l, MADV_NORMAL); // put it back to normal |
bosko001 | 2:45b351b4fc2a | 632 | // } |
bosko001 | 2:45b351b4fc2a | 633 | // else { // mmap failed, read file into memory |
bosko001 | 2:45b351b4fc2a | 634 | //#endif // dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 635 | // l = read(fd, m = malloc(st.st_size), st.st_size); |
bosko001 | 2:45b351b4fc2a | 636 | // root = (dxml_root_t)dxml_parse_str((char*)m, l); |
bosko001 | 2:45b351b4fc2a | 637 | // root->len = -1; // so we know to free s in dxml_free() |
bosko001 | 2:45b351b4fc2a | 638 | //#ifndef dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 639 | // } |
bosko001 | 2:45b351b4fc2a | 640 | //#endif // dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 641 | // return &root->xml; |
bosko001 | 2:45b351b4fc2a | 642 | //} |
bosko001 | 2:45b351b4fc2a | 643 | |
bosko001 | 2:45b351b4fc2a | 644 | // Encodes ampersand sequences appending the results to *dst, reallocating *dst |
bosko001 | 2:45b351b4fc2a | 645 | // if length excedes max. a is non-zero for attribute encoding. Returns *dst |
bosko001 | 2:45b351b4fc2a | 646 | char *dxml_ampencode(const char *s, size_t len, char **dst, size_t *dlen, |
bosko001 | 2:45b351b4fc2a | 647 | size_t *max, short a) |
bosko001 | 2:45b351b4fc2a | 648 | { |
bosko001 | 2:45b351b4fc2a | 649 | const char *e; |
bosko001 | 4:7abcf4543282 | 650 | |
bosko001 | 2:45b351b4fc2a | 651 | for (e = s + len; s != e; s++) { |
bosko001 | 2:45b351b4fc2a | 652 | while (*dlen + 10 > *max) *dst = (char*)realloc(*dst, *max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 653 | |
bosko001 | 2:45b351b4fc2a | 654 | switch (*s) { |
bosko001 | 2:45b351b4fc2a | 655 | case '\0': return *dst; |
bosko001 | 2:45b351b4fc2a | 656 | case '&': *dlen += sprintf(*dst + *dlen, "&"); break; |
bosko001 | 2:45b351b4fc2a | 657 | case '<': *dlen += sprintf(*dst + *dlen, "<"); break; |
bosko001 | 2:45b351b4fc2a | 658 | case '>': *dlen += sprintf(*dst + *dlen, ">"); break; |
bosko001 | 2:45b351b4fc2a | 659 | case '"': *dlen += sprintf(*dst + *dlen, (a) ? """ : "\""); break; |
bosko001 | 2:45b351b4fc2a | 660 | case '\n': *dlen += sprintf(*dst + *dlen, (a) ? "
" : "\n"); break; |
bosko001 | 2:45b351b4fc2a | 661 | case '\t': *dlen += sprintf(*dst + *dlen, (a) ? "	" : "\t"); break; |
bosko001 | 2:45b351b4fc2a | 662 | case '\r': *dlen += sprintf(*dst + *dlen, "
"); break; |
bosko001 | 2:45b351b4fc2a | 663 | default: (*dst)[(*dlen)++] = *s; |
bosko001 | 2:45b351b4fc2a | 664 | } |
bosko001 | 2:45b351b4fc2a | 665 | } |
bosko001 | 2:45b351b4fc2a | 666 | return *dst; |
bosko001 | 2:45b351b4fc2a | 667 | } |
bosko001 | 2:45b351b4fc2a | 668 | |
bosko001 | 2:45b351b4fc2a | 669 | // Recursively converts each tag to xml appending it to *s. Reallocates *s if |
bosko001 | 2:45b351b4fc2a | 670 | // its length excedes max. start is the location of the previous tag in the |
bosko001 | 2:45b351b4fc2a | 671 | // parent tag's character content. Returns *s. |
bosko001 | 2:45b351b4fc2a | 672 | char *dxml_toxml_r(dxml_t xml, char **s, size_t *len, size_t *max, |
bosko001 | 2:45b351b4fc2a | 673 | size_t start, char ***attr) |
bosko001 | 2:45b351b4fc2a | 674 | { |
bosko001 | 2:45b351b4fc2a | 675 | int i, j; |
bosko001 | 2:45b351b4fc2a | 676 | char *txt = (xml->parent) ? xml->parent->txt : strdup(""); |
bosko001 | 2:45b351b4fc2a | 677 | size_t off = 0; |
bosko001 | 2:45b351b4fc2a | 678 | |
bosko001 | 2:45b351b4fc2a | 679 | // parent character content up to this tag |
bosko001 | 2:45b351b4fc2a | 680 | *s = dxml_ampencode(txt + start, xml->off - start, s, len, max, 0); |
bosko001 | 2:45b351b4fc2a | 681 | |
bosko001 | 2:45b351b4fc2a | 682 | while (*len + strlen(xml->name) + 4 > *max) // reallocate s |
bosko001 | 2:45b351b4fc2a | 683 | *s = (char*)realloc(*s, *max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 684 | |
bosko001 | 2:45b351b4fc2a | 685 | *len += sprintf(*s + *len, "<%s", xml->name); // open tag |
bosko001 | 2:45b351b4fc2a | 686 | for (i = 0; xml->attr[i]; i += 2) { // tag attributes |
bosko001 | 2:45b351b4fc2a | 687 | if (dxml_attr(xml, xml->attr[i]) != xml->attr[i + 1]) continue; |
bosko001 | 2:45b351b4fc2a | 688 | while (*len + strlen(xml->attr[i]) + 7 > *max) // reallocate s |
bosko001 | 2:45b351b4fc2a | 689 | *s = (char*)realloc(*s, *max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 690 | |
bosko001 | 2:45b351b4fc2a | 691 | *len += sprintf(*s + *len, " %s=\"", xml->attr[i]); |
bosko001 | 2:45b351b4fc2a | 692 | dxml_ampencode(xml->attr[i + 1], -1, s, len, max, 1); |
bosko001 | 2:45b351b4fc2a | 693 | *len += sprintf(*s + *len, "\""); |
bosko001 | 2:45b351b4fc2a | 694 | } |
bosko001 | 2:45b351b4fc2a | 695 | |
bosko001 | 2:45b351b4fc2a | 696 | for (i = 0; attr[i] && strcmp(attr[i][0], xml->name); i++); |
bosko001 | 2:45b351b4fc2a | 697 | for (j = 1; attr[i] && attr[i][j]; j += 3) { // default attributes |
bosko001 | 2:45b351b4fc2a | 698 | if (! attr[i][j + 1] || dxml_attr(xml, attr[i][j]) != attr[i][j + 1]) |
bosko001 | 2:45b351b4fc2a | 699 | continue; // skip duplicates and non-values |
bosko001 | 2:45b351b4fc2a | 700 | while (*len + strlen(attr[i][j]) + 7 > *max) // reallocate s |
bosko001 | 2:45b351b4fc2a | 701 | *s = (char*)realloc(*s, *max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 702 | |
bosko001 | 2:45b351b4fc2a | 703 | *len += sprintf(*s + *len, " %s=\"", attr[i][j]); |
bosko001 | 2:45b351b4fc2a | 704 | dxml_ampencode(attr[i][j + 1], -1, s, len, max, 1); |
bosko001 | 2:45b351b4fc2a | 705 | *len += sprintf(*s + *len, "\""); |
bosko001 | 2:45b351b4fc2a | 706 | } |
bosko001 | 2:45b351b4fc2a | 707 | *len += sprintf(*s + *len, ">"); |
bosko001 | 2:45b351b4fc2a | 708 | |
bosko001 | 2:45b351b4fc2a | 709 | *s = (xml->child) ? dxml_toxml_r(xml->child, s, len, max, 0, attr) //child |
bosko001 | 2:45b351b4fc2a | 710 | : dxml_ampencode(xml->txt, -1, s, len, max, 0); //data |
bosko001 | 4:7abcf4543282 | 711 | |
bosko001 | 2:45b351b4fc2a | 712 | while (*len + strlen(xml->name) + 4 > *max) // reallocate s |
bosko001 | 2:45b351b4fc2a | 713 | *s = (char*)realloc(*s, *max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 714 | |
bosko001 | 2:45b351b4fc2a | 715 | *len += sprintf(*s + *len, "</%s>", xml->name); // close tag |
bosko001 | 2:45b351b4fc2a | 716 | |
bosko001 | 2:45b351b4fc2a | 717 | while (txt[off] && off < xml->off) off++; // make sure off is within bounds |
bosko001 | 2:45b351b4fc2a | 718 | return (xml->ordered) ? dxml_toxml_r(xml->ordered, s, len, max, off, attr) |
bosko001 | 2:45b351b4fc2a | 719 | : dxml_ampencode(txt + off, -1, s, len, max, 0); |
bosko001 | 2:45b351b4fc2a | 720 | } |
bosko001 | 2:45b351b4fc2a | 721 | |
bosko001 | 2:45b351b4fc2a | 722 | // Converts an dxml structure back to xml. Returns a string of xml data that |
bosko001 | 2:45b351b4fc2a | 723 | // must be freed. |
bosko001 | 2:45b351b4fc2a | 724 | char *dxml_toxml(dxml_t xml) |
bosko001 | 2:45b351b4fc2a | 725 | { |
bosko001 | 2:45b351b4fc2a | 726 | dxml_t p = (xml) ? xml->parent : NULL, o = (xml) ? xml->ordered : NULL; |
bosko001 | 2:45b351b4fc2a | 727 | dxml_root_t root = (dxml_root_t)xml; |
bosko001 | 2:45b351b4fc2a | 728 | size_t len = 0, max = dxml_BUFSIZE; |
bosko001 | 2:45b351b4fc2a | 729 | char *s = strcpy((char*)malloc(max), ""), *t, *n; |
bosko001 | 2:45b351b4fc2a | 730 | int i, j, k; |
bosko001 | 2:45b351b4fc2a | 731 | |
bosko001 | 2:45b351b4fc2a | 732 | if (! xml || ! xml->name) return (char*)realloc(s, len + 1); |
bosko001 | 2:45b351b4fc2a | 733 | while (root->xml.parent) root = (dxml_root_t)root->xml.parent; // root tag |
bosko001 | 2:45b351b4fc2a | 734 | |
bosko001 | 2:45b351b4fc2a | 735 | for (i = 0; ! p && root->pi[i]; i++) { // pre-root processing instructions |
bosko001 | 2:45b351b4fc2a | 736 | for (k = 2; root->pi[i][k - 1]; k++); |
bosko001 | 2:45b351b4fc2a | 737 | for (j = 1; (n = root->pi[i][j]); j++) { |
bosko001 | 2:45b351b4fc2a | 738 | if (root->pi[i][k][j - 1] == '>') continue; // not pre-root |
bosko001 | 2:45b351b4fc2a | 739 | while (len + strlen(t = root->pi[i][0]) + strlen(n) + 7 > max) |
bosko001 | 2:45b351b4fc2a | 740 | s = (char*)realloc(s, max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 741 | len += sprintf(s + len, "<?%s%s%s?>\n", t, *n ? " " : "", n); |
bosko001 | 2:45b351b4fc2a | 742 | } |
bosko001 | 2:45b351b4fc2a | 743 | } |
bosko001 | 2:45b351b4fc2a | 744 | |
bosko001 | 2:45b351b4fc2a | 745 | xml->parent = xml->ordered = NULL; |
bosko001 | 2:45b351b4fc2a | 746 | s = dxml_toxml_r(xml, &s, &len, &max, 0, root->attr); |
bosko001 | 2:45b351b4fc2a | 747 | xml->parent = p; |
bosko001 | 2:45b351b4fc2a | 748 | xml->ordered = o; |
bosko001 | 2:45b351b4fc2a | 749 | |
bosko001 | 2:45b351b4fc2a | 750 | for (i = 0; ! p && root->pi[i]; i++) { // post-root processing instructions |
bosko001 | 2:45b351b4fc2a | 751 | for (k = 2; root->pi[i][k - 1]; k++); |
bosko001 | 2:45b351b4fc2a | 752 | for (j = 1; (n = root->pi[i][j]); j++) { |
bosko001 | 2:45b351b4fc2a | 753 | if (root->pi[i][k][j - 1] == '<') continue; // not post-root |
bosko001 | 2:45b351b4fc2a | 754 | while (len + strlen(t = root->pi[i][0]) + strlen(n) + 7 > max) |
bosko001 | 2:45b351b4fc2a | 755 | s = (char*)realloc(s, max += dxml_BUFSIZE); |
bosko001 | 2:45b351b4fc2a | 756 | len += sprintf(s + len, "\n<?%s%s%s?>", t, *n ? " " : "", n); |
bosko001 | 2:45b351b4fc2a | 757 | } |
bosko001 | 2:45b351b4fc2a | 758 | } |
bosko001 | 2:45b351b4fc2a | 759 | return (char*)realloc(s, len + 1); |
bosko001 | 2:45b351b4fc2a | 760 | } |
bosko001 | 2:45b351b4fc2a | 761 | |
bosko001 | 2:45b351b4fc2a | 762 | // free the memory allocated for the dxml structure |
bosko001 | 2:45b351b4fc2a | 763 | void dxml_free(dxml_t xml) |
bosko001 | 2:45b351b4fc2a | 764 | { |
bosko001 | 2:45b351b4fc2a | 765 | dxml_root_t root = (dxml_root_t)xml; |
bosko001 | 2:45b351b4fc2a | 766 | int i, j; |
bosko001 | 2:45b351b4fc2a | 767 | char **a, *s; |
bosko001 | 2:45b351b4fc2a | 768 | |
bosko001 | 2:45b351b4fc2a | 769 | if (! xml) return; |
bosko001 | 2:45b351b4fc2a | 770 | dxml_free(xml->child); |
bosko001 | 2:45b351b4fc2a | 771 | dxml_free(xml->ordered); |
bosko001 | 2:45b351b4fc2a | 772 | |
bosko001 | 2:45b351b4fc2a | 773 | if (! xml->parent) { // free root tag allocations |
bosko001 | 2:45b351b4fc2a | 774 | for (i = 10; root->ent[i]; i += 2) // 0 - 9 are default entites (<>&"') |
bosko001 | 2:45b351b4fc2a | 775 | if ((s = root->ent[i + 1]) < root->s || s > root->e) free(s); |
bosko001 | 2:45b351b4fc2a | 776 | free(root->ent); // free list of general entities |
bosko001 | 2:45b351b4fc2a | 777 | |
bosko001 | 2:45b351b4fc2a | 778 | for (i = 0; (a = root->attr[i]); i++) { |
bosko001 | 2:45b351b4fc2a | 779 | for (j = 1; a[j++]; j += 2) // free malloced attribute values |
bosko001 | 2:45b351b4fc2a | 780 | if (a[j] && (a[j] < root->s || a[j] > root->e)) free(a[j]); |
bosko001 | 2:45b351b4fc2a | 781 | free(a); |
bosko001 | 2:45b351b4fc2a | 782 | } |
bosko001 | 2:45b351b4fc2a | 783 | if (root->attr[0]) free(root->attr); // free default attribute list |
bosko001 | 2:45b351b4fc2a | 784 | |
bosko001 | 2:45b351b4fc2a | 785 | for (i = 0; root->pi[i]; i++) { |
bosko001 | 2:45b351b4fc2a | 786 | for (j = 1; root->pi[i][j]; j++); |
bosko001 | 2:45b351b4fc2a | 787 | free(root->pi[i][j + 1]); |
bosko001 | 2:45b351b4fc2a | 788 | free(root->pi[i]); |
bosko001 | 4:7abcf4543282 | 789 | } |
bosko001 | 2:45b351b4fc2a | 790 | if (root->pi[0]) free(root->pi); // free processing instructions |
bosko001 | 2:45b351b4fc2a | 791 | |
bosko001 | 2:45b351b4fc2a | 792 | if (root->len == -1) free(root->m); // malloced xml data |
bosko001 | 2:45b351b4fc2a | 793 | #ifndef dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 794 | else if (root->len) munmap(root->m, root->len); // mem mapped xml data |
bosko001 | 2:45b351b4fc2a | 795 | #endif // dxml_NOMMAP |
bosko001 | 2:45b351b4fc2a | 796 | if (root->u) free(root->u); // utf8 conversion |
bosko001 | 2:45b351b4fc2a | 797 | } |
bosko001 | 2:45b351b4fc2a | 798 | |
bosko001 | 2:45b351b4fc2a | 799 | dxml_free_attr(xml->attr); // tag attributes |
bosko001 | 2:45b351b4fc2a | 800 | if ((xml->flags & dxml_TXTM)) free(xml->txt); // character content |
bosko001 | 2:45b351b4fc2a | 801 | if ((xml->flags & dxml_NAMEM)) free(xml->name); // tag name |
bosko001 | 2:45b351b4fc2a | 802 | free(xml); |
bosko001 | 2:45b351b4fc2a | 803 | } |
bosko001 | 2:45b351b4fc2a | 804 | |
bosko001 | 2:45b351b4fc2a | 805 | // return parser error message or empty string if none |
bosko001 | 2:45b351b4fc2a | 806 | const char *dxml_error(dxml_t xml) |
bosko001 | 2:45b351b4fc2a | 807 | { |
bosko001 | 2:45b351b4fc2a | 808 | while (xml && xml->parent) xml = xml->parent; // find root tag |
bosko001 | 2:45b351b4fc2a | 809 | return (xml) ? ((dxml_root_t)xml)->err : ""; |
bosko001 | 2:45b351b4fc2a | 810 | } |
bosko001 | 2:45b351b4fc2a | 811 | |
bosko001 | 2:45b351b4fc2a | 812 | // returns a new empty dxml structure with the given root tag name |
bosko001 | 2:45b351b4fc2a | 813 | dxml_t dxml_new(const char *name) |
bosko001 | 2:45b351b4fc2a | 814 | { |
bosko001 | 2:45b351b4fc2a | 815 | static char *ent[] = { "lt;", "<", "gt;", ">", "quot;", """, |
bosko001 | 2:45b351b4fc2a | 816 | "apos;", "'", "amp;", "&", NULL }; |
bosko001 | 4:7abcf4543282 | 817 | dxml_root_t root = (dxml_root_t)memset(malloc(sizeof(struct dxml_root)), |
bosko001 | 2:45b351b4fc2a | 818 | '\0', sizeof(struct dxml_root)); |
bosko001 | 2:45b351b4fc2a | 819 | root->xml.name = (char *)name; |
bosko001 | 2:45b351b4fc2a | 820 | root->cur = &root->xml; |
bosko001 | 2:45b351b4fc2a | 821 | strcpy(root->err, root->xml.txt = ""); |
bosko001 | 2:45b351b4fc2a | 822 | root->ent = (char**)memcpy(malloc(sizeof(ent)), ent, sizeof(ent)); |
bosko001 | 2:45b351b4fc2a | 823 | root->attr = root->pi = (char ***)(root->xml.attr = dxml_NIL); |
bosko001 | 2:45b351b4fc2a | 824 | return &root->xml; |
bosko001 | 2:45b351b4fc2a | 825 | } |
bosko001 | 2:45b351b4fc2a | 826 | |
bosko001 | 2:45b351b4fc2a | 827 | // inserts an existing tag into an dxml structure |
bosko001 | 2:45b351b4fc2a | 828 | dxml_t dxml_insert(dxml_t xml, dxml_t dest, size_t off) |
bosko001 | 2:45b351b4fc2a | 829 | { |
bosko001 | 2:45b351b4fc2a | 830 | dxml_t cur, prev, head; |
bosko001 | 2:45b351b4fc2a | 831 | |
bosko001 | 2:45b351b4fc2a | 832 | xml->next = xml->sibling = xml->ordered = NULL; |
bosko001 | 2:45b351b4fc2a | 833 | xml->off = off; |
bosko001 | 2:45b351b4fc2a | 834 | xml->parent = dest; |
bosko001 | 2:45b351b4fc2a | 835 | |
bosko001 | 2:45b351b4fc2a | 836 | if ((head = dest->child)) { // already have sub tags |
bosko001 | 2:45b351b4fc2a | 837 | if (head->off <= off) { // not first subtag |
bosko001 | 2:45b351b4fc2a | 838 | for (cur = head; cur->ordered && cur->ordered->off <= off; |
bosko001 | 2:45b351b4fc2a | 839 | cur = cur->ordered); |
bosko001 | 2:45b351b4fc2a | 840 | xml->ordered = cur->ordered; |
bosko001 | 2:45b351b4fc2a | 841 | cur->ordered = xml; |
bosko001 | 2:45b351b4fc2a | 842 | } |
bosko001 | 2:45b351b4fc2a | 843 | else { // first subtag |
bosko001 | 2:45b351b4fc2a | 844 | xml->ordered = head; |
bosko001 | 2:45b351b4fc2a | 845 | dest->child = xml; |
bosko001 | 2:45b351b4fc2a | 846 | } |
bosko001 | 2:45b351b4fc2a | 847 | |
bosko001 | 2:45b351b4fc2a | 848 | for (cur = head, prev = NULL; cur && strcmp(cur->name, xml->name); |
bosko001 | 2:45b351b4fc2a | 849 | prev = cur, cur = cur->sibling); // find tag type |
bosko001 | 2:45b351b4fc2a | 850 | if (cur && cur->off <= off) { // not first of type |
bosko001 | 2:45b351b4fc2a | 851 | while (cur->next && cur->next->off <= off) cur = cur->next; |
bosko001 | 2:45b351b4fc2a | 852 | xml->next = cur->next; |
bosko001 | 2:45b351b4fc2a | 853 | cur->next = xml; |
bosko001 | 2:45b351b4fc2a | 854 | } |
bosko001 | 2:45b351b4fc2a | 855 | else { // first tag of this type |
bosko001 | 2:45b351b4fc2a | 856 | if (prev && cur) prev->sibling = cur->sibling; // remove old first |
bosko001 | 2:45b351b4fc2a | 857 | xml->next = cur; // old first tag is now next |
bosko001 | 2:45b351b4fc2a | 858 | for (cur = head, prev = NULL; cur && cur->off <= off; |
bosko001 | 2:45b351b4fc2a | 859 | prev = cur, cur = cur->sibling); // new sibling insert point |
bosko001 | 2:45b351b4fc2a | 860 | xml->sibling = cur; |
bosko001 | 2:45b351b4fc2a | 861 | if (prev) prev->sibling = xml; |
bosko001 | 2:45b351b4fc2a | 862 | } |
bosko001 | 2:45b351b4fc2a | 863 | } |
bosko001 | 2:45b351b4fc2a | 864 | else dest->child = xml; // only sub tag |
bosko001 | 2:45b351b4fc2a | 865 | |
bosko001 | 2:45b351b4fc2a | 866 | return xml; |
bosko001 | 2:45b351b4fc2a | 867 | } |
bosko001 | 2:45b351b4fc2a | 868 | |
bosko001 | 2:45b351b4fc2a | 869 | // Adds a child tag. off is the offset of the child tag relative to the start |
bosko001 | 2:45b351b4fc2a | 870 | // of the parent tag's character content. Returns the child tag. |
bosko001 | 2:45b351b4fc2a | 871 | dxml_t dxml_add_child(dxml_t xml, const char *name, size_t off) |
bosko001 | 2:45b351b4fc2a | 872 | { |
bosko001 | 2:45b351b4fc2a | 873 | dxml_t child; |
bosko001 | 2:45b351b4fc2a | 874 | |
bosko001 | 2:45b351b4fc2a | 875 | if (! xml) return NULL; |
bosko001 | 2:45b351b4fc2a | 876 | child = (dxml_t)memset(malloc(sizeof(struct dxml)), '\0', |
bosko001 | 2:45b351b4fc2a | 877 | sizeof(struct dxml)); |
bosko001 | 2:45b351b4fc2a | 878 | child->name = (char *)name; |
bosko001 | 2:45b351b4fc2a | 879 | child->attr = dxml_NIL; |
bosko001 | 2:45b351b4fc2a | 880 | child->txt = ""; |
bosko001 | 2:45b351b4fc2a | 881 | |
bosko001 | 2:45b351b4fc2a | 882 | return dxml_insert(child, xml, off); |
bosko001 | 2:45b351b4fc2a | 883 | } |
bosko001 | 2:45b351b4fc2a | 884 | |
bosko001 | 2:45b351b4fc2a | 885 | // sets the character content for the given tag and returns the tag |
bosko001 | 2:45b351b4fc2a | 886 | dxml_t dxml_set_txt(dxml_t xml, const char *txt) |
bosko001 | 2:45b351b4fc2a | 887 | { |
bosko001 | 2:45b351b4fc2a | 888 | if (! xml) return NULL; |
bosko001 | 2:45b351b4fc2a | 889 | if (xml->flags & dxml_TXTM) free(xml->txt); // existing txt was malloced |
bosko001 | 2:45b351b4fc2a | 890 | xml->flags &= ~dxml_TXTM; |
bosko001 | 2:45b351b4fc2a | 891 | xml->txt = (char *)txt; |
bosko001 | 2:45b351b4fc2a | 892 | return xml; |
bosko001 | 2:45b351b4fc2a | 893 | } |
bosko001 | 2:45b351b4fc2a | 894 | |
bosko001 | 2:45b351b4fc2a | 895 | // Sets the given tag attribute or adds a new attribute if not found. A value |
bosko001 | 2:45b351b4fc2a | 896 | // of NULL will remove the specified attribute. Returns the tag given. |
bosko001 | 2:45b351b4fc2a | 897 | dxml_t dxml_set_attr(dxml_t xml, const char *name, const char *value) |
bosko001 | 2:45b351b4fc2a | 898 | { |
bosko001 | 2:45b351b4fc2a | 899 | int l = 0, c; |
bosko001 | 2:45b351b4fc2a | 900 | |
bosko001 | 2:45b351b4fc2a | 901 | if (! xml) return NULL; |
bosko001 | 2:45b351b4fc2a | 902 | while (xml->attr[l] && strcmp(xml->attr[l], name)) l += 2; |
bosko001 | 2:45b351b4fc2a | 903 | if (! xml->attr[l]) { // not found, add as new attribute |
bosko001 | 2:45b351b4fc2a | 904 | if (! value) return xml; // nothing to do |
bosko001 | 2:45b351b4fc2a | 905 | if (xml->attr == dxml_NIL) { // first attribute |
bosko001 | 2:45b351b4fc2a | 906 | xml->attr = (char**)malloc(4 * sizeof(char *)); |
bosko001 | 2:45b351b4fc2a | 907 | xml->attr[1] = strdup(""); // empty list of malloced names/vals |
bosko001 | 2:45b351b4fc2a | 908 | } |
bosko001 | 2:45b351b4fc2a | 909 | else xml->attr = (char**)realloc(xml->attr, (l + 4) * sizeof(char *)); |
bosko001 | 2:45b351b4fc2a | 910 | |
bosko001 | 2:45b351b4fc2a | 911 | xml->attr[l] = (char *)name; // set attribute name |
bosko001 | 2:45b351b4fc2a | 912 | xml->attr[l + 2] = NULL; // null terminate attribute list |
bosko001 | 2:45b351b4fc2a | 913 | xml->attr[l + 3] = (char*)realloc(xml->attr[l + 1], |
bosko001 | 2:45b351b4fc2a | 914 | (c = strlen(xml->attr[l + 1])) + 2); |
bosko001 | 2:45b351b4fc2a | 915 | strcpy(xml->attr[l + 3] + c, " "); // set name/value as not malloced |
bosko001 | 2:45b351b4fc2a | 916 | if (xml->flags & dxml_DUP) xml->attr[l + 3][c] = dxml_NAMEM; |
bosko001 | 2:45b351b4fc2a | 917 | } |
bosko001 | 2:45b351b4fc2a | 918 | else if (xml->flags & dxml_DUP) free((char *)name); // name was strduped |
bosko001 | 2:45b351b4fc2a | 919 | |
bosko001 | 2:45b351b4fc2a | 920 | for (c = l; xml->attr[c]; c += 2); // find end of attribute list |
bosko001 | 2:45b351b4fc2a | 921 | if (xml->attr[c + 1][l / 2] & dxml_TXTM) free(xml->attr[l + 1]); //old val |
bosko001 | 2:45b351b4fc2a | 922 | if (xml->flags & dxml_DUP) xml->attr[c + 1][l / 2] |= dxml_TXTM; |
bosko001 | 2:45b351b4fc2a | 923 | else xml->attr[c + 1][l / 2] &= ~dxml_TXTM; |
bosko001 | 2:45b351b4fc2a | 924 | |
bosko001 | 2:45b351b4fc2a | 925 | if (value) xml->attr[l + 1] = (char *)value; // set attribute value |
bosko001 | 2:45b351b4fc2a | 926 | else { // remove attribute |
bosko001 | 2:45b351b4fc2a | 927 | if (xml->attr[c + 1][l / 2] & dxml_NAMEM) free(xml->attr[l]); |
bosko001 | 4:7abcf4543282 | 928 | // memmove(xml->attr + l, xml->attr + l + 2, (c - l + 2) * sizeof(char*)); |
bosko001 | 4:7abcf4543282 | 929 | memmove(xml->attr + l, xml->attr + l + 2, (c - l) * sizeof(char*)); // promenio BL!!! |
bosko001 | 4:7abcf4543282 | 930 | c -= 2; // dodao BL!!! |
bosko001 | 2:45b351b4fc2a | 931 | xml->attr = (char**)realloc(xml->attr, (c + 2) * sizeof(char *)); |
bosko001 | 2:45b351b4fc2a | 932 | memmove(xml->attr[c + 1] + (l / 2), xml->attr[c + 1] + (l / 2) + 1, |
bosko001 | 4:7abcf4543282 | 933 | /*promenio BL!!! */ ((c + 2)/2) - (l / 2) /*(c / 2) - (l / 2)*/); // fix list of which name/vals are malloced |
bosko001 | 2:45b351b4fc2a | 934 | } |
bosko001 | 2:45b351b4fc2a | 935 | xml->flags &= ~dxml_DUP; // clear strdup() flag |
bosko001 | 2:45b351b4fc2a | 936 | return xml; |
bosko001 | 2:45b351b4fc2a | 937 | } |
bosko001 | 2:45b351b4fc2a | 938 | |
bosko001 | 2:45b351b4fc2a | 939 | // sets a flag for the given tag and returns the tag |
bosko001 | 2:45b351b4fc2a | 940 | dxml_t dxml_set_flag(dxml_t xml, short flag) |
bosko001 | 2:45b351b4fc2a | 941 | { |
bosko001 | 2:45b351b4fc2a | 942 | if (xml) xml->flags |= flag; |
bosko001 | 2:45b351b4fc2a | 943 | return xml; |
bosko001 | 2:45b351b4fc2a | 944 | } |
bosko001 | 2:45b351b4fc2a | 945 | |
bosko001 | 2:45b351b4fc2a | 946 | // removes a tag along with its subtags without freeing its memory |
bosko001 | 2:45b351b4fc2a | 947 | dxml_t dxml_cut(dxml_t xml) |
bosko001 | 2:45b351b4fc2a | 948 | { |
bosko001 | 2:45b351b4fc2a | 949 | dxml_t cur; |
bosko001 | 2:45b351b4fc2a | 950 | |
bosko001 | 2:45b351b4fc2a | 951 | if (! xml) return NULL; // nothing to do |
bosko001 | 2:45b351b4fc2a | 952 | if (xml->next) xml->next->sibling = xml->sibling; // patch sibling list |
bosko001 | 2:45b351b4fc2a | 953 | |
bosko001 | 2:45b351b4fc2a | 954 | if (xml->parent) { // not root tag |
bosko001 | 2:45b351b4fc2a | 955 | cur = xml->parent->child; // find head of subtag list |
bosko001 | 2:45b351b4fc2a | 956 | if (cur == xml) xml->parent->child = xml->ordered; // first subtag |
bosko001 | 2:45b351b4fc2a | 957 | else { // not first subtag |
bosko001 | 2:45b351b4fc2a | 958 | while (cur->ordered != xml) cur = cur->ordered; |
bosko001 | 2:45b351b4fc2a | 959 | cur->ordered = cur->ordered->ordered; // patch ordered list |
bosko001 | 2:45b351b4fc2a | 960 | |
bosko001 | 2:45b351b4fc2a | 961 | cur = xml->parent->child; // go back to head of subtag list |
bosko001 | 2:45b351b4fc2a | 962 | if (strcmp(cur->name, xml->name)) { // not in first sibling list |
bosko001 | 2:45b351b4fc2a | 963 | while (strcmp(cur->sibling->name, xml->name)) |
bosko001 | 2:45b351b4fc2a | 964 | cur = cur->sibling; |
bosko001 | 2:45b351b4fc2a | 965 | if (cur->sibling == xml) { // first of a sibling list |
bosko001 | 2:45b351b4fc2a | 966 | cur->sibling = (xml->next) ? xml->next |
bosko001 | 2:45b351b4fc2a | 967 | : cur->sibling->sibling; |
bosko001 | 2:45b351b4fc2a | 968 | } |
bosko001 | 2:45b351b4fc2a | 969 | else cur = cur->sibling; // not first of a sibling list |
bosko001 | 2:45b351b4fc2a | 970 | } |
bosko001 | 2:45b351b4fc2a | 971 | |
bosko001 | 2:45b351b4fc2a | 972 | while (cur->next && cur->next != xml) cur = cur->next; |
bosko001 | 2:45b351b4fc2a | 973 | if (cur->next) cur->next = cur->next->next; // patch next list |
bosko001 | 4:7abcf4543282 | 974 | } |
bosko001 | 2:45b351b4fc2a | 975 | } |
bosko001 | 2:45b351b4fc2a | 976 | xml->ordered = xml->sibling = xml->next = NULL; |
bosko001 | 2:45b351b4fc2a | 977 | return xml; |
bosko001 | 2:45b351b4fc2a | 978 | } |
bosko001 | 2:45b351b4fc2a | 979 | |
#ifdef dxml_TEST // test harness
// Command-line driver: parses the file named by the single argument, writes
// the result of dxml_toxml() to stdout and the string from dxml_error() to
// stderr. Exit status is 0 when nothing was written to stderr, 1 otherwise.
int main(int argc, char **argv)
{
    if (argc != 2) {
        // the usage path returns fprintf's result (characters written)
        return fprintf(stderr, "usage: %s xmlfile\n", argv[0]);
    }

    dxml_t doc = dxml_parse_file(argv[1]);

    char *text = dxml_toxml(doc);
    printf("%s\n", text);
    free(text);

    int written = fprintf(stderr, "%s", dxml_error(doc));
    dxml_free(doc);

    if (written) {
        return 1;
    }
    return 0;
}
#endif // dxml_TEST
bosko001 | 4:7abcf4543282 | 997 |