Wim van der Vegt / TINYXML

Dependents:   tinyxml_test

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers tinyxmlparser.cpp Source File

tinyxmlparser.cpp

00001 /*
00002 www.sourceforge.net/projects/tinyxml
00003 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
00004 
00005 This software is provided 'as-is', without any express or implied 
00006 warranty. In no event will the authors be held liable for any 
00007 damages arising from the use of this software.
00008 
00009 Permission is granted to anyone to use this software for any 
00010 purpose, including commercial applications, and to alter it and 
00011 redistribute it freely, subject to the following restrictions:
00012 
00013 1. The origin of this software must not be misrepresented; you must 
00014 not claim that you wrote the original software. If you use this
00015 software in a product, an acknowledgment in the product documentation
00016 would be appreciated but is not required.
00017 
00018 2. Altered source versions must be plainly marked as such, and 
00019 must not be misrepresented as being the original software.
00020 
00021 3. This notice may not be removed or altered from any source 
00022 distribution.
00023 */
00024 
00025 #include <ctype.h>
00026 #include <stddef.h>
00027 
00028 #include "tinyxml.h"
00029 
00030 //#define DEBUG_PARSER
00031 #if defined( DEBUG_PARSER )
00032 #    if defined( DEBUG ) && defined( _MSC_VER )
00033 #        include <windows.h>
00034 #        define TIXML_LOG OutputDebugString
00035 #    else
00036 #        define TIXML_LOG printf
00037 #    endif
00038 #endif
00039 
00040 // Note tha "PutString" hardcodes the same list. This
00041 // is less flexible than it appears. Changing the entries
00042 // or order will break putstring.    
00043 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 
00044 {
00045     { "&amp;",  5, '&' },
00046     { "&lt;",   4, '<' },
00047     { "&gt;",   4, '>' },
00048     { "&quot;", 6, '\"' },
00049     { "&apos;", 6, '\'' }
00050 };
00051 
00052 // Bunch of unicode info at:
00053 //        http://www.unicode.org/faq/utf_bom.html
00054 // Including the basic of this table, which determines the #bytes in the
00055 // sequence from the lead byte. 1 placed for invalid sequences --
00056 // although the result will be junk, pass it through as much as possible.
00057 // Beware of the non-characters in UTF-8:    
00058 //                ef bb bf (Microsoft "lead bytes")
00059 //                ef bf be
00060 //                ef bf bf 
00061 
// The byte sequence EF BB BF, one constant per byte. The parser checks for
// these three bytes, in this order, to recognise the Microsoft UTF-8 lead
// bytes (byte order mark).
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;   // first byte
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;   // second byte
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;   // third byte
00065 
00066 const int TiXmlBase::utf8ByteTable[256] = 
00067 {
00068     //    0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
00069         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x00
00070         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x10
00071         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x20
00072         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x30
00073         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x40
00074         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x50
00075         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x60
00076         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x70    End of ASCII range
00077         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x80 0x80 to 0xc1 invalid
00078         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0x90 
00079         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0xa0 
00080         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    // 0xb0 
00081         1,    1,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    // 0xc0 0xc2 to 0xdf 2 byte
00082         2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    // 0xd0
00083         3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    // 0xe0 0xe0 to 0xef 3 byte
00084         4,    4,    4,    4,    4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1    // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
00085 };
00086 
00087 
00088 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
00089 {
00090     const unsigned long BYTE_MASK = 0xBF;
00091     const unsigned long BYTE_MARK = 0x80;
00092     const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00093 
00094     if (input < 0x80) 
00095         *length = 1;
00096     else if ( input < 0x800 )
00097         *length = 2;
00098     else if ( input < 0x10000 )
00099         *length = 3;
00100     else if ( input < 0x200000 )
00101         *length = 4;
00102     else
00103         { *length = 0; return; }    // This code won't covert this correctly anyway.
00104 
00105     output += *length;
00106 
00107     // Scary scary fall throughs.
00108     switch (*length) 
00109     {
00110         case 4:
00111             --output; 
00112             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00113             input >>= 6;
00114         case 3:
00115             --output; 
00116             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00117             input >>= 6;
00118         case 2:
00119             --output; 
00120             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00121             input >>= 6;
00122         case 1:
00123             --output; 
00124             *output = (char)(input | FIRST_BYTE_MARK[*length]);
00125     }
00126 }
00127 
00128 
00129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00130 {
00131     // This will only work for low-ascii, everything else is assumed to be a valid
00132     // letter. I'm not sure this is the best approach, but it is quite tricky trying
00133     // to figure out alhabetical vs. not across encoding. So take a very 
00134     // conservative approach.
00135 
00136 //    if ( encoding == TIXML_ENCODING_UTF8 )
00137 //    {
00138         if ( anyByte < 127 )
00139             return isalpha( anyByte );
00140         else
00141             return 1;    // What else to do? The unicode set is huge...get the english ones right.
00142 //    }
00143 //    else
00144 //    {
00145 //        return isalpha( anyByte );
00146 //    }
00147 }
00148 
00149 
00150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00151 {
00152     // This will only work for low-ascii, everything else is assumed to be a valid
00153     // letter. I'm not sure this is the best approach, but it is quite tricky trying
00154     // to figure out alhabetical vs. not across encoding. So take a very 
00155     // conservative approach.
00156 
00157 //    if ( encoding == TIXML_ENCODING_UTF8 )
00158 //    {
00159         if ( anyByte < 127 )
00160             return isalnum( anyByte );
00161         else
00162             return 1;    // What else to do? The unicode set is huge...get the english ones right.
00163 //    }
00164 //    else
00165 //    {
00166 //        return isalnum( anyByte );
00167 //    }
00168 }
00169 
00170 
00171 class TiXmlParsingData
00172 {
00173     friend class TiXmlDocument;
00174   public:
00175     void Stamp( const char* now, TiXmlEncoding encoding );
00176 
00177     const TiXmlCursor& Cursor()    { return cursor; }
00178 
00179   private:
00180     // Only used by the document!
00181     TiXmlParsingData( const char* start, int _tabsize, int row, int col )
00182     {
00183         assert( start );
00184         stamp = start;
00185         tabsize = _tabsize;
00186         cursor.row = row;
00187         cursor.col = col;
00188     }
00189 
00190     TiXmlCursor        cursor;
00191     const char*        stamp;
00192     int                tabsize;
00193 };
00194 
00195 
00196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
00197 {
00198     assert( now );
00199 
00200     // Do nothing if the tabsize is 0.
00201     if ( tabsize < 1 )
00202     {
00203         return;
00204     }
00205 
00206     // Get the current row, column.
00207     int row = cursor.row;
00208     int col = cursor.col;
00209     const char* p = stamp;
00210     assert( p );
00211 
00212     while ( p < now )
00213     {
00214         // Treat p as unsigned, so we have a happy compiler.
00215         const unsigned char* pU = (const unsigned char*)p;
00216 
00217         // Code contributed by Fletcher Dunn: (modified by lee)
00218         switch (*pU) {
00219             case 0:
00220                 // We *should* never get here, but in case we do, don't
00221                 // advance past the terminating null character, ever
00222                 return;
00223 
00224             case '\r':
00225                 // bump down to the next line
00226                 ++row;
00227                 col = 0;                
00228                 // Eat the character
00229                 ++p;
00230 
00231                 // Check for \r\n sequence, and treat this as a single character
00232                 if (*p == '\n') {
00233                     ++p;
00234                 }
00235                 break;
00236 
00237             case '\n':
00238                 // bump down to the next line
00239                 ++row;
00240                 col = 0;
00241 
00242                 // Eat the character
00243                 ++p;
00244 
00245                 // Check for \n\r sequence, and treat this as a single
00246                 // character.  (Yes, this bizarre thing does occur still
00247                 // on some arcane platforms...)
00248                 if (*p == '\r') {
00249                     ++p;
00250                 }
00251                 break;
00252 
00253             case '\t':
00254                 // Eat the character
00255                 ++p;
00256 
00257                 // Skip to next tab stop
00258                 col = (col / tabsize + 1) * tabsize;
00259                 break;
00260 
00261             case TIXML_UTF_LEAD_0:
00262                 if ( encoding == TIXML_ENCODING_UTF8 )
00263                 {
00264                     if ( *(p+1) && *(p+2) )
00265                     {
00266                         // In these cases, don't advance the column. These are
00267                         // 0-width spaces.
00268                         if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
00269                             p += 3;    
00270                         else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
00271                             p += 3;    
00272                         else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
00273                             p += 3;    
00274                         else
00275                             { p +=3; ++col; }    // A normal character.
00276                     }
00277                 }
00278                 else
00279                 {
00280                     ++p;
00281                     ++col;
00282                 }
00283                 break;
00284 
00285             default:
00286                 if ( encoding == TIXML_ENCODING_UTF8 )
00287                 {
00288                     // Eat the 1 to 4 byte utf8 character.
00289                     int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
00290                     if ( step == 0 )
00291                         step = 1;        // Error case from bad encoding, but handle gracefully.
00292                     p += step;
00293 
00294                     // Just advance one column, of course.
00295                     ++col;
00296                 }
00297                 else
00298                 {
00299                     ++p;
00300                     ++col;
00301                 }
00302                 break;
00303         }
00304     }
00305     cursor.row = row;
00306     cursor.col = col;
00307     assert( cursor.row >= -1 );
00308     assert( cursor.col >= -1 );
00309     stamp = p;
00310     assert( stamp );
00311 }
00312 
00313 
00314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
00315 {
00316     if ( !p || !*p )
00317     {
00318         return 0;
00319     }
00320     if ( encoding == TIXML_ENCODING_UTF8 )
00321     {
00322         while ( *p )
00323         {
00324             const unsigned char* pU = (const unsigned char*)p;
00325             
00326             // Skip the stupid Microsoft UTF-8 Byte order marks
00327             if (    *(pU+0)==TIXML_UTF_LEAD_0
00328                  && *(pU+1)==TIXML_UTF_LEAD_1 
00329                  && *(pU+2)==TIXML_UTF_LEAD_2 )
00330             {
00331                 p += 3;
00332                 continue;
00333             }
00334             else if(*(pU+0)==TIXML_UTF_LEAD_0
00335                  && *(pU+1)==0xbfU
00336                  && *(pU+2)==0xbeU )
00337             {
00338                 p += 3;
00339                 continue;
00340             }
00341             else if(*(pU+0)==TIXML_UTF_LEAD_0
00342                  && *(pU+1)==0xbfU
00343                  && *(pU+2)==0xbfU )
00344             {
00345                 p += 3;
00346                 continue;
00347             }
00348 
00349             if ( IsWhiteSpace( *p ) )        // Still using old rules for white space.
00350                 ++p;
00351             else
00352                 break;
00353         }
00354     }
00355     else
00356     {
00357         while ( *p && IsWhiteSpace( *p ) )
00358             ++p;
00359     }
00360 
00361     return p;
00362 }
00363 
00364 #ifdef TIXML_USE_STL
00365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
00366 {
00367     for( ;; )
00368     {
00369         if ( !in->good() ) return false;
00370 
00371         int c = in->peek();
00372         // At this scope, we can't get to a document. So fail silently.
00373         if ( !IsWhiteSpace( c ) || c <= 0 )
00374             return true;
00375 
00376         *tag += (char) in->get();
00377     }
00378 }
00379 
00380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
00381 {
00382     //assert( character > 0 && character < 128 );    // else it won't work in utf-8
00383     while ( in->good() )
00384     {
00385         int c = in->peek();
00386         if ( c == character )
00387             return true;
00388         if ( c <= 0 )        // Silent failure: can't get document at this scope
00389             return false;
00390 
00391         in->get();
00392         *tag += (char) c;
00393     }
00394     return false;
00395 }
00396 #endif
00397 
00398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
00399 // "assign" optimization removes over 10% of the execution time.
00400 //
00401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
00402 {
00403     // Oddly, not supported on some comilers,
00404     //name->clear();
00405     // So use this:
00406     *name = "";
00407     assert( p );
00408 
00409     // Names start with letters or underscores.
00410     // Of course, in unicode, tinyxml has no idea what a letter *is*. The
00411     // algorithm is generous.
00412     //
00413     // After that, they can be letters, underscores, numbers,
00414     // hyphens, or colons. (Colons are valid ony for namespaces,
00415     // but tinyxml can't tell namespaces from names.)
00416     if (    p && *p 
00417          && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
00418     {
00419         const char* start = p;
00420         while(        p && *p
00421                 &&    (        IsAlphaNum( (unsigned char ) *p, encoding ) 
00422                          || *p == '_'
00423                          || *p == '-'
00424                          || *p == '.'
00425                          || *p == ':' ) )
00426         {
00427             //(*name) += *p; // expensive
00428             ++p;
00429         }
00430         if ( p-start > 0 ) {
00431             name->assign( start, p-start );
00432         }
00433         return p;
00434     }
00435     return 0;
00436 }
00437 
00438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
00439 {
00440     // Presume an entity, and pull it out.
00441     TIXML_STRING ent;
00442     int i;
00443     *length = 0;
00444 
00445     if ( *(p+1) && *(p+1) == '#' && *(p+2) )
00446     {
00447         unsigned long ucs = 0;
00448         ptrdiff_t delta = 0;
00449         unsigned mult = 1;
00450 
00451         if ( *(p+2) == 'x' )
00452         {
00453             // Hexadecimal.
00454             if ( !*(p+3) ) return 0;
00455 
00456             const char* q = p+3;
00457             q = strchr( q, ';' );
00458 
00459             if ( !q || !*q ) return 0;
00460 
00461             delta = q-p;
00462             --q;
00463 
00464             while ( *q != 'x' )
00465             {
00466                 if ( *q >= '0' && *q <= '9' )
00467                     ucs += mult * (*q - '0');
00468                 else if ( *q >= 'a' && *q <= 'f' )
00469                     ucs += mult * (*q - 'a' + 10);
00470                 else if ( *q >= 'A' && *q <= 'F' )
00471                     ucs += mult * (*q - 'A' + 10 );
00472                 else 
00473                     return 0;
00474                 mult *= 16;
00475                 --q;
00476             }
00477         }
00478         else
00479         {
00480             // Decimal.
00481             if ( !*(p+2) ) return 0;
00482 
00483             const char* q = p+2;
00484             q = strchr( q, ';' );
00485 
00486             if ( !q || !*q ) return 0;
00487 
00488             delta = q-p;
00489             --q;
00490 
00491             while ( *q != '#' )
00492             {
00493                 if ( *q >= '0' && *q <= '9' )
00494                     ucs += mult * (*q - '0');
00495                 else 
00496                     return 0;
00497                 mult *= 10;
00498                 --q;
00499             }
00500         }
00501         if ( encoding == TIXML_ENCODING_UTF8 )
00502         {
00503             // convert the UCS to UTF-8
00504             ConvertUTF32ToUTF8( ucs, value, length );
00505         }
00506         else
00507         {
00508             *value = (char)ucs;
00509             *length = 1;
00510         }
00511         return p + delta + 1;
00512     }
00513 
00514     // Now try to match it.
00515     for( i=0; i<NUM_ENTITY; ++i )
00516     {
00517         if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
00518         {
00519             assert( strlen( entity[i].str ) == entity[i].strLength );
00520             *value = entity[i].chr;
00521             *length = 1;
00522             return ( p + entity[i].strLength );
00523         }
00524     }
00525 
00526     // So it wasn't an entity, its unrecognized, or something like that.
00527     *value = *p;    // Don't put back the last one, since we return it!
00528     //*length = 1;    // Leave unrecognized entities - this doesn't really work.
00529                     // Just writes strange XML.
00530     return p+1;
00531 }
00532 
00533 
00534 bool TiXmlBase::StringEqual( const char* p,
00535                              const char* tag,
00536                              bool ignoreCase,
00537                              TiXmlEncoding encoding )
00538 {
00539     assert( p );
00540     assert( tag );
00541     if ( !p || !*p )
00542     {
00543         assert( 0 );
00544         return false;
00545     }
00546 
00547     const char* q = p;
00548 
00549     if ( ignoreCase )
00550     {
00551         while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
00552         {
00553             ++q;
00554             ++tag;
00555         }
00556 
00557         if ( *tag == 0 )
00558             return true;
00559     }
00560     else
00561     {
00562         while ( *q && *tag && *q == *tag )
00563         {
00564             ++q;
00565             ++tag;
00566         }
00567 
00568         if ( *tag == 0 )        // Have we found the end of the tag, and everything equal?
00569             return true;
00570     }
00571     return false;
00572 }
00573 
00574 const char* TiXmlBase::ReadText(    const char* p, 
00575                                     TIXML_STRING * text, 
00576                                     bool trimWhiteSpace, 
00577                                     const char* endTag, 
00578                                     bool caseInsensitive,
00579                                     TiXmlEncoding encoding )
00580 {
00581     *text = "";
00582     if (    !trimWhiteSpace            // certain tags always keep whitespace
00583          || !condenseWhiteSpace )    // if true, whitespace is always kept
00584     {
00585         // Keep all the white space.
00586         while (       p && *p
00587                 && !StringEqual( p, endTag, caseInsensitive, encoding )
00588               )
00589         {
00590             int len;
00591             char cArr[4] = { 0, 0, 0, 0 };
00592             p = GetChar( p, cArr, &len, encoding );
00593             text->append( cArr, len );
00594         }
00595     }
00596     else
00597     {
00598         bool whitespace = false;
00599 
00600         // Remove leading white space:
00601         p = SkipWhiteSpace( p, encoding );
00602         while (       p && *p
00603                 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
00604         {
00605             if ( *p == '\r' || *p == '\n' )
00606             {
00607                 whitespace = true;
00608                 ++p;
00609             }
00610             else if ( IsWhiteSpace( *p ) )
00611             {
00612                 whitespace = true;
00613                 ++p;
00614             }
00615             else
00616             {
00617                 // If we've found whitespace, add it before the
00618                 // new character. Any whitespace just becomes a space.
00619                 if ( whitespace )
00620                 {
00621                     (*text) += ' ';
00622                     whitespace = false;
00623                 }
00624                 int len;
00625                 char cArr[4] = { 0, 0, 0, 0 };
00626                 p = GetChar( p, cArr, &len, encoding );
00627                 if ( len == 1 )
00628                     (*text) += cArr[0];    // more efficient
00629                 else
00630                     text->append( cArr, len );
00631             }
00632         }
00633     }
00634     if ( p && *p ) 
00635         p += strlen( endTag );
00636     return p;
00637 }
00638 
00639 #ifdef TIXML_USE_STL
00640 
00641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
00642 {
00643     // The basic issue with a document is that we don't know what we're
00644     // streaming. Read something presumed to be a tag (and hope), then
00645     // identify it, and call the appropriate stream method on the tag.
00646     //
00647     // This "pre-streaming" will never read the closing ">" so the
00648     // sub-tag can orient itself.
00649 
00650     if ( !StreamTo( in, '<', tag ) ) 
00651     {
00652         SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00653         return;
00654     }
00655 
00656     while ( in->good() )
00657     {
00658         int tagIndex = (int) tag->length();
00659         while ( in->good() && in->peek() != '>' )
00660         {
00661             int c = in->get();
00662             if ( c <= 0 )
00663             {
00664                 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00665                 break;
00666             }
00667             (*tag) += (char) c;
00668         }
00669 
00670         if ( in->good() )
00671         {
00672             // We now have something we presume to be a node of 
00673             // some sort. Identify it, and call the node to
00674             // continue streaming.
00675             TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
00676 
00677             if ( node )
00678             {
00679                 node->StreamIn( in, tag );
00680                 bool isElement = node->ToElement() != 0;
00681                 delete node;
00682                 node = 0;
00683 
00684                 // If this is the root element, we're done. Parsing will be
00685                 // done by the >> operator.
00686                 if ( isElement )
00687                 {
00688                     return;
00689                 }
00690             }
00691             else
00692             {
00693                 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00694                 return;
00695             }
00696         }
00697     }
00698     // We should have returned sooner.
00699     SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00700 }
00701 
00702 #endif
00703 
00704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
00705 {
00706     ClearError();
00707 
00708     // Parse away, at the document level. Since a document
00709     // contains nothing but other tags, most of what happens
00710     // here is skipping white space.
00711     if ( !p || !*p )
00712     {
00713         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00714         return 0;
00715     }
00716 
00717     // Note that, for a document, this needs to come
00718     // before the while space skip, so that parsing
00719     // starts from the pointer we are given.
00720     location.Clear();
00721     if ( prevData )
00722     {
00723         location.row = prevData->cursor.row;
00724         location.col = prevData->cursor.col;
00725     }
00726     else
00727     {
00728         location.row = 0;
00729         location.col = 0;
00730     }
00731     TiXmlParsingData data( p, TabSize(), location.row, location.col );
00732     location = data.Cursor();
00733 
00734     if ( encoding == TIXML_ENCODING_UNKNOWN )
00735     {
00736         // Check for the Microsoft UTF-8 lead bytes.
00737         const unsigned char* pU = (const unsigned char*)p;
00738         if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
00739              && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
00740              && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
00741         {
00742             encoding = TIXML_ENCODING_UTF8;
00743             useMicrosoftBOM = true;
00744         }
00745     }
00746 
00747     p = SkipWhiteSpace( p, encoding );
00748     if ( !p )
00749     {
00750         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00751         return 0;
00752     }
00753 
00754     while ( p && *p )
00755     {
00756         TiXmlNode* node = Identify( p, encoding );
00757         if ( node )
00758         {
00759             p = node->Parse( p, &data, encoding );
00760             LinkEndChild( node );
00761         }
00762         else
00763         {
00764             break;
00765         }
00766 
00767         // Did we get encoding info?
00768         if (    encoding == TIXML_ENCODING_UNKNOWN
00769              && node->ToDeclaration() )
00770         {
00771             TiXmlDeclaration* dec = node->ToDeclaration();
00772             const char* enc = dec->Encoding();
00773             assert( enc );
00774 
00775             if ( *enc == 0 )
00776                 encoding = TIXML_ENCODING_UTF8;
00777             else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
00778                 encoding = TIXML_ENCODING_UTF8;
00779             else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
00780                 encoding = TIXML_ENCODING_UTF8;    // incorrect, but be nice
00781             else 
00782                 encoding = TIXML_ENCODING_LEGACY;
00783         }
00784 
00785         p = SkipWhiteSpace( p, encoding );
00786     }
00787 
00788     // Was this empty?
00789     if ( !firstChild ) {
00790         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
00791         return 0;
00792     }
00793 
00794     // All is well.
00795     return p;
00796 }
00797 
00798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
00799 {    
00800     // The first error in a chain is more accurate - don't set again!
00801     if ( error )
00802         return;
00803 
00804     assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
00805     error   = true;
00806     errorId = err;
00807     errorDesc = errorString[ errorId ];
00808 
00809     errorLocation.Clear();
00810     if ( pError && data )
00811     {
00812         data->Stamp( pError, encoding );
00813         errorLocation = data->Cursor();
00814     }
00815 }
00816 
00817 
00818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
00819 {
00820     TiXmlNode* returnNode = 0;
00821 
00822     p = SkipWhiteSpace( p, encoding );
00823     if( !p || !*p || *p != '<' )
00824     {
00825         return 0;
00826     }
00827 
00828     p = SkipWhiteSpace( p, encoding );
00829 
00830     if ( !p || !*p )
00831     {
00832         return 0;
00833     }
00834 
00835     // What is this thing? 
00836     // - Elements start with a letter or underscore, but xml is reserved.
00837     // - Comments: <!--
00838     // - Decleration: <?xml
00839     // - Everthing else is unknown to tinyxml.
00840     //
00841 
00842     const char* xmlHeader = { "<?xml" };
00843     const char* commentHeader = { "<!--" };
00844     const char* dtdHeader = { "<!" };
00845     const char* cdataHeader = { "<![CDATA[" };
00846 
00847     if ( StringEqual( p, xmlHeader, true, encoding ) )
00848     {
00849         #ifdef DEBUG_PARSER
00850             TIXML_LOG( "XML parsing Declaration\n" );
00851         #endif
00852         returnNode = new TiXmlDeclaration();
00853     }
00854     else if ( StringEqual( p, commentHeader, false, encoding ) )
00855     {
00856         #ifdef DEBUG_PARSER
00857             TIXML_LOG( "XML parsing Comment\n" );
00858         #endif
00859         returnNode = new TiXmlComment();
00860     }
00861     else if ( StringEqual( p, cdataHeader, false, encoding ) )
00862     {
00863         #ifdef DEBUG_PARSER
00864             TIXML_LOG( "XML parsing CDATA\n" );
00865         #endif
00866         TiXmlText* text = new TiXmlText( "" );
00867         text->SetCDATA( true );
00868         returnNode = text;
00869     }
00870     else if ( StringEqual( p, dtdHeader, false, encoding ) )
00871     {
00872         #ifdef DEBUG_PARSER
00873             TIXML_LOG( "XML parsing Unknown(1)\n" );
00874         #endif
00875         returnNode = new TiXmlUnknown();
00876     }
00877     else if (    IsAlpha( *(p+1), encoding )
00878               || *(p+1) == '_' )
00879     {
00880         #ifdef DEBUG_PARSER
00881             TIXML_LOG( "XML parsing Element\n" );
00882         #endif
00883         returnNode = new TiXmlElement( "" );
00884     }
00885     else
00886     {
00887         #ifdef DEBUG_PARSER
00888             TIXML_LOG( "XML parsing Unknown(2)\n" );
00889         #endif
00890         returnNode = new TiXmlUnknown();
00891     }
00892 
00893     if ( returnNode )
00894     {
00895         // Set the parent, so it can report errors
00896         returnNode->parent = this;
00897     }
00898     return returnNode;
00899 }
00900 
00901 #ifdef TIXML_USE_STL
00902 
// Streams the remainder of this element from 'in', appending every consumed
// character to 'tag'.
//
// The caller has already placed the start of the element in 'tag'. This
// function first reads up to the '>' that ends the start tag. If that tag is
// self-closing ("/>") the element is complete. Otherwise it keeps streaming
// text and child nodes until a closing tag has been read.
//
// Child nodes are only used to drive the streaming: each one is created by
// Identify(), asked to stream itself into 'tag', and then deleted. Nothing is
// linked into the tree here.
//
// When the stream yields a character value <= 0, TIXML_ERROR_EMBEDDED_NULL is
// recorded on the owning document (if there is one) and the function returns.
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
{
    // We're called with some amount of pre-parsing. That is, some of "this"
    // element is in "tag". Go ahead and stream to the closing ">"
    while( in->good() )
    {
        int c = in->get();
        if ( c <= 0 )
        {
            TiXmlDocument* document = GetDocument();
            if ( document )
                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
            return;
        }
        (*tag) += (char) c ;

        if ( c == '>' )
            break;
    }

    // Fewer than three characters cannot hold a start tag; nothing more to do.
    if ( tag->length() < 3 ) return;

    // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
    // If not, identify and stream.

    if (    tag->at( tag->length() - 1 ) == '>'
         && tag->at( tag->length() - 2 ) == '/' )
    {
        // All good!
        return;
    }
    else if ( tag->at( tag->length() - 1 ) == '>' )
    {
        // There is more. Could be:
        //        text
        //        cdata text (which looks like another node)
        //        closing tag
        //        another node.
        for ( ;; )
        {
            StreamWhiteSpace( in, tag );

            // Do we have text?
            if ( in->good() && in->peek() != '<' )
            {
                // Yep, text.
                // The text node is a stack temporary: it only copies characters
                // into 'tag' and is discarded at the end of this scope.
                TiXmlText text( "" );
                text.StreamIn( in, tag );

                // What follows text is a closing tag or another node.
                // Go around again and figure it out.
                continue;
            }

            // We now have either a closing tag...or another node.
            // We should be at a "<", regardless.
            if ( !in->good() ) return;
            assert( in->peek() == '<' );
            // Remember where this tag starts inside 'tag' so it can be
            // handed to Identify() below.
            int tagIndex = (int) tag->length();

            bool closingTag = false;
            bool firstCharFound = false;

            // Copy the tag's characters up to, but not including, its '>'.
            // The '>' is only peeked here; it is consumed either below (for
            // a closing tag) or by the child node's own StreamIn.
            for( ;; )
            {
                if ( !in->good() )
                    return;

                int c = in->peek();
                if ( c <= 0 )
                {
                    TiXmlDocument* document = GetDocument();
                    if ( document )
                        document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                    return;
                }

                if ( c == '>' )
                    break;

                *tag += (char) c;
                in->get();    // consume the character that was just peeked

                // Early out if we find the CDATA id.
                // Only the last nine characters of 'tag' are compared, and
                // only when a '[' has just been appended.
                if ( c == '[' && tag->size() >= 9 )
                {
                    size_t len = tag->size();
                    const char* start = tag->c_str() + len - 9;
                    if ( strcmp( start, "<![CDATA[" ) == 0 ) {
                        assert( !closingTag );
                        break;
                    }
                }

                // The first character that is neither '<' nor white space
                // decides whether this is a closing tag ("</...").
                if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
                {
                    firstCharFound = true;
                    if ( c == '/' )
                        closingTag = true;
                }
            }
            // If it was a closing tag, then read in the closing '>' to clean up the input stream.
            // If it was not, the streaming will be done by the tag.
            if ( closingTag )
            {
                if ( !in->good() )
                    return;

                int c = in->get();
                if ( c <= 0 )
                {
                    TiXmlDocument* document = GetDocument();
                    if ( document )
                        document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                    return;
                }
                assert( c == '>' );
                *tag += (char) c;

                // We are done, once we've found our closing tag.
                return;
            }
            else
            {
                // If not a closing tag, id it, and stream.
                // 'tagloc' points at the part of 'tag' collected by the loop above.
                const char* tagloc = tag->c_str() + tagIndex;
                TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
                if ( !node )
                    return;
                node->StreamIn( in, tag );
                // The node was needed only to stream its text into 'tag'.
                delete node;
                node = 0;

                // No return: go around from the beginning: text, closing tag, or node.
            }
        }
    }
}
01041 #endif
01042 
01043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01044 {
01045     p = SkipWhiteSpace( p, encoding );
01046     TiXmlDocument* document = GetDocument();
01047 
01048     if ( !p || !*p )
01049     {
01050         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
01051         return 0;
01052     }
01053 
01054     if ( data )
01055     {
01056         data->Stamp( p, encoding );
01057         location = data->Cursor();
01058     }
01059 
01060     if ( *p != '<' )
01061     {
01062         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
01063         return 0;
01064     }
01065 
01066     p = SkipWhiteSpace( p+1, encoding );
01067 
01068     // Read the name.
01069     const char* pErr = p;
01070 
01071     p = ReadName( p, &value, encoding );
01072     if ( !p || !*p )
01073     {
01074         if ( document )    document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
01075         return 0;
01076     }
01077 
01078     TIXML_STRING endTag ("</");
01079     endTag += value;
01080 
01081     // Check for and read attributes. Also look for an empty
01082     // tag or an end tag.
01083     while ( p && *p )
01084     {
01085         pErr = p;
01086         p = SkipWhiteSpace( p, encoding );
01087         if ( !p || !*p )
01088         {
01089             if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01090             return 0;
01091         }
01092         if ( *p == '/' )
01093         {
01094             ++p;
01095             // Empty tag.
01096             if ( *p  != '>' )
01097             {
01098                 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );        
01099                 return 0;
01100             }
01101             return (p+1);
01102         }
01103         else if ( *p == '>' )
01104         {
01105             // Done with attributes (if there were any.)
01106             // Read the value -- which can include other
01107             // elements -- read the end tag, and return.
01108             ++p;
01109             p = ReadValue( p, data, encoding );        // Note this is an Element method, and will set the error if one happens.
01110             if ( !p || !*p ) {
01111                 // We were looking for the end tag, but found nothing.
01112                 // Fix for [ 1663758 ] Failure to report error on bad XML
01113                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01114                 return 0;
01115             }
01116 
01117             // We should find the end tag now
01118             // note that:
01119             // </foo > and
01120             // </foo> 
01121             // are both valid end tags.
01122             if ( StringEqual( p, endTag.c_str(), false, encoding ) )
01123             {
01124                 p += endTag.length();
01125                 p = SkipWhiteSpace( p, encoding );
01126                 if ( p && *p && *p == '>' ) {
01127                     ++p;
01128                     return p;
01129                 }
01130                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01131                 return 0;
01132             }
01133             else
01134             {
01135                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01136                 return 0;
01137             }
01138         }
01139         else
01140         {
01141             // Try to read an attribute:
01142             TiXmlAttribute* attrib = new TiXmlAttribute();
01143             if ( !attrib )
01144             {
01145                 return 0;
01146             }
01147 
01148             attrib->SetDocument( document );
01149             pErr = p;
01150             p = attrib->Parse( p, data, encoding );
01151 
01152             if ( !p || !*p )
01153             {
01154                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01155                 delete attrib;
01156                 return 0;
01157             }
01158 
01159             // Handle the strange case of double attributes:
01160             #ifdef TIXML_USE_STL
01161             TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
01162             #else
01163             TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
01164             #endif
01165             if ( node )
01166             {
01167                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01168                 delete attrib;
01169                 return 0;
01170             }
01171 
01172             attributeSet.Add( attrib );
01173         }
01174     }
01175     return p;
01176 }
01177 
01178 
01179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01180 {
01181     TiXmlDocument* document = GetDocument();
01182 
01183     // Read in text and elements in any order.
01184     const char* pWithWhiteSpace = p;
01185     p = SkipWhiteSpace( p, encoding );
01186 
01187     while ( p && *p )
01188     {
01189         if ( *p != '<' )
01190         {
01191             // Take what we have, make a text element.
01192             TiXmlText* textNode = new TiXmlText( "" );
01193 
01194             if ( !textNode )
01195             {
01196                 return 0;
01197             }
01198 
01199             if ( TiXmlBase::IsWhiteSpaceCondensed() )
01200             {
01201                 p = textNode->Parse( p, data, encoding );
01202             }
01203             else
01204             {
01205                 // Special case: we want to keep the white space
01206                 // so that leading spaces aren't removed.
01207                 p = textNode->Parse( pWithWhiteSpace, data, encoding );
01208             }
01209 
01210             if ( !textNode->Blank() )
01211                 LinkEndChild( textNode );
01212             else
01213                 delete textNode;
01214         } 
01215         else 
01216         {
01217             // We hit a '<'
01218             // Have we hit a new element or an end tag? This could also be
01219             // a TiXmlText in the "CDATA" style.
01220             if ( StringEqual( p, "</", false, encoding ) )
01221             {
01222                 return p;
01223             }
01224             else
01225             {
01226                 TiXmlNode* node = Identify( p, encoding );
01227                 if ( node )
01228                 {
01229                     p = node->Parse( p, data, encoding );
01230                     LinkEndChild( node );
01231                 }                
01232                 else
01233                 {
01234                     return 0;
01235                 }
01236             }
01237         }
01238         pWithWhiteSpace = p;
01239         p = SkipWhiteSpace( p, encoding );
01240     }
01241 
01242     if ( !p )
01243     {
01244         if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
01245     }    
01246     return p;
01247 }
01248 
01249 
01250 #ifdef TIXML_USE_STL
01251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
01252 {
01253     while ( in->good() )
01254     {
01255         int c = in->get();    
01256         if ( c <= 0 )
01257         {
01258             TiXmlDocument* document = GetDocument();
01259             if ( document )
01260                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01261             return;
01262         }
01263         (*tag) += (char) c;
01264 
01265         if ( c == '>' )
01266         {
01267             // All is well.
01268             return;        
01269         }
01270     }
01271 }
01272 #endif
01273 
01274 
01275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01276 {
01277     TiXmlDocument* document = GetDocument();
01278     p = SkipWhiteSpace( p, encoding );
01279 
01280     if ( data )
01281     {
01282         data->Stamp( p, encoding );
01283         location = data->Cursor();
01284     }
01285     if ( !p || !*p || *p != '<' )
01286     {
01287         if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
01288         return 0;
01289     }
01290     ++p;
01291     value = "";
01292 
01293     while ( p && *p && *p != '>' )
01294     {
01295         value += *p;
01296         ++p;
01297     }
01298 
01299     if ( !p )
01300     {
01301         if ( document )    document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
01302     }
01303     if ( *p == '>' )
01304         return p+1;
01305     return p;
01306 }
01307 
01308 #ifdef TIXML_USE_STL
01309 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
01310 {
01311     while ( in->good() )
01312     {
01313         int c = in->get();    
01314         if ( c <= 0 )
01315         {
01316             TiXmlDocument* document = GetDocument();
01317             if ( document )
01318                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01319             return;
01320         }
01321 
01322         (*tag) += (char) c;
01323 
01324         if ( c == '>' 
01325              && tag->at( tag->length() - 2 ) == '-'
01326              && tag->at( tag->length() - 3 ) == '-' )
01327         {
01328             // All is well.
01329             return;        
01330         }
01331     }
01332 }
01333 #endif
01334 
01335 
01336 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01337 {
01338     TiXmlDocument* document = GetDocument();
01339     value = "";
01340 
01341     p = SkipWhiteSpace( p, encoding );
01342 
01343     if ( data )
01344     {
01345         data->Stamp( p, encoding );
01346         location = data->Cursor();
01347     }
01348     const char* startTag = "<!--";
01349     const char* endTag   = "-->";
01350 
01351     if ( !StringEqual( p, startTag, false, encoding ) )
01352     {
01353         document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
01354         return 0;
01355     }
01356     p += strlen( startTag );
01357 
01358     // [ 1475201 ] TinyXML parses entities in comments
01359     // Oops - ReadText doesn't work, because we don't want to parse the entities.
01360     // p = ReadText( p, &value, false, endTag, false, encoding );
01361     //
01362     // from the XML spec:
01363     /*
01364      [Definition: Comments may appear anywhere in a document outside other markup; in addition, 
01365                   they may appear within the document type declaration at places allowed by the grammar. 
01366                   They are not part of the document's character data; an XML processor MAY, but need not, 
01367                   make it possible for an application to retrieve the text of comments. For compatibility, 
01368                   the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 
01369                   references MUST NOT be recognized within comments.
01370 
01371                   An example of a comment:
01372 
01373                   <!-- declarations for <head> & <body> -->
01374     */
01375 
01376     value = "";
01377     // Keep all the white space.
01378     while (    p && *p && !StringEqual( p, endTag, false, encoding ) )
01379     {
01380         value.append( p, 1 );
01381         ++p;
01382     }
01383     if ( p && *p ) 
01384         p += strlen( endTag );
01385 
01386     return p;
01387 }
01388 
01389 
01390 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01391 {
01392     p = SkipWhiteSpace( p, encoding );
01393     if ( !p || !*p ) return 0;
01394 
01395     if ( data )
01396     {
01397         data->Stamp( p, encoding );
01398         location = data->Cursor();
01399     }
01400     // Read the name, the '=' and the value.
01401     const char* pErr = p;
01402     p = ReadName( p, &name, encoding );
01403     if ( !p || !*p )
01404     {
01405         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01406         return 0;
01407     }
01408     p = SkipWhiteSpace( p, encoding );
01409     if ( !p || !*p || *p != '=' )
01410     {
01411         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01412         return 0;
01413     }
01414 
01415     ++p;    // skip '='
01416     p = SkipWhiteSpace( p, encoding );
01417     if ( !p || !*p )
01418     {
01419         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01420         return 0;
01421     }
01422     
01423     const char* end;
01424     const char SINGLE_QUOTE = '\'';
01425     const char DOUBLE_QUOTE = '\"';
01426 
01427     if ( *p == SINGLE_QUOTE )
01428     {
01429         ++p;
01430         end = "\'";        // single quote in string
01431         p = ReadText( p, &value, false, end, false, encoding );
01432     }
01433     else if ( *p == DOUBLE_QUOTE )
01434     {
01435         ++p;
01436         end = "\"";        // double quote in string
01437         p = ReadText( p, &value, false, end, false, encoding );
01438     }
01439     else
01440     {
01441         // All attribute values should be in single or double quotes.
01442         // But this is such a common error that the parser will try
01443         // its best, even without them.
01444         value = "";
01445         while (    p && *p                                            // existence
01446                 && !IsWhiteSpace( *p )                                // whitespace
01447                 && *p != '/' && *p != '>' )                            // tag end
01448         {
01449             if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
01450                 // [ 1451649 ] Attribute values with trailing quotes not handled correctly
01451                 // We did not have an opening quote but seem to have a 
01452                 // closing one. Give up and throw an error.
01453                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01454                 return 0;
01455             }
01456             value += *p;
01457             ++p;
01458         }
01459     }
01460     return p;
01461 }
01462 
01463 #ifdef TIXML_USE_STL
01464 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
01465 {
01466     while ( in->good() )
01467     {
01468         int c = in->peek();    
01469         if ( !cdata && (c == '<' ) ) 
01470         {
01471             return;
01472         }
01473         if ( c <= 0 )
01474         {
01475             TiXmlDocument* document = GetDocument();
01476             if ( document )
01477                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01478             return;
01479         }
01480 
01481         (*tag) += (char) c;
01482         in->get();    // "commits" the peek made above
01483 
01484         if ( cdata && c == '>' && tag->size() >= 3 ) {
01485             size_t len = tag->size();
01486             if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
01487                 // terminator of cdata.
01488                 return;
01489             }
01490         }    
01491     }
01492 }
01493 #endif
01494 
01495 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01496 {
01497     value = "";
01498     TiXmlDocument* document = GetDocument();
01499 
01500     if ( data )
01501     {
01502         data->Stamp( p, encoding );
01503         location = data->Cursor();
01504     }
01505 
01506     const char* const startTag = "<![CDATA[";
01507     const char* const endTag   = "]]>";
01508 
01509     if ( cdata || StringEqual( p, startTag, false, encoding ) )
01510     {
01511         cdata = true;
01512 
01513         if ( !StringEqual( p, startTag, false, encoding ) )
01514         {
01515             document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
01516             return 0;
01517         }
01518         p += strlen( startTag );
01519 
01520         // Keep all the white space, ignore the encoding, etc.
01521         while (       p && *p
01522                 && !StringEqual( p, endTag, false, encoding )
01523               )
01524         {
01525             value += *p;
01526             ++p;
01527         }
01528 
01529         TIXML_STRING dummy; 
01530         p = ReadText( p, &dummy, false, endTag, false, encoding );
01531         return p;
01532     }
01533     else
01534     {
01535         bool ignoreWhite = true;
01536 
01537         const char* end = "<";
01538         p = ReadText( p, &value, ignoreWhite, end, false, encoding );
01539         if ( p )
01540             return p-1;    // don't truncate the '<'
01541         return 0;
01542     }
01543 }
01544 
01545 #ifdef TIXML_USE_STL
01546 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
01547 {
01548     while ( in->good() )
01549     {
01550         int c = in->get();
01551         if ( c <= 0 )
01552         {
01553             TiXmlDocument* document = GetDocument();
01554             if ( document )
01555                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01556             return;
01557         }
01558         (*tag) += (char) c;
01559 
01560         if ( c == '>' )
01561         {
01562             // All is well.
01563             return;
01564         }
01565     }
01566 }
01567 #endif
01568 
01569 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
01570 {
01571     p = SkipWhiteSpace( p, _encoding );
01572     // Find the beginning, find the end, and look for
01573     // the stuff in-between.
01574     TiXmlDocument* document = GetDocument();
01575     if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
01576     {
01577         if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
01578         return 0;
01579     }
01580     if ( data )
01581     {
01582         data->Stamp( p, _encoding );
01583         location = data->Cursor();
01584     }
01585     p += 5;
01586 
01587     version = "";
01588     encoding = "";
01589     standalone = "";
01590 
01591     while ( p && *p )
01592     {
01593         if ( *p == '>' )
01594         {
01595             ++p;
01596             return p;
01597         }
01598 
01599         p = SkipWhiteSpace( p, _encoding );
01600         if ( StringEqual( p, "version", true, _encoding ) )
01601         {
01602             TiXmlAttribute attrib;
01603             p = attrib.Parse( p, data, _encoding );        
01604             version = attrib.Value();
01605         }
01606         else if ( StringEqual( p, "encoding", true, _encoding ) )
01607         {
01608             TiXmlAttribute attrib;
01609             p = attrib.Parse( p, data, _encoding );        
01610             encoding = attrib.Value();
01611         }
01612         else if ( StringEqual( p, "standalone", true, _encoding ) )
01613         {
01614             TiXmlAttribute attrib;
01615             p = attrib.Parse( p, data, _encoding );        
01616             standalone = attrib.Value();
01617         }
01618         else
01619         {
01620             // Read over whatever it is.
01621             while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
01622                 ++p;
01623         }
01624     }
01625     return 0;
01626 }
01627 
01628 bool TiXmlText::Blank() const
01629 {
01630     for ( unsigned i=0; i<value.length(); i++ )
01631         if ( !IsWhiteSpace( value[i] ) )
01632             return false;
01633     return true;
01634 }