lib/third_party/tinyxml/tinyxmlparser.cpp

   1 /*
   2   www.sourceforge.net/projects/tinyxml
   3   Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
   4
   5   This software is provided 'as-is', without any express or implied
   6   warranty. In no event will the authors be held liable for any
   7   damages arising from the use of this software.
   8
   9   Permission is granted to anyone to use this software for any
  10   purpose, including commercial applications, and to alter it and
  11   redistribute it freely, subject to the following restrictions:
  12
  13   1. The origin of this software must not be misrepresented; you must
  14   not claim that you wrote the original software. If you use this
  15   software in a product, an acknowledgment in the product documentation
  16   would be appreciated but is not required.
  17
  18   2. Altered source versions must be plainly marked as such, and
  19   must not be misrepresented as being the original software.
  20
  21   3. This notice may not be removed or altered from any source
  22   distribution.
  23 */
  24
  25 #include <ctype.h>
  26 #include <stddef.h>
  27
  28 #include "tinyxml.h"
  29
  30 //#define DEBUG_PARSER
  31 #if defined( DEBUG_PARSER )
  32 #       if defined( DEBUG ) && defined( _MSC_VER )
  33 #               include <windows.h>
  34 #               define TIXML_LOG OutputDebugString
  35 #       else
  36 #               define TIXML_LOG printf
  37 #       endif
  38 #endif
  39
// Table of the predefined XML entities recognised by GetEntity().
// Each row is { entity text, length of that text, replacement character }.
//
// Note that "PutString" hardcodes the same list, so this table
// is less flexible than it appears. Changing the entries
// or their order will break PutString.
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
{
  { "&amp;",  5, '&' },
  { "&lt;",   4, '<' },
  { "&gt;",   4, '>' },
  { "&quot;", 6, '\"' },
  { "&apos;", 6, '\'' }
};
  51
// A bunch of Unicode information is available at:
//              http://www.unicode.org/faq/utf_bom.html
// including the basis of the utf8ByteTable below, which determines the number
// of bytes in a sequence from its lead byte. A 1 is stored for invalid lead
// bytes -- although the result will be junk, pass it through as much as possible.
// Beware of these three-byte sequences in UTF-8, which the parser treats specially:
//                              ef bb bf (Microsoft "lead bytes")
//                              ef bf be
//                              ef bf bf

// The three bytes of the Microsoft UTF-8 "lead bytes" sequence (ef bb bf).
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
  65
// Length in bytes of a UTF-8 sequence, indexed by the value of its first byte.
// Bytes that cannot start a sequence are given length 1 so that scanning
// code still moves forward over them.
const int TiXmlBase::utf8ByteTable[256] =
{
  //      0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x00
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x10
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x20
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x30
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x40
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x50
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x60
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x70 End of ASCII range
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x80 0x80 to 0xc1 invalid
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x90
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xa0
  1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xb0
  1,      1,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xc0 0xc2 to 0xdf 2 byte
  2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xd0
  3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      // 0xe0 0xe0 to 0xef 3 byte
  4,      4,      4,      4,      4,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1       // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};
  86
  87
  88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
  89 {
  90   const unsigned long BYTE_MASK = 0xBF;
  91   const unsigned long BYTE_MARK = 0x80;
  92   const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  93
  94   if (input < 0x80)
  95     *length = 1;
  96   else if ( input < 0x800 )
  97     *length = 2;
  98   else if ( input < 0x10000 )
  99     *length = 3;
 100   else if ( input < 0x200000 )
 101     *length = 4;
 102   else
 103   { *length = 0; return; }        // This code won't covert this correctly anyway.
 104
 105   output += *length;
 106
 107   // Scary scary fall throughs.
 108   switch (*length)
 109   {
 110   case 4:
 111     --output;
 112     *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 113     input >>= 6;
 114   case 3:
 115     --output;
 116     *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 117     input >>= 6;
 118   case 2:
 119     --output;
 120     *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 121     input >>= 6;
 122   case 1:
 123     --output;
 124     *output = (char)(input | FIRST_BYTE_MARK[*length]);
 125   }
 126 }
 127
 128
 129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
 130 {
 131   // This will only work for low-ascii, everything else is assumed to be a valid
 132   // letter. I'm not sure this is the best approach, but it is quite tricky trying
 133   // to figure out alhabetical vs. not across encoding. So take a very
 134   // conservative approach.
 135
 136 //      if ( encoding == TIXML_ENCODING_UTF8 )
 137 //      {
 138   if ( anyByte < 127 )
 139     return isalpha( anyByte );
 140   else
 141     return 1;       // What else to do? The unicode set is huge...get the english ones right.
 142 //      }
 143 //      else
 144 //      {
 145 //              return isalpha( anyByte );
 146 //      }
 147 }
 148
 149
 150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
 151 {
 152   // This will only work for low-ascii, everything else is assumed to be a valid
 153   // letter. I'm not sure this is the best approach, but it is quite tricky trying
 154   // to figure out alhabetical vs. not across encoding. So take a very
 155   // conservative approach.
 156
 157 //      if ( encoding == TIXML_ENCODING_UTF8 )
 158 //      {
 159   if ( anyByte < 127 )
 160     return isalnum( anyByte );
 161   else
 162     return 1;       // What else to do? The unicode set is huge...get the english ones right.
 163 //      }
 164 //      else
 165 //      {
 166 //              return isalnum( anyByte );
 167 //      }
 168 }
 169
 170
 171 class TiXmlParsingData
 172 {
 173   friend class TiXmlDocument;
 174 public:
 175   void Stamp( const char* now, TiXmlEncoding encoding );
 176
 177   const TiXmlCursor& Cursor()     { return cursor; }
 178
 179 private:
 180   // Only used by the document!
 181   TiXmlParsingData( const char* start, int _tabsize, int row, int col )
 182     {
 183       assert( start );
 184       stamp = start;
 185       tabsize = _tabsize;
 186       cursor.row = row;
 187       cursor.col = col;
 188     }
 189
 190   TiXmlCursor             cursor;
 191   const char*             stamp;
 192   int                             tabsize;
 193 };
 194
 195
 196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
 197 {
 198   assert( now );
 199
 200   // Do nothing if the tabsize is 0.
 201   if ( tabsize < 1 )
 202   {
 203     return;
 204   }
 205
 206   // Get the current row, column.
 207   int row = cursor.row;
 208   int col = cursor.col;
 209   const char* p = stamp;
 210   assert( p );
 211
 212   while ( p < now )
 213   {
 214     // Treat p as unsigned, so we have a happy compiler.
 215     const unsigned char* pU = (const unsigned char*)p;
 216
 217     // Code contributed by Fletcher Dunn: (modified by lee)
 218     switch (*pU) {
 219     case 0:
 220       // We *should* never get here, but in case we do, don't
 221       // advance past the terminating null character, ever
 222       return;
 223
 224     case '\r':
 225       // bump down to the next line
 226       ++row;
 227       col = 0;
 228       // Eat the character
 229       ++p;
 230
 231       // Check for \r\n sequence, and treat this as a single character
 232       if (*p == '\n') {
 233         ++p;
 234       }
 235       break;
 236
 237     case '\n':
 238       // bump down to the next line
 239       ++row;
 240       col = 0;
 241
 242       // Eat the character
 243       ++p;
 244
 245       // Check for \n\r sequence, and treat this as a single
 246       // character.  (Yes, this bizarre thing does occur still
 247       // on some arcane platforms...)
 248       if (*p == '\r') {
 249         ++p;
 250       }
 251       break;
 252
 253     case '\t':
 254       // Eat the character
 255       ++p;
 256
 257       // Skip to next tab stop
 258       col = (col / tabsize + 1) * tabsize;
 259       break;
 260
 261     case TIXML_UTF_LEAD_0:
 262       if ( encoding == TIXML_ENCODING_UTF8 )
 263       {
 264         if ( *(p+1) && *(p+2) )
 265         {
 266           // In these cases, don't advance the column. These are
 267           // 0-width spaces.
 268           if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
 269             p += 3;
 270           else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
 271             p += 3;
 272           else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
 273             p += 3;
 274           else
 275           { p +=3; ++col; }       // A normal character.
 276         }
 277       }
 278       else
 279       {
 280         ++p;
 281         ++col;
 282       }
 283       break;
 284
 285     default:
 286       if ( encoding == TIXML_ENCODING_UTF8 )
 287       {
 288         // Eat the 1 to 4 byte utf8 character.
 289         int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
 290         if ( step == 0 )
 291           step = 1;               // Error case from bad encoding, but handle gracefully.
 292         p += step;
 293
 294         // Just advance one column, of course.
 295         ++col;
 296       }
 297       else
 298       {
 299         ++p;
 300         ++col;
 301       }
 302       break;
 303     }
 304   }
 305   cursor.row = row;
 306   cursor.col = col;
 307   assert( cursor.row >= -1 );
 308   assert( cursor.col >= -1 );
 309   stamp = p;
 310   assert( stamp );
 311 }
 312
 313
 314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 315 {
 316   if ( !p || !*p )
 317   {
 318     return 0;
 319   }
 320   if ( encoding == TIXML_ENCODING_UTF8 )
 321   {
 322     while ( *p )
 323     {
 324       const unsigned char* pU = (const unsigned char*)p;
 325
 326       // Skip the stupid Microsoft UTF-8 Byte order marks
 327       if (    *(pU+0)==TIXML_UTF_LEAD_0
 328               && *(pU+1)==TIXML_UTF_LEAD_1
 329               && *(pU+2)==TIXML_UTF_LEAD_2 )
 330       {
 331         p += 3;
 332         continue;
 333       }
 334       else if(*(pU+0)==TIXML_UTF_LEAD_0
 335               && *(pU+1)==0xbfU
 336               && *(pU+2)==0xbeU )
 337       {
 338         p += 3;
 339         continue;
 340       }
 341       else if(*(pU+0)==TIXML_UTF_LEAD_0
 342               && *(pU+1)==0xbfU
 343               && *(pU+2)==0xbfU )
 344       {
 345         p += 3;
 346         continue;
 347       }
 348
 349       if ( IsWhiteSpace( *p ) )               // Still using old rules for white space.
 350         ++p;
 351       else
 352         break;
 353     }
 354   }
 355   else
 356   {
 357     while ( *p && IsWhiteSpace( *p ) )
 358       ++p;
 359   }
 360
 361   return p;
 362 }
 363
 364 #ifdef TIXML_USE_STL
 365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
 366 {
 367   for( ;; )
 368   {
 369     if ( !in->good() ) return false;
 370
 371     int c = in->peek();
 372     // At this scope, we can't get to a document. So fail silently.
 373     if ( !IsWhiteSpace( c ) || c <= 0 )
 374       return true;
 375
 376     *tag += (char) in->get();
 377   }
 378 }
 379
 380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
 381 {
 382   //assert( character > 0 && character < 128 );   // else it won't work in utf-8
 383   while ( in->good() )
 384   {
 385     int c = in->peek();
 386     if ( c == character )
 387       return true;
 388     if ( c <= 0 )           // Silent failure: can't get document at this scope
 389       return false;
 390
 391     in->get();
 392     *tag += (char) c;
 393   }
 394   return false;
 395 }
 396 #endif
 397
 398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
 399 // "assign" optimization removes over 10% of the execution time.
 400 //
 401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
 402 {
 403   // Oddly, not supported on some comilers,
 404   //name->clear();
 405   // So use this:
 406   *name = "";
 407   assert( p );
 408
 409   // Names start with letters or underscores.
 410   // Of course, in unicode, tinyxml has no idea what a letter *is*. The
 411   // algorithm is generous.
 412   //
 413   // After that, they can be letters, underscores, numbers,
 414   // hyphens, or colons. (Colons are valid ony for namespaces,
 415   // but tinyxml can't tell namespaces from names.)
 416   if (    p && *p
 417           && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
 418   {
 419     const char* start = p;
 420     while(          p && *p
 421                     &&      (               IsAlphaNum( (unsigned char ) *p, encoding )
 422                                             || *p == '_'
 423                                             || *p == '-'
 424                                             || *p == '.'
 425                                             || *p == ':' ) )
 426     {
 427       //(*name) += *p; // expensive
 428       ++p;
 429     }
 430     if ( p-start > 0 ) {
 431       name->assign( start, p-start );
 432     }
 433     return p;
 434   }
 435   return 0;
 436 }
 437
 438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
 439 {
 440   // Presume an entity, and pull it out.
 441   TIXML_STRING ent;
 442   int i;
 443   *length = 0;
 444
 445   if ( *(p+1) && *(p+1) == '#' && *(p+2) )
 446   {
 447     unsigned long ucs = 0;
 448     ptrdiff_t delta = 0;
 449     unsigned mult = 1;
 450
 451     if ( *(p+2) == 'x' )
 452     {
 453       // Hexadecimal.
 454       if ( !*(p+3) ) return 0;
 455
 456       const char* q = p+3;
 457       q = strchr( q, ';' );
 458
 459       if ( !q || !*q ) return 0;
 460
 461       delta = q-p;
 462       --q;
 463
 464       while ( *q != 'x' )
 465       {
 466         if ( *q >= '0' && *q <= '9' )
 467           ucs += mult * (*q - '0');
 468         else if ( *q >= 'a' && *q <= 'f' )
 469           ucs += mult * (*q - 'a' + 10);
 470         else if ( *q >= 'A' && *q <= 'F' )
 471           ucs += mult * (*q - 'A' + 10 );
 472         else
 473           return 0;
 474         mult *= 16;
 475         --q;
 476       }
 477     }
 478     else
 479     {
 480       // Decimal.
 481       if ( !*(p+2) ) return 0;
 482
 483       const char* q = p+2;
 484       q = strchr( q, ';' );
 485
 486       if ( !q || !*q ) return 0;
 487
 488       delta = q-p;
 489       --q;
 490
 491       while ( *q != '#' )
 492       {
 493         if ( *q >= '0' && *q <= '9' )
 494           ucs += mult * (*q - '0');
 495         else
 496           return 0;
 497         mult *= 10;
 498         --q;
 499       }
 500     }
 501     if ( encoding == TIXML_ENCODING_UTF8 )
 502     {
 503       // convert the UCS to UTF-8
 504       ConvertUTF32ToUTF8( ucs, value, length );
 505     }
 506     else
 507     {
 508       *value = (char)ucs;
 509       *length = 1;
 510     }
 511     return p + delta + 1;
 512   }
 513
 514   // Now try to match it.
 515   for( i=0; i<NUM_ENTITY; ++i )
 516   {
 517     if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
 518     {
 519       assert( strlen( entity[i].str ) == entity[i].strLength );
 520       *value = entity[i].chr;
 521       *length = 1;
 522       return ( p + entity[i].strLength );
 523     }
 524   }
 525
 526   // So it wasn't an entity, its unrecognized, or something like that.
 527   *value = *p;    // Don't put back the last one, since we return it!
 528   //*length = 1;  // Leave unrecognized entities - this doesn't really work.
 529   // Just writes strange XML.
 530   return p+1;
 531 }
 532
 533
 534 bool TiXmlBase::StringEqual( const char* p,
 535                              const char* tag,
 536                              bool ignoreCase,
 537                              TiXmlEncoding encoding )
 538 {
 539   assert( p );
 540   assert( tag );
 541   if ( !p || !*p )
 542   {
 543     assert( 0 );
 544     return false;
 545   }
 546
 547   const char* q = p;
 548
 549   if ( ignoreCase )
 550   {
 551     while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
 552     {
 553       ++q;
 554       ++tag;
 555     }
 556
 557     if ( *tag == 0 )
 558       return true;
 559   }
 560   else
 561   {
 562     while ( *q && *tag && *q == *tag )
 563     {
 564       ++q;
 565       ++tag;
 566     }
 567
 568     if ( *tag == 0 )                // Have we found the end of the tag, and everything equal?
 569       return true;
 570   }
 571   return false;
 572 }
 573
 574 const char* TiXmlBase::ReadText(        const char* p,
 575                                         TIXML_STRING * text,
 576                                         bool trimWhiteSpace,
 577                                         const char* endTag,
 578                                         bool caseInsensitive,
 579                                         TiXmlEncoding encoding )
 580 {
 581   *text = "";
 582   if (    !trimWhiteSpace                 // certain tags always keep whitespace
 583           || !condenseWhiteSpace )       // if true, whitespace is always kept
 584   {
 585     // Keep all the white space.
 586     while (    p && *p
 587                && !StringEqual( p, endTag, caseInsensitive, encoding )
 588       )
 589     {
 590       int len;
 591       char cArr[4] = { 0, 0, 0, 0 };
 592       p = GetChar( p, cArr, &len, encoding );
 593       text->append( cArr, len );
 594     }
 595   }
 596   else
 597   {
 598     bool whitespace = false;
 599
 600     // Remove leading white space:
 601     p = SkipWhiteSpace( p, encoding );
 602     while (    p && *p
 603                && !StringEqual( p, endTag, caseInsensitive, encoding ) )
 604     {
 605       if ( *p == '\r' || *p == '\n' )
 606       {
 607         whitespace = true;
 608         ++p;
 609       }
 610       else if ( IsWhiteSpace( *p ) )
 611       {
 612         whitespace = true;
 613         ++p;
 614       }
 615       else
 616       {
 617         // If we've found whitespace, add it before the
 618         // new character. Any whitespace just becomes a space.
 619         if ( whitespace )
 620         {
 621           (*text) += ' ';
 622           whitespace = false;
 623         }
 624         int len;
 625         char cArr[4] = { 0, 0, 0, 0 };
 626         p = GetChar( p, cArr, &len, encoding );
 627         if ( len == 1 )
 628           (*text) += cArr[0];     // more efficient
 629         else
 630           text->append( cArr, len );
 631       }
 632     }
 633   }
 634   if ( p && *p )
 635     p += strlen( endTag );
 636   return p;
 637 }
 638
 639 #ifdef TIXML_USE_STL
 640
 641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
 642 {
 643   // The basic issue with a document is that we don't know what we're
 644   // streaming. Read something presumed to be a tag (and hope), then
 645   // identify it, and call the appropriate stream method on the tag.
 646   //
 647   // This "pre-streaming" will never read the closing ">" so the
 648   // sub-tag can orient itself.
 649
 650   if ( !StreamTo( in, '<', tag ) )
 651   {
 652     SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 653     return;
 654   }
 655
 656   while ( in->good() )
 657   {
 658     int tagIndex = (int) tag->length();
 659     while ( in->good() && in->peek() != '>' )
 660     {
 661       int c = in->get();
 662       if ( c <= 0 )
 663       {
 664         SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
 665         break;
 666       }
 667       (*tag) += (char) c;
 668     }
 669
 670     if ( in->good() )
 671     {
 672       // We now have something we presume to be a node of
 673       // some sort. Identify it, and call the node to
 674       // continue streaming.
 675       TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
 676
 677       if ( node )
 678       {
 679         node->StreamIn( in, tag );
 680         bool isElement = node->ToElement() != 0;
 681         delete node;
 682         node = 0;
 683
 684         // If this is the root element, we're done. Parsing will be
 685         // done by the >> operator.
 686         if ( isElement )
 687         {
 688           return;
 689         }
 690       }
 691       else
 692       {
 693         SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
 694         return;
 695       }
 696     }
 697   }
 698   // We should have returned sooner.
 699   SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
 700 }
 701
 702 #endif
 703
 704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
 705 {
 706   ClearError();
 707
 708   // Parse away, at the document level. Since a document
 709   // contains nothing but other tags, most of what happens
 710   // here is skipping white space.
 711   if ( !p || !*p )
 712   {
 713     SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 714     return 0;
 715   }
 716
 717   // Note that, for a document, this needs to come
 718   // before the while space skip, so that parsing
 719   // starts from the pointer we are given.
 720   location.Clear();
 721   if ( prevData )
 722   {
 723     location.row = prevData->cursor.row;
 724     location.col = prevData->cursor.col;
 725   }
 726   else
 727   {
 728     location.row = 0;
 729     location.col = 0;
 730   }
 731   TiXmlParsingData data( p, TabSize(), location.row, location.col );
 732   location = data.Cursor();
 733
 734   if ( encoding == TIXML_ENCODING_UNKNOWN )
 735   {
 736     // Check for the Microsoft UTF-8 lead bytes.
 737     const unsigned char* pU = (const unsigned char*)p;
 738     if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
 739             && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
 740             && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
 741     {
 742       encoding = TIXML_ENCODING_UTF8;
 743       useMicrosoftBOM = true;
 744     }
 745   }
 746
 747   p = SkipWhiteSpace( p, encoding );
 748   if ( !p )
 749   {
 750     SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 751     return 0;
 752   }
 753
 754   while ( p && *p )
 755   {
 756     TiXmlNode* node = Identify( p, encoding );
 757     if ( node )
 758     {
 759       p = node->Parse( p, &data, encoding );
 760       LinkEndChild( node );
 761     }
 762     else
 763     {
 764       break;
 765     }
 766
 767     // Did we get encoding info?
 768     if (    encoding == TIXML_ENCODING_UNKNOWN
 769             && node->ToDeclaration() )
 770     {
 771       TiXmlDeclaration* dec = node->ToDeclaration();
 772       const char* enc = dec->Encoding();
 773       assert( enc );
 774
 775       if ( *enc == 0 )
 776         encoding = TIXML_ENCODING_UTF8;
 777       else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
 778         encoding = TIXML_ENCODING_UTF8;
 779       else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
 780         encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
 781       else
 782         encoding = TIXML_ENCODING_LEGACY;
 783     }
 784
 785     p = SkipWhiteSpace( p, encoding );
 786   }
 787
 788   // Was this empty?
 789   if ( !firstChild ) {
 790     SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
 791     return 0;
 792   }
 793
 794   // All is well.
 795   return p;
 796 }
 797
 798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
 799 {
 800   // The first error in a chain is more accurate - don't set again!
 801   if ( error )
 802     return;
 803
 804   assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
 805   error   = true;
 806   errorId = err;
 807   errorDesc = errorString[ errorId ];
 808
 809   errorLocation.Clear();
 810   if ( pError && data )
 811   {
 812     data->Stamp( pError, encoding );
 813     errorLocation = data->Cursor();
 814   }
 815 }
 816
 817
 818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
 819 {
 820   TiXmlNode* returnNode = 0;
 821
 822   p = SkipWhiteSpace( p, encoding );
 823   if( !p || !*p || *p != '<' )
 824   {
 825     return 0;
 826   }
 827
 828   p = SkipWhiteSpace( p, encoding );
 829
 830   if ( !p || !*p )
 831   {
 832     return 0;
 833   }
 834
 835   // What is this thing?
 836   // - Elements start with a letter or underscore, but xml is reserved.
 837   // - Comments: <!--
 838   // - Decleration: <?xml
 839   // - Everthing else is unknown to tinyxml.
 840   //
 841
 842   const char* xmlHeader = { "<?xml" };
 843   const char* commentHeader = { "<!--" };
 844   const char* dtdHeader = { "<!" };
 845   const char* cdataHeader = { "<![CDATA[" };
 846
 847   if ( StringEqual( p, xmlHeader, true, encoding ) )
 848   {
 849 #ifdef DEBUG_PARSER
 850     TIXML_LOG( "XML parsing Declaration\n" );
 851 #endif
 852     returnNode = new TiXmlDeclaration();
 853   }
 854   else if ( StringEqual( p, commentHeader, false, encoding ) )
 855   {
 856 #ifdef DEBUG_PARSER
 857     TIXML_LOG( "XML parsing Comment\n" );
 858 #endif
 859     returnNode = new TiXmlComment();
 860   }
 861   else if ( StringEqual( p, cdataHeader, false, encoding ) )
 862   {
 863 #ifdef DEBUG_PARSER
 864     TIXML_LOG( "XML parsing CDATA\n" );
 865 #endif
 866     TiXmlText* text = new TiXmlText( "" );
 867     text->SetCDATA( true );
 868     returnNode = text;
 869   }
 870   else if ( StringEqual( p, dtdHeader, false, encoding ) )
 871   {
 872 #ifdef DEBUG_PARSER
 873     TIXML_LOG( "XML parsing Unknown(1)\n" );
 874 #endif
 875     returnNode = new TiXmlUnknown();
 876   }
 877   else if (    IsAlpha( *(p+1), encoding )
 878                || *(p+1) == '_' )
 879   {
 880 #ifdef DEBUG_PARSER
 881     TIXML_LOG( "XML parsing Element\n" );
 882 #endif
 883     returnNode = new TiXmlElement( "" );
 884   }
 885   else
 886   {
 887 #ifdef DEBUG_PARSER
 888     TIXML_LOG( "XML parsing Unknown(2)\n" );
 889 #endif
 890     returnNode = new TiXmlUnknown();
 891   }
 892
 893   if ( returnNode )
 894   {
 895     // Set the parent, so it can report errors
 896     returnNode->parent = this;
 897   }
 898   return returnNode;
 899 }
 900
 901 #ifdef TIXML_USE_STL
 902
// Streams the remainder of this element from 'in', appending every character
// read to 'tag'. Nothing is parsed here; the text is only accumulated.
//
// On entry 'tag' already holds the beginning of the element's opening tag.
// The function reads through the closing '>' of that tag and, unless the tag
// is self-closing ("/>"), keeps reading child text and child nodes until the
// closing tag has been consumed. If get()/peek() yields a value <= 0, an
// embedded-null error is reported to the owning document (if any) and the
// function returns. It also returns silently when the stream stops being
// good().
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
{
  // We're called with some amount of pre-parsing. That is, some of "this"
  // element is in "tag". Go ahead and stream to the closing ">"
  while( in->good() )
  {
    int c = in->get();
    if ( c <= 0 )
    {
      TiXmlDocument* document = GetDocument();
      if ( document )
        document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
      return;
    }
    (*tag) += (char) c ;

    if ( c == '>' )
      break;
  }

  // Too short for the checks below, which look at the last two characters.
  if ( tag->length() < 3 ) return;

  // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
  // If not, identify and stream.

  if (    tag->at( tag->length() - 1 ) == '>'
          && tag->at( tag->length() - 2 ) == '/' )
  {
    // All good!
    return;
  }
  else if ( tag->at( tag->length() - 1 ) == '>' )
  {
    // There is more. Could be:
    //              text
    //              cdata text (which looks like another node)
    //              closing tag
    //              another node.
    for ( ;; )
    {
      StreamWhiteSpace( in, tag );

      // Do we have text?
      if ( in->good() && in->peek() != '<' )
      {
        // Yep, text.
        TiXmlText text( "" );
        text.StreamIn( in, tag );

        // What follows text is a closing tag or another node.
        // Go around again and figure it out.
        continue;
      }

      // We now have either a closing tag...or another node.
      // We should be at a "<", regardless.
      if ( !in->good() ) return;
      assert( in->peek() == '<' );
      // Remember where the upcoming tag starts inside 'tag' so that it can
      // be handed to Identify() further down.
      int tagIndex = (int) tag->length();

      bool closingTag = false;
      bool firstCharFound = false;

      // Read up to, but not including, the '>' of the next tag.
      for( ;; )
      {
        if ( !in->good() )
          return;

        int c = in->peek();
        if ( c <= 0 )
        {
          TiXmlDocument* document = GetDocument();
          if ( document )
            document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
          return;
        }

        if ( c == '>' )
          break;

        *tag += (char) c;
        in->get();

        // Early out if we find the CDATA id.
        if ( c == '[' && tag->size() >= 9 )
        {
          // Compare the last nine characters accumulated so far.
          size_t len = tag->size();
          const char* start = tag->c_str() + len - 9;
          if ( strcmp( start, "<![CDATA[" ) == 0 ) {
            assert( !closingTag );
            break;
          }
        }

        // The first character after '<' that is not white space decides
        // whether this is a closing tag ('/').
        if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
        {
          firstCharFound = true;
          if ( c == '/' )
            closingTag = true;
        }
      }
      // If it was a closing tag, then read in the closing '>' to clean up the input stream.
      // If it was not, the streaming will be done by the tag.
      if ( closingTag )
      {
        if ( !in->good() )
          return;

        int c = in->get();
        if ( c <= 0 )
        {
          TiXmlDocument* document = GetDocument();
          if ( document )
            document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
          return;
        }
        assert( c == '>' );
        *tag += (char) c;

        // We are done, once we've found our closing tag.
        return;
      }
      else
      {
        // If not a closing tag, id it, and stream.
        const char* tagloc = tag->c_str() + tagIndex;
        TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
        if ( !node )
          return;
        node->StreamIn( in, tag );
        // The node was only needed to drive the streaming; discard it.
        delete node;
        node = 0;

        // No return: go around from the beginning: text, closing tag, or node.
      }
    }
  }
}
1041 #endif
1042
1043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1044 {
1045   p = SkipWhiteSpace( p, encoding );
1046   TiXmlDocument* document = GetDocument();
1047
1048   if ( !p || !*p )
1049   {
1050     if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1051     return 0;
1052   }
1053
1054   if ( data )
1055   {
1056     data->Stamp( p, encoding );
1057     location = data->Cursor();
1058   }
1059
1060   if ( *p != '<' )
1061   {
1062     if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1063     return 0;
1064   }
1065
1066   p = SkipWhiteSpace( p+1, encoding );
1067
1068   // Read the name.
1069   const char* pErr = p;
1070
1071   p = ReadName( p, &value, encoding );
1072   if ( !p || !*p )
1073   {
1074     if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1075     return 0;
1076   }
1077
1078   TIXML_STRING endTag ("</");
1079   endTag += value;
1080
1081   // Check for and read attributes. Also look for an empty
1082   // tag or an end tag.
1083   while ( p && *p )
1084   {
1085     pErr = p;
1086     p = SkipWhiteSpace( p, encoding );
1087     if ( !p || !*p )
1088     {
1089       if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1090       return 0;
1091     }
1092     if ( *p == '/' )
1093     {
1094       ++p;
1095       // Empty tag.
1096       if ( *p  != '>' )
1097       {
1098         if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1099         return 0;
1100       }
1101       return (p+1);
1102     }
1103     else if ( *p == '>' )
1104     {
1105       // Done with attributes (if there were any.)
1106       // Read the value -- which can include other
1107       // elements -- read the end tag, and return.
1108       ++p;
1109       p = ReadValue( p, data, encoding );             // Note this is an Element method, and will set the error if one happens.
1110       if ( !p || !*p ) {
1111         // We were looking for the end tag, but found nothing.
1112         // Fix for [ 1663758 ] Failure to report error on bad XML
1113         if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1114         return 0;
1115       }
1116
1117       // We should find the end tag now
1118       // note that:
1119       // </foo > and
1120       // </foo>
1121       // are both valid end tags.
1122       if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1123       {
1124         p += endTag.length();
1125         p = SkipWhiteSpace( p, encoding );
1126         if ( p && *p && *p == '>' ) {
1127           ++p;
1128           return p;
1129         }
1130         if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1131         return 0;
1132       }
1133       else
1134       {
1135         if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1136         return 0;
1137       }
1138     }
1139     else
1140     {
1141       // Try to read an attribute:
1142       TiXmlAttribute* attrib = new TiXmlAttribute();
1143       if ( !attrib )
1144       {
1145         return 0;
1146       }
1147
1148       attrib->SetDocument( document );
1149       pErr = p;
1150       p = attrib->Parse( p, data, encoding );
1151
1152       if ( !p || !*p )
1153       {
1154         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1155         delete attrib;
1156         return 0;
1157       }
1158
1159       // Handle the strange case of double attributes:
1160 #ifdef TIXML_USE_STL
1161       TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1162 #else
1163       TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1164 #endif
1165       if ( node )
1166       {
1167         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1168         delete attrib;
1169         return 0;
1170       }
1171
1172       attributeSet.Add( attrib );
1173     }
1174   }
1175   return p;
1176 }
1177
1178
1179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1180 {
1181   TiXmlDocument* document = GetDocument();
1182
1183   // Read in text and elements in any order.
1184   const char* pWithWhiteSpace = p;
1185   p = SkipWhiteSpace( p, encoding );
1186
1187   while ( p && *p )
1188   {
1189     if ( *p != '<' )
1190     {
1191       // Take what we have, make a text element.
1192       TiXmlText* textNode = new TiXmlText( "" );
1193
1194       if ( !textNode )
1195       {
1196         return 0;
1197       }
1198
1199       if ( TiXmlBase::IsWhiteSpaceCondensed() )
1200       {
1201         p = textNode->Parse( p, data, encoding );
1202       }
1203       else
1204       {
1205         // Special case: we want to keep the white space
1206         // so that leading spaces aren't removed.
1207         p = textNode->Parse( pWithWhiteSpace, data, encoding );
1208       }
1209
1210       if ( !textNode->Blank() )
1211         LinkEndChild( textNode );
1212       else
1213         delete textNode;
1214     }
1215     else
1216     {
1217       // We hit a '<'
1218       // Have we hit a new element or an end tag? This could also be
1219       // a TiXmlText in the "CDATA" style.
1220       if ( StringEqual( p, "</", false, encoding ) )
1221       {
1222         return p;
1223       }
1224       else
1225       {
1226         TiXmlNode* node = Identify( p, encoding );
1227         if ( node )
1228         {
1229           p = node->Parse( p, data, encoding );
1230           LinkEndChild( node );
1231         }
1232         else
1233         {
1234           return 0;
1235         }
1236       }
1237     }
1238     pWithWhiteSpace = p;
1239     p = SkipWhiteSpace( p, encoding );
1240   }
1241
1242   if ( !p )
1243   {
1244     if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1245   }
1246   return p;
1247 }
1248
1249
1250 #ifdef TIXML_USE_STL
1251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
1252 {
1253   while ( in->good() )
1254   {
1255     int c = in->get();
1256     if ( c <= 0 )
1257     {
1258       TiXmlDocument* document = GetDocument();
1259       if ( document )
1260         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1261       return;
1262     }
1263     (*tag) += (char) c;
1264
1265     if ( c == '>' )
1266     {
1267       // All is well.
1268       return;
1269     }
1270   }
1271 }
1272 #endif
1273
1274
1275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1276 {
1277   TiXmlDocument* document = GetDocument();
1278   p = SkipWhiteSpace( p, encoding );
1279
1280   if ( data )
1281   {
1282     data->Stamp( p, encoding );
1283     location = data->Cursor();
1284   }
1285   if ( !p || !*p || *p != '<' )
1286   {
1287     if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1288     return 0;
1289   }
1290   ++p;
1291   value = "";
1292
1293   while ( p && *p && *p != '>' )
1294   {
1295     value += *p;
1296     ++p;
1297   }
1298
1299   if ( !p )
1300   {
1301     if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1302   }
1303   if ( *p == '>' )
1304     return p+1;
1305   return p;
1306 }
1307
1308 #ifdef TIXML_USE_STL
1309 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
1310 {
1311   while ( in->good() )
1312   {
1313     int c = in->get();
1314     if ( c <= 0 )
1315     {
1316       TiXmlDocument* document = GetDocument();
1317       if ( document )
1318         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1319       return;
1320     }
1321
1322     (*tag) += (char) c;
1323
1324     if ( c == '>'
1325          && tag->at( tag->length() - 2 ) == '-'
1326          && tag->at( tag->length() - 3 ) == '-' )
1327     {
1328       // All is well.
1329       return;
1330     }
1331   }
1332 }
1333 #endif
1334
1335
1336 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1337 {
1338   TiXmlDocument* document = GetDocument();
1339   value = "";
1340
1341   p = SkipWhiteSpace( p, encoding );
1342
1343   if ( data )
1344   {
1345     data->Stamp( p, encoding );
1346     location = data->Cursor();
1347   }
1348   const char* startTag = "<!--";
1349   const char* endTag   = "-->";
1350
1351   if ( !StringEqual( p, startTag, false, encoding ) )
1352   {
1353     document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1354     return 0;
1355   }
1356   p += strlen( startTag );
1357
1358   // [ 1475201 ] TinyXML parses entities in comments
1359   // Oops - ReadText doesn't work, because we don't want to parse the entities.
1360   // p = ReadText( p, &value, false, endTag, false, encoding );
1361   //
1362   // from the XML spec:
1363   /*
1364     [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1365     they may appear within the document type declaration at places allowed by the grammar.
1366     They are not part of the document's character data; an XML processor MAY, but need not,
1367     make it possible for an application to retrieve the text of comments. For compatibility,
1368     the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1369     references MUST NOT be recognized within comments.
1370
1371     An example of a comment:
1372
1373     <!-- declarations for <head> & <body> -->
1374   */
1375
1376   value = "";
1377   // Keep all the white space.
1378   while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
1379   {
1380     value.append( p, 1 );
1381     ++p;
1382   }
1383   if ( p && *p )
1384     p += strlen( endTag );
1385
1386   return p;
1387 }
1388
1389
1390 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1391 {
1392   p = SkipWhiteSpace( p, encoding );
1393   if ( !p || !*p ) return 0;
1394
1395   if ( data )
1396   {
1397     data->Stamp( p, encoding );
1398     location = data->Cursor();
1399   }
1400   // Read the name, the '=' and the value.
1401   const char* pErr = p;
1402   p = ReadName( p, &name, encoding );
1403   if ( !p || !*p )
1404   {
1405     if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1406     return 0;
1407   }
1408   p = SkipWhiteSpace( p, encoding );
1409   if ( !p || !*p || *p != '=' )
1410   {
1411     if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1412     return 0;
1413   }
1414
1415   ++p;    // skip '='
1416   p = SkipWhiteSpace( p, encoding );
1417   if ( !p || !*p )
1418   {
1419     if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1420     return 0;
1421   }
1422
1423   const char* end;
1424   const char SINGLE_QUOTE = '\'';
1425   const char DOUBLE_QUOTE = '\"';
1426
1427   if ( *p == SINGLE_QUOTE )
1428   {
1429     ++p;
1430     end = "\'";             // single quote in string
1431     p = ReadText( p, &value, false, end, false, encoding );
1432   }
1433   else if ( *p == DOUBLE_QUOTE )
1434   {
1435     ++p;
1436     end = "\"";             // double quote in string
1437     p = ReadText( p, &value, false, end, false, encoding );
1438   }
1439   else
1440   {
1441     // All attribute values should be in single or double quotes.
1442     // But this is such a common error that the parser will try
1443     // its best, even without them.
1444     value = "";
1445     while (    p && *p                                                                                      // existence
1446                && !IsWhiteSpace( *p )                                                          // whitespace
1447                && *p != '/' && *p != '>' )                                                     // tag end
1448     {
1449       if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1450         // [ 1451649 ] Attribute values with trailing quotes not handled correctly
1451         // We did not have an opening quote but seem to have a
1452         // closing one. Give up and throw an error.
1453         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1454         return 0;
1455       }
1456       value += *p;
1457       ++p;
1458     }
1459   }
1460   return p;
1461 }
1462
1463 #ifdef TIXML_USE_STL
1464 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1465 {
1466   while ( in->good() )
1467   {
1468     int c = in->peek();
1469     if ( !cdata && (c == '<' ) )
1470     {
1471       return;
1472     }
1473     if ( c <= 0 )
1474     {
1475       TiXmlDocument* document = GetDocument();
1476       if ( document )
1477         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1478       return;
1479     }
1480
1481     (*tag) += (char) c;
1482     in->get();      // "commits" the peek made above
1483
1484     if ( cdata && c == '>' && tag->size() >= 3 ) {
1485       size_t len = tag->size();
1486       if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1487         // terminator of cdata.
1488         return;
1489       }
1490     }
1491   }
1492 }
1493 #endif
1494
1495 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1496 {
1497   value = "";
1498   TiXmlDocument* document = GetDocument();
1499
1500   if ( data )
1501   {
1502     data->Stamp( p, encoding );
1503     location = data->Cursor();
1504   }
1505
1506   const char* const startTag = "<![CDATA[";
1507   const char* const endTag   = "]]>";
1508
1509   if ( cdata || StringEqual( p, startTag, false, encoding ) )
1510   {
1511     cdata = true;
1512
1513     if ( !StringEqual( p, startTag, false, encoding ) )
1514     {
1515       document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1516       return 0;
1517     }
1518     p += strlen( startTag );
1519
1520     // Keep all the white space, ignore the encoding, etc.
1521     while (    p && *p
1522                && !StringEqual( p, endTag, false, encoding )
1523       )
1524     {
1525       value += *p;
1526       ++p;
1527     }
1528
1529     TIXML_STRING dummy;
1530     p = ReadText( p, &dummy, false, endTag, false, encoding );
1531     return p;
1532   }
1533   else
1534   {
1535     bool ignoreWhite = true;
1536
1537     const char* end = "<";
1538     p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1539     if ( p )
1540       return p-1;     // don't truncate the '<'
1541     return 0;
1542   }
1543 }
1544
1545 #ifdef TIXML_USE_STL
1546 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1547 {
1548   while ( in->good() )
1549   {
1550     int c = in->get();
1551     if ( c <= 0 )
1552     {
1553       TiXmlDocument* document = GetDocument();
1554       if ( document )
1555         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1556       return;
1557     }
1558     (*tag) += (char) c;
1559
1560     if ( c == '>' )
1561     {
1562       // All is well.
1563       return;
1564     }
1565   }
1566 }
1567 #endif
1568
1569 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1570 {
1571   p = SkipWhiteSpace( p, _encoding );
1572   // Find the beginning, find the end, and look for
1573   // the stuff in-between.
1574   TiXmlDocument* document = GetDocument();
1575   if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1576   {
1577     if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1578     return 0;
1579   }
1580   if ( data )
1581   {
1582     data->Stamp( p, _encoding );
1583     location = data->Cursor();
1584   }
1585   p += 5;
1586
1587   version = "";
1588   encoding = "";
1589   standalone = "";
1590
1591   while ( p && *p )
1592   {
1593     if ( *p == '>' )
1594     {
1595       ++p;
1596       return p;
1597     }
1598
1599     p = SkipWhiteSpace( p, _encoding );
1600     if ( StringEqual( p, "version", true, _encoding ) )
1601     {
1602       TiXmlAttribute attrib;
1603       p = attrib.Parse( p, data, _encoding );
1604       version = attrib.Value();
1605     }
1606     else if ( StringEqual( p, "encoding", true, _encoding ) )
1607     {
1608       TiXmlAttribute attrib;
1609       p = attrib.Parse( p, data, _encoding );
1610       encoding = attrib.Value();
1611     }
1612     else if ( StringEqual( p, "standalone", true, _encoding ) )
1613     {
1614       TiXmlAttribute attrib;
1615       p = attrib.Parse( p, data, _encoding );
1616       standalone = attrib.Value();
1617     }
1618     else
1619     {
1620       // Read over whatever it is.
1621       while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1622         ++p;
1623     }
1624   }
1625   return 0;
1626 }
1627
1628 bool TiXmlText::Blank() const
1629 {
1630   for ( unsigned i=0; i<value.length(); i++ )
1631     if ( !IsWhiteSpace( value[i] ) )
1632       return false;
1633   return true;
1634 }