A port of the irrlicht XML parser library.

Revision:
0:41a49a73580c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CXMLReaderImpl.h	Wed Nov 17 20:19:41 2010 +0000
@@ -0,0 +1,797 @@
+// Copyright (C) 2002-2005 Nikolaus Gebhardt
+// This file is part of the "Irrlicht Engine" and the "irrXML" project.
+// For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h
+
+#ifndef __ICXML_READER_IMPL_H_INCLUDED__
+#define __ICXML_READER_IMPL_H_INCLUDED__
+
+#include "irrXML.h"
+#include "irrString.h"
+#include "irrArray.h"
+#include "fast_atof.h"
+
+#ifdef _DEBUG
+#define IRR_DEBUGPRINT(x) printf((x));
+#else // _DEBUG 
+#define IRR_DEBUGPRINT(x)
+#endif // _DEBUG
+
+
+namespace irr
+{
+namespace io
+{
+
+
+//! implementation of the IrrXMLReader
+template<class char_type, class superclass>
+class CXMLReaderImpl : public IIrrXMLReader<char_type, superclass>
+{
+public:
+
+	//! Constructor
+	CXMLReaderImpl(IFileReadCallBack* callback, bool deleteCallBack = true)
+		: TextData(0), P(0), TextSize(0), TextBegin(0), CurrentNodeType(EXN_NONE),
+		SourceFormat(ETF_ASCII), TargetFormat(ETF_ASCII)
+	{
+		if (!callback)
+			return;
+
+		storeTargetFormat();
+
+		// read whole xml file
+
+		readFile(callback);
+		
+		// clean up
+
+		if (deleteCallBack)
+			delete callback;
+
+		// create list with special characters
+
+		createSpecialCharacterList();
+
+		// set pointer to text begin
+		P = TextBegin;
+	}
+    	
+
+	//! Destructor
+	virtual ~CXMLReaderImpl()
+	{
+		delete [] TextData;
+	}
+
+
+	//! Reads forward to the next xml node. 
+	//! \return Returns false, if there was no further node. 
+	virtual bool read()
+	{
+		// if not end reached, parse the node
+		if (P && (unsigned int)(P - TextBegin) < TextSize - 1 && *P != 0)
+		{
+			parseCurrentNode();
+			return true;
+		}
+
+		_IRR_IMPLEMENT_MANAGED_MARSHALLING_BUGFIX;
+		return false;
+	}
+
+
+	//! Returns the type of the current XML node.
+	virtual EXML_NODE getNodeType() const
+	{
+		return CurrentNodeType;
+	}
+
+
+	//! Returns attribute count of the current XML node.
+	virtual int getAttributeCount() const
+	{
+		return Attributes.size();
+	}
+
+
+	//! Returns name of an attribute.
+	virtual const char_type* getAttributeName(int idx) const
+	{
+		if (idx < 0 || idx >= (int)Attributes.size())
+			return 0;
+
+		return Attributes[idx].Name.c_str();
+	}
+
+
+	//! Returns the value of an attribute. 
+	virtual const char_type* getAttributeValue(int idx) const
+	{
+		if (idx < 0 || idx >= (int)Attributes.size())
+			return 0;
+
+		return Attributes[idx].Value.c_str();
+	}
+
+
+	//! Returns the value of an attribute. 
+	virtual const char_type* getAttributeValue(const char_type* name) const
+	{
+		const SAttribute* attr = getAttributeByName(name);
+		if (!attr)
+			return 0;
+
+		return attr->Value.c_str();
+	}
+
+
+	//! Returns the value of an attribute
+	virtual const char_type* getAttributeValueSafe(const char_type* name) const
+	{
+		const SAttribute* attr = getAttributeByName(name);
+		if (!attr)
+			return EmptyString.c_str();
+
+		return attr->Value.c_str();
+	}
+
+
+
+	//! Returns the value of an attribute as integer. 
+	virtual int getAttributeValueAsInt(const char_type* name) const
+	{
+		return (int)getAttributeValueAsFloat(name);
+	}
+
+
+	//! Returns the value of an attribute as integer. 
+    virtual int getAttributeValueAsInt(int idx) const
+	{
+		return (int)getAttributeValueAsFloat(idx);
+	}
+
+
+	//! Returns the value of an attribute as float. 
+	virtual float getAttributeValueAsFloat(const char_type* name) const
+	{
+		const SAttribute* attr = getAttributeByName(name);
+		if (!attr)
+			return 0;
+
+		core::stringc c = attr->Value.c_str();
+		return core::fast_atof(c.c_str());
+	}
+
+
+	//! Returns the value of an attribute as float. 
+	virtual float getAttributeValueAsFloat(int idx) const
+	{
+		const char_type* attrvalue = getAttributeValue(idx);
+		if (!attrvalue)
+			return 0;
+
+		core::stringc c = attrvalue;
+		return core::fast_atof(c.c_str());
+	}
+
+
+	//! Returns the name of the current node.
+	virtual const char_type* getNodeName() const
+	{
+		return NodeName.c_str();
+	}
+
+
+	//! Returns data of the current node.
+	virtual const char_type* getNodeData() const
+	{
+		return NodeName.c_str();
+	}
+
+
+	//! Returns if an element is an empty element, like <foo />
+	virtual bool isEmptyElement() const
+	{
+		return IsEmptyElement;
+	}
+
+	//! Returns format of the source xml file.
+	virtual ETEXT_FORMAT getSourceFormat() const
+	{
+		return SourceFormat;
+	}
+
+	//! Returns format of the strings returned by the parser.
+	virtual ETEXT_FORMAT getParserFormat() const
+	{
+		return TargetFormat;
+	}
+
+private:
+
+	// Reads the current xml node
+	void parseCurrentNode()
+	{
+		char_type* start = P;
+
+		// more forward until '<' found
+		while(*P != L'<' && *P)
+			++P;
+
+		if (!*P)
+			return;
+
+		if (P - start > 0)
+		{
+			// we found some text, store it
+			if (setText(start, P))
+				return;
+		}
+
+		++P;
+
+		// based on current token, parse and report next element
+		switch(*P)
+		{
+		case L'/':
+			parseClosingXMLElement(); 
+			break;
+		case L'?':
+			ignoreDefinition();	
+			break;
+		case L'!':
+			if (!parseCDATA())
+				parseComment();	
+			break;
+		default:
+			parseOpeningXMLElement();
+			break;
+		}
+	}
+
+
+	//! sets the state that text was found. Returns true if set should be set
+	bool setText(char_type* start, char_type* end)
+	{
+		// check if text is more than 2 characters, and if not, check if there is 
+		// only white space, so that this text won't be reported
+		if (end - start < 3)
+		{
+			char_type* p = start;
+			for(; p != end; ++p)
+				if (!isWhiteSpace(*p))
+					break;
+
+			if (p == end)
+				return false;
+		}
+
+		// set current text to the parsed text, and replace xml special characters
+		core::string<char_type> s(start, (int)(end - start));
+		NodeName = replaceSpecialCharacters(s);
+
+		// current XML node type is text
+		CurrentNodeType = EXN_TEXT;
+
+		return true;
+	}
+
+
+
+	//! ignores an xml definition like <?xml something />
+	void ignoreDefinition()
+	{
+		CurrentNodeType = EXN_UNKNOWN;
+
+		// move until end marked with '>' reached
+		while(*P != L'>')
+			++P;
+
+		++P;
+	}
+
+
+	//! parses a comment
+	void parseComment()
+	{
+		CurrentNodeType = EXN_COMMENT;
+		P += 1;
+
+		char_type *pCommentBegin = P;
+
+		int count = 1;
+
+		// move until end of comment reached
+		while(count)
+		{
+			if (*P == L'>')
+				--count;
+			else
+			if (*P == L'<')
+				++count;
+
+			++P;
+		}
+
+		P -= 3;
+		NodeName = core::string<char_type>(pCommentBegin+2, (int)(P - pCommentBegin-2));
+		P += 3;
+	}
+
+
+	//! parses an opening xml element and reads attributes
+	void parseOpeningXMLElement()
+	{
+		CurrentNodeType = EXN_ELEMENT;
+		IsEmptyElement = false;
+		Attributes.clear();
+
+		// find name
+		const char_type* startName = P;
+
+		// find end of element
+		while(*P != L'>' && !isWhiteSpace(*P))
+			++P;
+
+		const char_type* endName = P;
+
+		// find Attributes
+		while(*P != L'>')
+		{
+			if (isWhiteSpace(*P))
+				++P;
+			else
+			{
+				if (*P != L'/')
+				{
+					// we've got an attribute
+
+					// read the attribute names
+					const char_type* attributeNameBegin = P;
+
+					while(!isWhiteSpace(*P) && *P != L'=')
+						++P;
+
+					const char_type* attributeNameEnd = P;
+					++P;
+
+					// read the attribute value
+					// check for quotes and single quotes, thx to murphy
+					while( (*P != L'\"') && (*P != L'\'') && *P) 
+						++P;
+
+					if (!*P) // malformatted xml file
+						return;
+
+					const char_type attributeQuoteChar = *P;
+
+					++P;
+					const char_type* attributeValueBegin = P;
+					
+					while(*P != attributeQuoteChar && *P)
+						++P;
+
+					if (!*P) // malformatted xml file
+						return;
+
+					const char_type* attributeValueEnd = P;
+					++P;
+
+					SAttribute attr;
+					attr.Name = core::string<char_type>(attributeNameBegin, 
+						(int)(attributeNameEnd - attributeNameBegin));
+
+					core::string<char_type> s(attributeValueBegin, 
+						(int)(attributeValueEnd - attributeValueBegin));
+
+					attr.Value = replaceSpecialCharacters(s);
+					Attributes.push_back(attr);
+				}
+				else
+				{
+					// tag is closed directly
+					++P;
+					IsEmptyElement = true;
+					break;
+				}
+			}
+		}
+
+		// check if this tag is closing directly
+		if (endName > startName && *(endName-1) == L'/')
+		{
+			// directly closing tag
+			IsEmptyElement = true;
+			endName--;
+		}
+		
+		NodeName = core::string<char_type>(startName, (int)(endName - startName));
+
+		++P;
+	}
+
+
+	//! parses an closing xml tag
+	void parseClosingXMLElement()
+	{
+		CurrentNodeType = EXN_ELEMENT_END;
+		IsEmptyElement = false;
+		Attributes.clear();
+
+		++P;
+		const char_type* pBeginClose = P;
+
+		while(*P != L'>')
+			++P;
+
+		NodeName = core::string<char_type>(pBeginClose, (int)(P - pBeginClose));
+		++P;
+	}
+
+	//! parses a possible CDATA section, returns false if begin was not a CDATA section
+	bool parseCDATA()
+	{
+		if (*(P+1) != L'[')
+			return false;
+
+		CurrentNodeType = EXN_CDATA;
+
+		// skip '<![CDATA['
+		int count=0;
+		while( *P && count<8 )
+		{
+			++P;
+			++count;
+		}
+
+		if (!*P)
+			return true;
+
+		char_type *cDataBegin = P;
+		char_type *cDataEnd = 0;
+
+		// find end of CDATA
+		while(*P && !cDataEnd)
+		{
+			if (*P == L'>' && 
+			   (*(P-1) == L']') &&
+			   (*(P-2) == L']'))
+			{
+				cDataEnd = P - 2;
+			}
+
+			++P;
+		}
+
+		if ( cDataEnd )
+			NodeName = core::string<char_type>(cDataBegin, (int)(cDataEnd - cDataBegin));
+		else
+			NodeName = "";
+
+		return true;
+	}
+
+
+	// structure for storing attribute-name pairs
+	struct SAttribute
+	{
+		core::string<char_type> Name;
+		core::string<char_type> Value;
+	};
+
+	// finds a current attribute by name, returns 0 if not found
+	const SAttribute* getAttributeByName(const char_type* name) const
+	{
+		if (!name)
+			return 0;
+
+		core::string<char_type> n = name;
+
+		for (int i=0; i<(int)Attributes.size(); ++i)
+			if (Attributes[i].Name == n)
+				return &Attributes[i];
+
+		return 0;
+	}
+
+	// replaces xml special characters in a string and creates a new one
+	core::string<char_type> replaceSpecialCharacters(
+		core::string<char_type>& origstr)
+	{
+		int pos = origstr.findFirst(L'&');
+		int oldPos = 0;
+
+		if (pos == -1)
+			return origstr;
+
+		core::string<char_type> newstr;
+
+		while(pos != -1 && pos < origstr.size()-2)
+		{
+			// check if it is one of the special characters
+
+			int specialChar = -1;
+			for (int i=0; i<(int)SpecialCharacters.size(); ++i)
+			{
+				const char_type* p = &origstr.c_str()[pos]+1;
+
+				if (equalsn(&SpecialCharacters[i][1], p, SpecialCharacters[i].size()-1))
+				{
+					specialChar = i;
+					break;
+				}
+			}
+
+			if (specialChar != -1)
+			{
+				newstr.append(origstr.subString(oldPos, pos - oldPos));
+				newstr.append(SpecialCharacters[specialChar][0]);
+				pos += SpecialCharacters[specialChar].size();
+			}
+			else
+			{
+				newstr.append(origstr.subString(oldPos, pos - oldPos + 1));
+				pos += 1;
+			}
+
+			// find next &
+			oldPos = pos;
+			pos = origstr.findNext(L'&', pos);		
+		}
+
+		if (oldPos < origstr.size()-1)
+			newstr.append(origstr.subString(oldPos, origstr.size()-oldPos));
+
+		return newstr;
+	}
+
+
+
+	//! reads the xml file and converts it into the wanted character format.
+	bool readFile(IFileReadCallBack* callback)
+	{
+		int size = callback->getSize();		
+		size += 4; // We need two terminating 0's at the end.
+		           // For ASCII we need 1 0's, for UTF-16 2, for UTF-32 4.
+
+		char* data8 = new char[size];
+
+		if (!callback->read(data8, size-4))
+		{
+			delete [] data8;
+			return false;
+		}
+
+		// add zeros at end
+
+		data8[size-1] = 0;
+		data8[size-2] = 0;
+		data8[size-3] = 0;
+		data8[size-4] = 0;
+
+		char16* data16 = reinterpret_cast<char16*>(data8);
+		char32* data32 = reinterpret_cast<char32*>(data8);	
+
+		// now we need to convert the data to the desired target format
+		// based on the byte order mark.
+
+		const unsigned char UTF8[] = {0xEF, 0xBB, 0xBF}; // 0xEFBBBF;
+		const int UTF16_BE = 0xFFFE;
+		const int UTF16_LE = 0xFEFF;
+		const int UTF32_BE = 0xFFFE0000;
+		const int UTF32_LE = 0x0000FEFF;
+
+		// check source for all utf versions and convert to target data format
+		
+		if (size >= 4 && data32[0] == (char32)UTF32_BE)
+		{
+			// UTF-32, big endian
+			SourceFormat = ETF_UTF32_BE;
+			convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
+		}
+		else
+		if (size >= 4 && data32[0] == (char32)UTF32_LE)
+		{
+			// UTF-32, little endian
+			SourceFormat = ETF_UTF32_LE;
+			convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
+		}
+		else
+		if (size >= 2 && data16[0] == UTF16_BE)
+		{
+			// UTF-16, big endian
+			SourceFormat = ETF_UTF16_BE;
+			convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
+		}
+		else
+		if (size >= 2 && data16[0] == UTF16_LE)
+		{
+			// UTF-16, little endian
+			SourceFormat = ETF_UTF16_LE;
+			convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
+		}
+		else
+		if (size >= 3 && data8[0] == UTF8[0] && data8[1] == UTF8[1] && data8[2] == UTF8[2])
+		{
+			// UTF-8
+			SourceFormat = ETF_UTF8;
+			convertTextData(data8+3, data8, size); // data8+3 because we need to skip the header
+		}
+		else
+		{
+			// ASCII
+			SourceFormat = ETF_ASCII;
+			convertTextData(data8, data8, size);
+		}
+
+		return true;
+	}
+
+
+	//! converts the text file into the desired format.
+	//! \param source: begin of the text (without byte order mark)
+	//! \param pointerToStore: pointer to text data block which can be
+	//! stored or deleted based on the nesessary conversion.
+	//! \param sizeWithoutHeader: Text size in characters without header
+	template<class src_char_type>
+	void convertTextData(src_char_type* source, char* pointerToStore, int sizeWithoutHeader)
+	{
+		// convert little to big endian if necessary
+		if (sizeof(src_char_type) > 1 && 
+			isLittleEndian(TargetFormat) != isLittleEndian(SourceFormat))
+			convertToLittleEndian(source);
+
+		// check if conversion is necessary:
+		if (sizeof(src_char_type) == sizeof(char_type))
+		{
+			// no need to convert
+			TextBegin = (char_type*)source;
+			TextData = (char_type*)pointerToStore;
+			TextSize = sizeWithoutHeader;
+		}
+		else
+		{
+			// convert source into target data format. 
+			// TODO: implement a real conversion. This one just 
+			// copies bytes. This is a problem when there are 
+			// unicode symbols using more than one character.
+
+			TextData = new char_type[sizeWithoutHeader];
+
+			for (int i=0; i<sizeWithoutHeader; ++i)
+				TextData[i] = (char_type)source[i];
+
+			TextBegin = TextData;
+			TextSize = sizeWithoutHeader;
+
+			// delete original data because no longer needed
+			delete [] pointerToStore;
+		}
+	}
+
+	//! converts whole text buffer to little endian
+	template<class src_char_type>
+	void convertToLittleEndian(src_char_type* t)
+	{
+		if (sizeof(src_char_type) == 4) 
+		{
+			// 32 bit
+
+			while(*t)
+			{
+				*t = ((*t & 0xff000000) >> 24) |
+				     ((*t & 0x00ff0000) >> 8)  |
+				     ((*t & 0x0000ff00) << 8)  |
+				     ((*t & 0x000000ff) << 24);
+				++t;
+			}
+		}
+		else
+		{
+			// 16 bit 
+
+			while(*t)
+			{
+				*t = (*t >> 8) | (*t << 8);
+				++t;
+			}
+		}
+	}
+
+	//! returns if a format is little endian
+	inline bool isLittleEndian(ETEXT_FORMAT f)
+	{
+		return f == ETF_ASCII ||
+		       f == ETF_UTF8 ||
+		       f == ETF_UTF16_LE ||
+		       f == ETF_UTF32_LE;
+	}
+
+
+	//! returns true if a character is whitespace
+	inline bool isWhiteSpace(char_type c)
+	{
+		return (c==' ' || c=='\t' || c=='\n' || c=='\r');
+	}
+
+
+	//! generates a list with xml special characters
+	void createSpecialCharacterList()
+	{
+		// list of strings containing special symbols, 
+		// the first character is the special character,
+		// the following is the symbol string without trailing &.
+
+		SpecialCharacters.push_back("&amp;");
+		SpecialCharacters.push_back("<lt;");
+		SpecialCharacters.push_back(">gt;");
+		SpecialCharacters.push_back("\"quot;");
+		SpecialCharacters.push_back("'apos;");
+		
+	}
+
+
+	//! compares the first n characters of the strings
+	bool equalsn(const char_type* str1, const char_type* str2, int len)
+	{
+		int i;
+		for(i=0; str1[i] && str2[i] && i < len; ++i)
+			if (str1[i] != str2[i])
+				return false;
+
+		// if one (or both) of the strings was smaller then they
+		// are only equal if they have the same lenght
+		return (i == len) || (str1[i] == 0 && str2[i] == 0);
+	}
+
+
+	//! stores the target text format
+	void storeTargetFormat()
+	{
+		// get target format. We could have done this using template specialization,
+		// but VisualStudio 6 don't like it and we want to support it.
+
+		switch(sizeof(char_type))
+		{
+		case 1: 
+			TargetFormat = ETF_UTF8;
+			break;
+		case 2: 
+			TargetFormat = ETF_UTF16_LE;
+			break;
+		case 4: 
+			TargetFormat = ETF_UTF32_LE;
+			break;
+		default:
+			TargetFormat = ETF_ASCII; // should never happen.
+		}
+	}
+
+
+	// instance variables:
+
+	char_type* TextData;         // data block of the text file
+	char_type* P;                // current point in text to parse
+	char_type* TextBegin;        // start of text to parse
+	unsigned int TextSize;       // size of text to parse in characters, not bytes
+
+	EXML_NODE CurrentNodeType;   // type of the currently parsed node
+	ETEXT_FORMAT SourceFormat;   // source format of the xml file
+	ETEXT_FORMAT TargetFormat;   // output format of this parser
+
+	core::string<char_type> NodeName;    // name of the node currently in
+	core::string<char_type> EmptyString; // empty string to be returned by getSafe() methods
+
+	bool IsEmptyElement;       // is the currently parsed node empty?
+
+	core::array< core::string<char_type> > SpecialCharacters; // see createSpecialCharacterList()
+
+	core::array<SAttribute> Attributes; // attributes of current element
+	
+}; // end CXMLReaderImpl
+
+
+} // end namespace
+} // end namespace
+
+#endif