SAX based XML parser

Dependents:   giken9_HTMLServer_Temp_Sample

Revision:
0:07919e3d6c56
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expatpp.h	Fri Apr 08 09:18:41 2011 +0000
@@ -0,0 +1,339 @@
+// expatpp
+#ifndef H_EXPATPP
+#define H_EXPATPP
+
+#ifdef EXPATPP_COMPATIBLE_EXPAT12 // earlier versions of expat up to v1.2 
+    #include "xmlparse.h"
+#else
+    #include "expat.h"  // since some version of expat moved to SourceForge
+#endif
+#include <stdio.h>
+#include <assert.h>
+
+
+/**
+\file expatpp.h
+Latest version 29-Dec-2002 compatible with expat 1.95.6
+*/
+
+/**
+expatpp follows a simple pattern for converting the semi-OOP callback design of 
+expat into a true class which allows you to override virtual methods to supply
+callbacks.
+
+\par USING expatpp
+see testexpatpp.cpp for a detailed example
+
+1) decide which callbacks you wish to use, eg: just startElement
+
+2) declare a subclass of expatpp, eg:
+class myExpat : public expatpp {
+    virtual void startElement(const XML_Char* name, const XML_Char** atts);
+};
+
+3) create an instance of your object and pass in a buffer to parse
+myExpat parser;
+parser.XML_Parse(buf, len, done)
+
+
+\par HOW IT WORKS
+The User Data which expat maintains is simply a pointer to an instance of your object.
+
+Inline static functions are specified as the callbacks to expat.
+These static functions take the user data parameter returned from expat and cast it
+to a pointer to an expatpp object.
+
+Using that typed pointer they then call the appropriate virtual method.
+
+If you have overriden a given virtual method then your version will be called, otherwise
+the (empty) method in the base expatpp class is called.
+
+\par Possible Efficiency Tactic
+For efficiency, you could provide your own constructor and set some of the callbacks
+to 0, so expat doesn't call the static functions. (untested idea).
+
+\par Naming Conventions
+The virtual functions violate the usual AD Software convention of lowercase first letter 
+for public methods but this was a late change to protected and too much user code out there.
+
+
+\todo Possibly implement some handling for XML_SetExternalEntityRefHandler which does NOT
+receive user data, just the parser, so can't use normal pattern for invoking virtual methods
+
+\todo Possibly implement handling for XML_UnknownEncodingHandler.
+
+\todo review design for nested calls - not happy that it is the right thing that they don't see
+their start and ending elements - makes it harder to unit test them in isolation.
+
+\todo unit tests
+
+\todo especially test abort mechanism
+
+\todo reinstate copy constrution and assignment with child parser cleanup
+
+\todo allow specification of encoding
+*/
+class expatpp {
+public:
+    expatpp(bool createParser=true);
+    virtual ~expatpp();
+
+    operator XML_Parser() const;
+    
+protected:  // callback virtuals should only be invoked through our Callback static functions
+    bool emptyCharData(const XML_Char* s, int len);  // utility often used in overridden charData
+
+// overrideable callbacks
+    virtual void startElement(const XML_Char* name, const XML_Char** atts);
+    virtual void endElement(const XML_Char*);
+    virtual void charData(const XML_Char*, int len);
+    virtual void processingInstruction(const XML_Char* target, const XML_Char* data);
+    virtual void defaultHandler(const XML_Char*, int len);
+    virtual int notStandaloneHandler();
+    virtual void unparsedEntityDecl(const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName);
+    virtual void notationDecl(const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId);
+    virtual void startNamespace(const XML_Char* prefix, const XML_Char* uri);
+    virtual void endNamespace(const XML_Char*);
+/// \name Callbacks added to support expat 1.95.5
+//@{
+    virtual void attlistDecl( 
+        const XML_Char *elname,
+        const XML_Char *attname,
+        const XML_Char *att_type,
+        const XML_Char *dflt,
+        int             isrequired);    
+    virtual void endCdataSection();    
+    virtual void endDoctypeDecl();    
+    virtual void comment( const XML_Char *data);    
+    virtual void elementDecl( const XML_Char *name, XML_Content *model);    
+    virtual void entityDecl(
+        const XML_Char *entityName,
+        int is_parameter_entity,
+        const XML_Char *value,
+        int value_length,
+        const XML_Char *base,
+        const XML_Char *systemId,
+        const XML_Char *publicId,
+        const XML_Char *notationName);    
+    virtual void skippedEntity(const XML_Char *entityName, int is_parameter_entity);    
+    virtual void startCdataSection();
+    virtual void startDoctypeDecl(const XML_Char *doctypeName,
+        const XML_Char *sysid,
+        const XML_Char *pubid,
+        int has_internal_subset);    
+    virtual void xmlDecl( const XML_Char      *version,
+                                        const XML_Char      *encoding,
+                                        int                  standalone);
+//@}
+
+public:    
+/// \name XML interfaces
+//@{
+    XML_Status XML_Parse(const char* buffer, int len, int isFinal);
+    virtual XML_Status  parseFile(FILE* inFile);
+    virtual XML_Status  parseString(const char*);
+    XML_Error XML_GetErrorCode();
+    int XML_GetCurrentLineNumber();
+    int XML_GetCurrentColumnNumber();
+//@}
+    
+protected:
+    XML_Parser mParser;
+    bool mHaveParsed;
+    
+/// \name overrideables to customise behaviour, must call parent
+//@{
+    virtual void ReleaseParser();
+    virtual void ResetParser();
+    virtual void SetupHandlers();
+//@}
+
+/**
+    Override so subclass can react to an error causing exit from parse.
+    rather than leave it for application code to check status.
+    Useful point to insert logging to silently grab failed parses
+*/
+    virtual void CheckFinalStatus(XML_Status) {};
+        
+// static interface functions for callbacks
+public:
+    static void startElementCallback(void *userData, const XML_Char* name, const XML_Char** atts);
+    static void endElementCallback(void *userData, const XML_Char* name);
+    static void startNamespaceCallback(void *userData, const XML_Char* prefix, const XML_Char* uri);
+    static void endNamespaceCallback(void *userData, const XML_Char* prefix);
+    static void charDataCallback(void *userData, const XML_Char* s, int len);
+    static void processingInstructionCallback(void *userData, const XML_Char* target, const XML_Char* data);
+    static void defaultHandlerCallback(void* userData, const XML_Char* s, int len);
+    static int notStandaloneHandlerCallback(void* userData);    
+    static void unParsedEntityDeclCallback(void* userData, const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName);
+    static void notationDeclCallback(void *userData, const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId);
+/// \name Callback interfacess added to support expat 1.95.5
+//@{
+    static void attlistDeclCallback(void *userData,  
+        const XML_Char *elname,
+        const XML_Char *attname,
+        const XML_Char *att_type,
+        const XML_Char *dflt,
+        int             isrequired);    
+    static void commentCallback(void *userData,  const XML_Char *data);    
+    static void elementDeclCallback(void *userData,  const XML_Char *name, XML_Content *model);    
+    static void endCdataSectionCallback(void *userData);
+    static void endDoctypeDeclCallback(void *userData);    
+    static void entityDeclCallback(void *userData, 
+        const XML_Char *entityName,
+        int is_parameter_entity,
+        const XML_Char *value,
+        int value_length,
+        const XML_Char *base,
+        const XML_Char *systemId,
+        const XML_Char *publicId,
+        const XML_Char *notationName);    
+    static void skippedEntityCallback(void *userData,  const XML_Char *entityName, int is_parameter_entity);    
+    static void startCdataSectionCallback(void *userData);
+    static void startDoctypeDeclCallback(void *userData, 
+        const XML_Char *doctypeName,
+        const XML_Char *sysid,
+        const XML_Char *pubid,
+        int has_internal_subset);    
+    static void xmlDeclCallback(void *userData,  const XML_Char      *version,
+                                        const XML_Char      *encoding,
+                                        int                  standalone);
+//@}
+
+    
+// utilities
+    static int skipWhiteSpace(const XML_Char*);
+    static const XML_Char* getAttribute(const XML_Char *matchingName, const XML_Char **atts);
+    static bool getIntegerAttribute(const XML_Char *matchingName, const XML_Char **atts, int& outAtt);
+    static bool getDoubleAttribute(const XML_Char *matchingName, const XML_Char **atts, double& outAtt);
+};
+
+
+/**
+    subclass to support a hierarchy of parsers, in a sort of recursion or
+    'nesting' approach, where a top-level parser might create sub-parsers 
+    for part of a file.
+    
+    The currently active child parser is owned (mOwnedChild) and is deleted
+    by DeleteChild (invoked from the dtor) so error handling can propagate 
+    up the tree, closing parsers, without leaks.
+
+    \par Switching to sub-parsers
+    You can transfer to a sub-parser with
+    - new UserChildParser(this)  // carries on using our parser, is self-deleting
+    - switchToNewSubParser( someVar = new UserChildParser(this) )  // if want to get values back after end parsing
+
+    \warning You can accidentally invoke a new parser without it doing anything
+    - new UserChildParser()  // will be new top-level parser, nothing to do with our XML
+    
+    \par Self-deletion
+    If you transfer control to a sub-parser with just new UserChildParser(this) then
+    it will be automatically self-deleting in its returnToParent method and
+    will invoke OwnedChildOrphansItself to clear our mOwnedChild.
+    
+    The reason for self-deletion being governed by a somewhat complex chain of
+    calls rather than simply a boolean flag is because expatpp has been in use
+    worldwide for many years and it was deemed too unfriendly to break code in
+    a manner which could cause unwanted side effects - the current approach safely
+    preserves self-deletion but also allows for expatpp to have parent parsers
+    own and delete children, without compiling with different options.
+    
+    \note 
+    If you invoke a sub-parser with switchToNewSubParser( new UserChildParser() );
+    then the user child parser will start with a new XML parser instance
+    created by the expatpp ctor. This is safe but slightly wasteful of processing 
+    as the new parser will be discarded by BeAdopted().
+
+    \par Switching to child    and explicitly deleting
+    switchToNewSubParser( somevar = new UserChildParser(this) ) allows you to get values
+    back out of the child parser, in the context of the parent, eg:
+    
+\verbatim
+
+void MultiFilterParser::startElement(const XML_Char* name, const XML_Char **atts)
+{
+    if(strcmp(name,"FilterRequest")==0) {
+        switchToNewSubParser( 
+            mCurrentFilterParser = new FilterRequestParser(this, atts) 
+        );  // we own and will have to explicitly delete 
+...
+}
+        
+void MultiFilterParser::endElement(const XML_Char *name)
+{
+    if(strcmp(name,"FilterRequest")==0) {
+        assert(mCurrentFilterParser);
+        FilterClause* newClause = mCurrentFilterParser->orphanBuiltClause();  // retrieve data built by sub-parser
+...
+        mCurrentFilterParser = 0;
+        DeleteChild();
+    }
+}
+\endverbatim
+*/
+class expatppNesting : public expatpp {
+
+public:
+    expatppNesting(expatppNesting* parent=0);  ///< NOT a copy ctor!! this is a recursive situation
+    virtual ~expatppNesting();
+    
+    void switchToNewSubParser( expatppNesting* pAdoptedChild );
+    expatppNesting* returnToParent();
+
+protected:
+    void BeAdopted(expatppNesting* adoptingParent);
+    void OwnedChildOrphansItself(expatppNesting* callingChild);
+    void RegisterWithParentXMLParser();
+    virtual void AdoptChild(expatppNesting* adoptingChild);
+    virtual void DeleteChild();
+    
+    int    mDepth;    
+    bool mSelfDeleting;   ///< only valid if mParent not null
+    expatppNesting* mParent; ///< may be null the parent owns this object
+    expatppNesting* mOwnedChild;    ///< owned, optional currently active child (auto_ptr not used to avoid STL dependency)
+
+public:
+/// \name interface functions for callbacks
+//@{
+    static void nestedStartElementCallback(void* userData, const XML_Char* name, const XML_Char** atts);
+    static void nestedEndElementCallback(void* userData, const XML_Char* name);
+//@}
+
+
+/// \name overrideables to customise behaviour, must call parent
+//@{
+    virtual void SetupHandlers();
+//@}
+
+private:
+    // Forbid copy-construction and assignment, to prevent double-deletion of mOwnedChild
+                        expatppNesting( const expatppNesting & );
+    expatppNesting &    operator=( const expatppNesting & );
+};
+
+
+// inlines
+
+// -------------------------------------------------------
+//      e x p a t p p
+// -------------------------------------------------------
+inline     
+expatpp::operator XML_Parser() const
+{
+    return mParser;
+}
+
+
+// -------------------------------------------------------
+//      e x p a t p p N e s t i n g
+// -------------------------------------------------------
+inline void
+expatppNesting::OwnedChildOrphansItself(expatppNesting* callingChild)
+{
+    assert(callingChild==mOwnedChild);
+    mOwnedChild = 0;
+}
+
+
+    
+#endif   // H_EXPATPP