SAX based XML parser

Dependents:   giken9_HTMLServer_Temp_Sample

expatpp.h

Committer:
andrewbonney
Date:
2011-05-26
Revision:
1:e96b2af301dd
Parent:
0:07919e3d6c56

File content as of revision 1:e96b2af301dd:

// expatpp
#ifndef H_EXPATPP
#define H_EXPATPP

#ifdef EXPATPP_COMPATIBLE_EXPAT12 // earlier versions of expat up to v1.2 
    #include "xmlparse.h"
#else
    #include "expat.h"  // since some version of expat moved to SourceForge
#endif
#include <stdio.h>
#include <assert.h>


/**
\file expatpp.h
Latest version 29-Dec-2002 compatible with expat 1.95.6
*/

/**
expatpp follows a simple pattern for converting the semi-OOP callback design of 
expat into a true class which allows you to override virtual methods to supply
callbacks.

\par USING expatpp
see testexpatpp.cpp for a detailed example

1) decide which callbacks you wish to use, eg: just startElement

2) declare a subclass of expatpp, eg:
class myExpat : public expatpp {
    virtual void startElement(const XML_Char* name, const XML_Char** atts);
};

3) create an instance of your object and pass in a buffer to parse
myExpat parser;
parser.XML_Parse(buf, len, done);


\par HOW IT WORKS
The User Data which expat maintains is simply a pointer to an instance of your object.

Inline static functions are specified as the callbacks to expat.
These static functions take the user data parameter returned from expat and cast it
to a pointer to an expatpp object.

Using that typed pointer they then call the appropriate virtual method.

If you have overridden a given virtual method then your version will be called, otherwise
the (empty) method in the base expatpp class is called.

\par Possible Efficiency Tactic
For efficiency, you could provide your own constructor and set some of the callbacks
to 0, so expat doesn't call the static functions. (untested idea).

\par Naming Conventions
The virtual functions violate the usual AD Software convention that a lowercase first letter
marks a public method: they were changed to protected late, and too much user code already
depends on their names for them to be renamed.


\todo Possibly implement some handling for XML_SetExternalEntityRefHandler which does NOT
receive user data, just the parser, so can't use normal pattern for invoking virtual methods

\todo Possibly implement handling for XML_UnknownEncodingHandler.

\todo review design for nested calls - not happy that it is the right thing that they don't see
their start and ending elements - makes it harder to unit test them in isolation.

\todo unit tests

\todo especially test abort mechanism

\todo reinstate copy construction and assignment with child parser cleanup

\todo allow specification of encoding
*/
class expatpp {
public:
    expatpp(bool createParser=true);
    virtual ~expatpp();

    operator XML_Parser() const;
    
protected:  // callback virtuals should only be invoked through our Callback static functions
    bool emptyCharData(const XML_Char* s, int len);  // utility often used in overridden charData

// overrideable callbacks
    virtual void startElement(const XML_Char* name, const XML_Char** atts);
    virtual void endElement(const XML_Char*);
    virtual void charData(const XML_Char*, int len);
    virtual void processingInstruction(const XML_Char* target, const XML_Char* data);
    virtual void defaultHandler(const XML_Char*, int len);
    virtual int notStandaloneHandler();
    virtual void unparsedEntityDecl(const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName);
    virtual void notationDecl(const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId);
    virtual void startNamespace(const XML_Char* prefix, const XML_Char* uri);
    virtual void endNamespace(const XML_Char*);
/// \name Callbacks added to support expat 1.95.5
//@{
    virtual void attlistDecl( 
        const XML_Char *elname,
        const XML_Char *attname,
        const XML_Char *att_type,
        const XML_Char *dflt,
        int             isrequired);    
    virtual void endCdataSection();    
    virtual void endDoctypeDecl();    
    virtual void comment( const XML_Char *data);    
    virtual void elementDecl( const XML_Char *name, XML_Content *model);    
    virtual void entityDecl(
        const XML_Char *entityName,
        int is_parameter_entity,
        const XML_Char *value,
        int value_length,
        const XML_Char *base,
        const XML_Char *systemId,
        const XML_Char *publicId,
        const XML_Char *notationName);    
    virtual void skippedEntity(const XML_Char *entityName, int is_parameter_entity);    
    virtual void startCdataSection();
    virtual void startDoctypeDecl(const XML_Char *doctypeName,
        const XML_Char *sysid,
        const XML_Char *pubid,
        int has_internal_subset);    
    virtual void xmlDecl( const XML_Char      *version,
                                        const XML_Char      *encoding,
                                        int                  standalone);
//@}

public:    
/// \name XML interfaces
//@{
    XML_Status XML_Parse(const char* buffer, int len, int isFinal);
    virtual XML_Status  parseFile(FILE* inFile);
    virtual XML_Status  parseString(const char*);
    XML_Error XML_GetErrorCode();
    int XML_GetCurrentLineNumber();
    int XML_GetCurrentColumnNumber();
//@}
    
protected:
    XML_Parser mParser;
    bool mHaveParsed;
    
/// \name overrideables to customise behaviour, must call parent
//@{
    virtual void ReleaseParser();
    virtual void ResetParser();
    virtual void SetupHandlers();
//@}

/**
    Override so subclass can react to an error causing exit from parse.
    rather than leave it for application code to check status.
    Useful point to insert logging to silently grab failed parses
*/
    virtual void CheckFinalStatus(XML_Status) {};
        
// static interface functions for callbacks
public:
    static void startElementCallback(void *userData, const XML_Char* name, const XML_Char** atts);
    static void endElementCallback(void *userData, const XML_Char* name);
    static void startNamespaceCallback(void *userData, const XML_Char* prefix, const XML_Char* uri);
    static void endNamespaceCallback(void *userData, const XML_Char* prefix);
    static void charDataCallback(void *userData, const XML_Char* s, int len);
    static void processingInstructionCallback(void *userData, const XML_Char* target, const XML_Char* data);
    static void defaultHandlerCallback(void* userData, const XML_Char* s, int len);
    static int notStandaloneHandlerCallback(void* userData);    
    static void unParsedEntityDeclCallback(void* userData, const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName);
    static void notationDeclCallback(void *userData, const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId);
/// \name Callback interfacess added to support expat 1.95.5
//@{
    static void attlistDeclCallback(void *userData,  
        const XML_Char *elname,
        const XML_Char *attname,
        const XML_Char *att_type,
        const XML_Char *dflt,
        int             isrequired);    
    static void commentCallback(void *userData,  const XML_Char *data);    
    static void elementDeclCallback(void *userData,  const XML_Char *name, XML_Content *model);    
    static void endCdataSectionCallback(void *userData);
    static void endDoctypeDeclCallback(void *userData);    
    static void entityDeclCallback(void *userData, 
        const XML_Char *entityName,
        int is_parameter_entity,
        const XML_Char *value,
        int value_length,
        const XML_Char *base,
        const XML_Char *systemId,
        const XML_Char *publicId,
        const XML_Char *notationName);    
    static void skippedEntityCallback(void *userData,  const XML_Char *entityName, int is_parameter_entity);    
    static void startCdataSectionCallback(void *userData);
    static void startDoctypeDeclCallback(void *userData, 
        const XML_Char *doctypeName,
        const XML_Char *sysid,
        const XML_Char *pubid,
        int has_internal_subset);    
    static void xmlDeclCallback(void *userData,  const XML_Char      *version,
                                        const XML_Char      *encoding,
                                        int                  standalone);
//@}

    
// utilities
    static int skipWhiteSpace(const XML_Char*);
    static const XML_Char* getAttribute(const XML_Char *matchingName, const XML_Char **atts);
    static bool getIntegerAttribute(const XML_Char *matchingName, const XML_Char **atts, int& outAtt);
    static bool getDoubleAttribute(const XML_Char *matchingName, const XML_Char **atts, double& outAtt);
};


/**
    subclass to support a hierarchy of parsers, in a sort of recursion or
    'nesting' approach, where a top-level parser might create sub-parsers 
    for part of a file.
    
    The currently active child parser is owned (mOwnedChild) and is deleted
    by DeleteChild (invoked from the dtor) so error handling can propagate 
    up the tree, closing parsers, without leaks.

    \par Switching to sub-parsers
    You can transfer to a sub-parser with
    - new UserChildParser(this)  // carries on using our parser, is self-deleting
    - switchToNewSubParser( someVar = new UserChildParser(this) )  // if want to get values back after end parsing

    \warning You can accidentally invoke a new parser without it doing anything
    - new UserChildParser()  // will be new top-level parser, nothing to do with our XML
    
    \par Self-deletion
    If you transfer control to a sub-parser with just new UserChildParser(this) then
    it will be automatically self-deleting in its returnToParent method and
    will invoke OwnedChildOrphansItself to clear our mOwnedChild.
    
    The reason for self-deletion being governed by a somewhat complex chain of
    calls rather than simply a boolean flag is because expatpp has been in use
    worldwide for many years and it was deemed too unfriendly to break code in
    a manner which could cause unwanted side effects - the current approach safely
    preserves self-deletion but also allows for expatpp to have parent parsers
    own and delete children, without compiling with different options.
    
    \note 
    If you invoke a sub-parser with switchToNewSubParser( new UserChildParser() );
    then the user child parser will start with a new XML parser instance
    created by the expatpp ctor. This is safe but slightly wasteful of processing 
    as the new parser will be discarded by BeAdopted().

    \par Switching to child and explicitly deleting
    switchToNewSubParser( somevar = new UserChildParser(this) ) allows you to get values
    back out of the child parser, in the context of the parent, eg:
    
\verbatim

void MultiFilterParser::startElement(const XML_Char* name, const XML_Char **atts)
{
    if(strcmp(name,"FilterRequest")==0) {
        switchToNewSubParser( 
            mCurrentFilterParser = new FilterRequestParser(this, atts) 
        );  // we own and will have to explicitly delete 
...
}
        
void MultiFilterParser::endElement(const XML_Char *name)
{
    if(strcmp(name,"FilterRequest")==0) {
        assert(mCurrentFilterParser);
        FilterClause* newClause = mCurrentFilterParser->orphanBuiltClause();  // retrieve data built by sub-parser
...
        mCurrentFilterParser = 0;
        DeleteChild();
    }
}
\endverbatim
*/
class expatppNesting : public expatpp {
public:
    /// NOT a copy ctor!! The argument is the parent parser in a recursive
    /// (nested) parsing situation; it defaults to no parent.
    expatppNesting(expatppNesting* parent = 0);
    virtual ~expatppNesting();

    void switchToNewSubParser(expatppNesting* pAdoptedChild);
    expatppNesting* returnToParent();

protected:
    void BeAdopted(expatppNesting* adoptingParent);
    void OwnedChildOrphansItself(expatppNesting* callingChild);
    void RegisterWithParentXMLParser();
    virtual void AdoptChild(expatppNesting* adoptingChild);
    virtual void DeleteChild();

    int mDepth;                    ///< NOTE(review): presumably a nesting counter - maintained in the implementation file, confirm there
    bool mSelfDeleting;            ///< only valid if mParent is not null
    expatppNesting* mParent;       ///< may be null; NOTE(review): original comment adds "the parent owns this object" - confirm when that holds
    expatppNesting* mOwnedChild;   ///< owned, optional, currently active child (auto_ptr not used to avoid STL dependency)

public:
/// \name interface functions for callbacks
//@{
    static void nestedStartElementCallback(void* userData, const XML_Char* name, const XML_Char** atts);
    static void nestedEndElementCallback(void* userData, const XML_Char* name);
//@}

/// \name overrideables to customise behaviour, must call parent
//@{
    virtual void SetupHandlers();
//@}

private:
    // Copy-construction and assignment are declared private to forbid them:
    // a copy would lead to double-deletion of mOwnedChild.
    expatppNesting(const expatppNesting&);
    expatppNesting& operator=(const expatppNesting&);
};


// inlines

// -------------------------------------------------------
//      e x p a t p p
// -------------------------------------------------------
/// Conversion operator: yields the expat parser handle held in mParser, so an
/// expatpp object can be used where an XML_Parser is expected.
inline
expatpp::operator XML_Parser() const
{
    return this->mParser;
}


// -------------------------------------------------------
//      e x p a t p p N e s t i n g
// -------------------------------------------------------
/// Clears mOwnedChild on behalf of the child that is detaching itself.
/// \param callingChild the child making the call; debug builds assert that it
///        is the child currently recorded in mOwnedChild.
inline void
expatppNesting::OwnedChildOrphansItself(expatppNesting* callingChild)
{
    assert(mOwnedChild == callingChild);
    (void)callingChild;  // only inspected by the assert, which vanishes under NDEBUG
    mOwnedChild = 0;     // forget the child without deleting it
}


    
#endif   // H_EXPATPP