SAX based XML parser

Dependents:   giken9_HTMLServer_Temp_Sample

expatpp.cpp

Committer:
andrewbonney
Date:
2011-05-26
Revision:
1:e96b2af301dd
Parent:
0:07919e3d6c56

File content as of revision 1:e96b2af301dd:

// expatpp
#ifdef UNDER_CE
    #include <string.h>
    #include <windows.h>
    #include <dbgapi.h>
    #define assert ASSERT
#else
    #include <string>
    using namespace std;
    #include <assert.h>
#endif
#include "expatpp.h"

#pragma diag_suppress 1299

    
// may be defined in xmltchar.h or elsewhere
#ifndef tcscmp
    #ifdef XML_UNICODE
        #define tcscmp wcscmp
    #else
        #define tcscmp strcmp
    #endif  // XML_UNICODE
#endif  // tcscmp


#ifndef BUFSIZ
    #define BUFSIZ 256 //  Was 4096
#endif

/**
    Construct the wrapper, optionally creating the underlying expat parser.

    \param createParser when true a new expat parser is created and the
    handlers are registered on it; when false mParser stays null so that a
    subclass can supply a parser itself.
*/
expatpp::expatpp(bool createParser) :
    mParser(0),  // in case of exception below
    mHaveParsed(false)
{
    if (createParser) {
        // subclasses may call this ctor after parser created!
        mParser = XML_ParserCreate(0);
        // Creation can fail (null result); never register handlers on a
        // parser that does not exist.
        if (mParser)
            SetupHandlers();
    }
}


/**
    Register this object and all of its static callback trampolines on mParser.

    The object itself is stored as the parser's user data, which is what the
    static *Callback functions cast back to expatpp* in order to forward each
    event to the matching member function.
*/
void
expatpp::SetupHandlers()
{
    // user data must point at this object for the trampolines to work
    ::XML_SetUserData(mParser, this);
    ::XML_SetElementHandler(mParser, startElementCallback, endElementCallback);
    ::XML_SetCharacterDataHandler(mParser, charDataCallback);
    ::XML_SetProcessingInstructionHandler(mParser, processingInstructionCallback);
    ::XML_SetDefaultHandler(mParser, defaultHandlerCallback);
    ::XML_SetUnparsedEntityDeclHandler(mParser, unParsedEntityDeclCallback);
    ::XML_SetNotationDeclHandler(mParser, notationDeclCallback);
    ::XML_SetNotStandaloneHandler(mParser, notStandaloneHandlerCallback);
    ::XML_SetNamespaceDeclHandler(mParser, startNamespaceCallback, endNamespaceCallback);
#ifndef EXPATPP_COMPATIBLE_EXPAT12
    // handlers below are skipped when building in expat 1.2 compatibility mode
    ::XML_SetAttlistDeclHandler(mParser, attlistDeclCallback);
    ::XML_SetCdataSectionHandler(mParser, startCdataSectionCallback, endCdataSectionCallback);
    ::XML_SetCommentHandler(mParser, commentCallback);
    ::XML_SetDoctypeDeclHandler(mParser, startDoctypeDeclCallback, endDoctypeDeclCallback);
    ::XML_SetElementDeclHandler(mParser, elementDeclCallback);
    ::XML_SetEntityDeclHandler(mParser, entityDeclCallback);
    ::XML_SetSkippedEntityHandler(mParser, skippedEntityCallback);
    ::XML_SetXmlDeclHandler(mParser, xmlDeclCallback);          
#endif
}


/** Free the expat parser if this object still holds one. */
expatpp::~expatpp()
{
    // A null mParser lets subclasses avoid finishing parsing here
    // (e.g. a nested child that has handed the parser back).
    if (!mParser)
        return;
    ReleaseParser();
}


/**
    Provide single point that will call XML_ParserFree.
    Nothing else in this code should call XML_ParserFree!

    Safe to call when no parser is held: mParser may already be null, for
    instance on a child parser that has returned to its parent and is later
    adopted again. mParser is always null afterwards.
*/
void 
expatpp::ReleaseParser()
{
    if (mParser) {
        ::XML_ParserFree(mParser);
        mParser = 0;
    }
}



/**
    Return the parser to a pristine state so another document can be parsed.

    The reset is done by freeing the current expat parser and creating a new
    one (followed by SetupHandlers) instead of calling XML_ParserReset.
    Guarded against trivial reset before use in case that breaks
    expat or creates overhead: nothing happens unless XML_Parse has been
    called since construction or the previous reset.

    In EXPATPP_COMPATIBLE_EXPAT12 builds resetting is not supported and the
    function only asserts.

    \todo pass in encoding when we support encodings
*/
void 
expatpp::ResetParser()
{
#ifdef EXPATPP_COMPATIBLE_EXPAT12
    assert(!"Reset not available in earlier than expat 1.95.3");
#else
    if (mHaveParsed) {
        ReleaseParser();
        mParser = XML_ParserCreate(0);
        // creation can fail; only register handlers on a real parser
        if (mParser)
            SetupHandlers();
        mHaveParsed = false;
    }
#endif
}


/**
    Parse a whole file from its beginning, feeding it to the parser in
    BUFSIZ-sized chunks (modelled on the loop in expat's elements.c example).

    \param inFile open stream to read; it is rewound before reading
    \return XML_STATUS_ERROR when inFile is null, otherwise the first
    non-OK status reported by XML_Parse, or XML_STATUS_OK when every chunk
    was accepted
*/
XML_Status
expatpp::parseFile(FILE* inFile)
{
    ResetParser();

    if (!inFile)
        return XML_STATUS_ERROR;

    char chunk[BUFSIZ];
    fseek(inFile, 0, SEEK_SET); // reset for reading
    for (;;) {
        const size_t numRead = fread(chunk, 1, sizeof(chunk), inFile);
        // a short read marks the final chunk
        const int isLast = numRead < sizeof(chunk);
        const XML_Status chunkStatus = XML_Parse(chunk, numRead, isLast);
        if (chunkStatus != XML_STATUS_OK)
            return chunkStatus;
        if (isLast)
            break;
    }
    return XML_STATUS_OK;
}


/**
    Feed one buffer to the wrapped expat parser.

    Marks the parser as used (so ResetParser knows a reset is needed) and,
    for the final buffer, passes the outcome to CheckFinalStatus.

    \param s buffer of document text
    \param len number of bytes in s
    \param isFinal non-zero when this is the last piece of the document
    \return status returned by expat's ::XML_Parse
*/
XML_Status
expatpp::XML_Parse(const char *s, int len, int isFinal)
{
    mHaveParsed = true;
    const XML_Status parseResult = ::XML_Parse(mParser, s, len, isFinal);
    if (isFinal != 0) {
        CheckFinalStatus(parseResult);
    }
    return parseResult;
}


/** Return the error code of the wrapped parser, as reported by expat's ::XML_GetErrorCode. */
XML_Error
expatpp::XML_GetErrorCode()
{
    return ::XML_GetErrorCode(mParser);
}


/** Return the wrapped parser's current line number, converted to int. */
int
expatpp::XML_GetCurrentLineNumber()
{
    const int lineNumber = ::XML_GetCurrentLineNumber(mParser);
    return lineNumber;
}


/** Return the wrapped parser's current column number, converted to int. */
int
expatpp::XML_GetCurrentColumnNumber()
{
    const int columnNumber = ::XML_GetCurrentColumnNumber(mParser);
    return columnNumber;
}




/**
    Parse string which is assumed to be entire XML document.
    Written to stop stupid errors of being off by one in the string length causing 
    wasted debugging time, such as:
\verbatim    
    const char kSampleSettings[] = "<settings/>";
    const int sampleSize = sizeof(kSampleSettings)-1;  // unless you remember to subtract one here will get invalid token error
    if (!parser.XML_Parse(kSampleSettings, sampleSize, 1)) {
\endverbatim    

    \param inString null-terminated document text
    \return XML_STATUS_ERROR when inString is null (the parser is left
    untouched in that case), otherwise the status of parsing the string as
    a single, final buffer
*/
XML_Status
expatpp::parseString(const char* inString)
{
    if (!inString)
        return XML_STATUS_ERROR;  // strlen on a null pointer is undefined
    ResetParser();
    const int inLen = static_cast<int>(strlen(inString));
    return XML_Parse(inString, inLen, 1);
}

void 
expatpp::startElementCallback(void *userData, const XML_Char* name, const XML_Char** atts)
{
    ((expatpp*)userData)->startElement(name, atts);
}


void 
expatpp::endElementCallback(void *userData, const XML_Char* name)
{
    ((expatpp*)userData)->endElement(name);
}


void 
expatpp::startNamespaceCallback(void *userData, const XML_Char* prefix, const XML_Char* uri)
{
    ((expatpp*)userData)->startNamespace(prefix, uri);
}


void 
expatpp::endNamespaceCallback(void *userData, const XML_Char* prefix)
{
    ((expatpp*)userData)->endNamespace(prefix);
}


void 
expatpp::charDataCallback(void *userData, const XML_Char* s, int len)
{
    ((expatpp*)userData)->charData(s, len);
}


void
expatpp:: processingInstructionCallback(void *userData, const XML_Char* target, const XML_Char* data)
{
    ((expatpp*)userData)->processingInstruction(target, data);
}


void
expatpp::defaultHandlerCallback(void* userData, const XML_Char* s, int len)
{
    ((expatpp*)userData)->defaultHandler(s, len);
}


int
expatpp::notStandaloneHandlerCallback(void* userData)
{
    return ((expatpp*)userData)->notStandaloneHandler();
}


void
expatpp::unParsedEntityDeclCallback(void* userData, const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName)
{
    ((expatpp*)userData)->unparsedEntityDecl(entityName, base, systemId, publicId, notationName);
}


void
expatpp::notationDeclCallback(void *userData, const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId)
{
    ((expatpp*)userData)->notationDecl(notationName, base, systemId, publicId);
}


/** Default element-start handler: ignores the name and attributes and does nothing. */
void 
expatpp::startElement(const XML_Char*, const XML_Char**)
{}


/** Default element-end handler: ignores the name and does nothing. */
void 
expatpp::endElement(const XML_Char*)
{}


/** Default namespace-start handler: ignores the prefix and URI and does nothing. */
void 
expatpp::startNamespace(const XML_Char* /* prefix */, const XML_Char* /* uri */)
{}


/** Default namespace-end handler: ignores the prefix and does nothing. */
void 
expatpp::endNamespace(const XML_Char*)
{}


/** Default character-data handler: ignores the text and its length and does nothing. */
void 
expatpp::charData(const XML_Char*, int )
{
}


/** Default processing-instruction handler: ignores target and data and does nothing. */
void
expatpp::processingInstruction(const XML_Char*, const XML_Char*)
{
}


/** Default handler for otherwise unhandled data: ignores the text and does nothing. */
void
expatpp::defaultHandler(const XML_Char*, int)
{
}


/**
    Default not-standalone handler: always returns 0.
    NOTE(review): per the expat documentation a zero return from this handler
    makes the parser report a not-standalone error - confirm this is the
    intended default.
*/
int
expatpp::notStandaloneHandler()
{
    return 0;
}


/** Default unparsed-entity-declaration handler: ignores all five strings and does nothing. */
void
expatpp::unparsedEntityDecl(const XML_Char*, const XML_Char*, const XML_Char*, const XML_Char*, const XML_Char*)
{
}


/** Default notation-declaration handler: ignores all four strings and does nothing. */
void
expatpp::notationDecl(const XML_Char*, const XML_Char*, const XML_Char*, const XML_Char*)
{
}


int 
expatpp::skipWhiteSpace(const XML_Char* startFrom)
{
    // use our own XML definition of white space
    // TO DO - confirm this is correct!
    const XML_Char* s = startFrom;
    XML_Char c = *s;
    while ((c==' ') || (c=='\t') || (c=='\n') || (c=='\r')) {
        s++;
        c = *s;
    }
    const int numSkipped = s - startFrom;
    return numSkipped;
}


/**
    Iterate the paired attribute name/value until find a pair with matching name.

    The array is walked two entries at a time (name, value) and the walk stops
    at the first null entry, including a name whose value is missing, so the
    scan never runs past the terminator.

    \param matchingName attribute name to look for; null yields "not found"
    \param atts null-terminated array of alternating names and values;
    null yields "not found"
    \return pointer to the value or null if not found.
*/
const XML_Char* 
expatpp::getAttribute(const XML_Char* matchingName, const XML_Char** atts)
{
    if (!matchingName || !atts)
        return 0;

    for (int i = 0; atts[i]; i += 2) {
        if (tcscmp(atts[i], matchingName) == 0)
            return atts[i+1];  // if 2nd item was missing, this returns 0 safely indicating failure
        if (!atts[i+1])
            break;  // name without a value: atts[i+1] is the terminator, stop here
    }
    return 0;
}


/**
    Look up an attribute and read its value as a decimal integer.

    \param matchingName attribute name to find
    \param atts name/value array as given to startElement
    \param outAtt receives the parsed value, or 0 when the text does not
    start with an integer; left untouched when the attribute is absent
    \return true when the attribute exists (even if its text did not parse),
    false when it is absent

    \note No implementation exists for XML_UNICODE builds; compiling with
    XML_UNICODE defined stops with an explicit error.
    \bug will always return 0 for PPC (legacy note, not verified here)
*/
bool 
expatpp::getIntegerAttribute(const XML_Char *matchingName, const XML_Char **atts, int& outAtt)
{
    const XML_Char* attStr = getAttribute(matchingName, atts);
    if (!attStr)
        return false;
    int i=0;
#ifdef XML_UNICODE
    #error "expatpp::getIntegerAttribute has no XML_UNICODE implementation"
#else
    sscanf(attStr, "%d", &i);  // on a failed conversion i keeps its 0 default
#endif
    outAtt = i;
    return true;
}


/**
    Look up an attribute and read its value as a floating-point number.

    The text is converted directly into a double ("%lf"), so the result
    carries full double precision rather than being rounded through a float.

    \param matchingName attribute name to find
    \param atts name/value array as given to startElement
    \param outAtt receives the parsed value, or 0.0 when the text does not
    start with a number; left untouched when the attribute is absent
    \return true when the attribute exists (even if its text did not parse),
    false when it is absent

    \note No implementation exists for XML_UNICODE builds; compiling with
    XML_UNICODE defined stops with an explicit error.
    \bug will always return 0 for PPC (legacy note, not verified here)
*/
bool 
expatpp::getDoubleAttribute(const XML_Char *matchingName, const XML_Char **atts, double& outAtt)
{
    const XML_Char* attStr = getAttribute(matchingName, atts);
    if (!attStr)
        return false;
    double d = 0.0;
#ifdef XML_UNICODE
    #error "expatpp::getDoubleAttribute has no XML_UNICODE implementation"
#else
    sscanf(attStr, "%lf", &d);  // on a failed conversion d keeps its 0.0 default
#endif
    outAtt = d;
    return true;
}


/**
    Decide whether a run of character data carries no content.
    Usually called from the top of overridden charData methods.

    Treated as empty: a zero-length run, a lone CR or LF, a CRLF pair,
    or a run made up entirely of space characters.

    \param s character data (not null-terminated)
    \param len number of characters in s
    \return true when the run is empty by the rules above
*/
bool 
expatpp::emptyCharData(const XML_Char *s, int len)
{
    if (len==0)
        return true;  // empty string, may never occur??

    // a single newline in any of its spellings
    if ((len==1) && ((s[0]=='\n') || (s[0]=='\r')))
        return true;  // just CR or just LF
    if ((len==2) && (s[0]=='\r') && (s[1]=='\n'))
        return true;  // DOS-style CRLF

    // anything other than a space means there is real content
    for (int pos=0; pos<len; ++pos) {
        if (s[pos]!=' ')
            return false;
    }
    return true;  // all spaces
}


//-------- Added for expat 1.95.5---------------
void
expatpp::attlistDeclCallback(void *userData, 
    const XML_Char *elname,
    const XML_Char *attname,
    const XML_Char *att_type,
    const XML_Char *dflt,
    int             isrequired)
{
    ((expatpp*)userData)->attlistDecl(elname, attname, att_type, dflt, isrequired);
}


void
expatpp::commentCallback(void *userData, const XML_Char *data)
{
    ((expatpp*)userData)->comment(data);
}


void
expatpp::elementDeclCallback(void *userData, const XML_Char *name, XML_Content *model)
{
    ((expatpp*)userData)->elementDecl(name, model);
}


void
expatpp::endCdataSectionCallback(void *userData)
{
    ((expatpp*)userData)->endCdataSection();
}


void
expatpp::endDoctypeDeclCallback(void *userData)
{
    ((expatpp*)userData)->endDoctypeDecl();
}


void
expatpp::entityDeclCallback(void *userData,
    const XML_Char *entityName,
    int is_parameter_entity,
    const XML_Char *value,
    int value_length,
    const XML_Char *base,
    const XML_Char *systemId,
    const XML_Char *publicId,
    const XML_Char *notationName)
{
    ((expatpp*)userData)->entityDecl(entityName, is_parameter_entity, value, value_length, base, systemId, publicId, notationName);
}


void
expatpp::skippedEntityCallback(void *userData, const XML_Char *entityName, int is_parameter_entity)
{
    ((expatpp*)userData)->skippedEntity(entityName, is_parameter_entity);
}


void
expatpp::startCdataSectionCallback(void *userData)
{
    ((expatpp*)userData)->startCdataSection();
}


void
expatpp::startDoctypeDeclCallback(void *userData, 
        const XML_Char *doctypeName,
        const XML_Char *sysid,
        const XML_Char *pubid,
        int has_internal_subset)
{
    ((expatpp*)userData)->startDoctypeDecl(doctypeName, sysid, pubid, has_internal_subset);
}


void
expatpp::xmlDeclCallback(void *userData, const XML_Char      *version,
                                    const XML_Char      *encoding,
                                    int                  standalone)
{
    ((expatpp*)userData)->xmlDecl(version, encoding, standalone);
}


/** Default attribute-list-declaration handler: ignores every argument and does nothing. */
void
expatpp::attlistDecl(
    const XML_Char* /* elname */,
    const XML_Char* /* attname */,
    const XML_Char* /* att_type */,
    const XML_Char* /* dflt */,
    int             /* isrequired */)
{
}


/** Default comment handler: ignores the comment text and does nothing. */
void
expatpp::comment(const XML_Char* /* data */)
{
}


/** Default element-declaration handler: ignores name and content model and does nothing. */
void
expatpp::elementDecl(const XML_Char* /* name */, XML_Content* /* model */)
{
}


/** Default CDATA-section-end handler: does nothing. */
void 
expatpp::endCdataSection()
{
}


/** Default DOCTYPE-declaration-end handler: does nothing. */
void
expatpp::endDoctypeDecl()
{
}


/** Default entity-declaration handler: ignores every argument and does nothing. */
void
expatpp::entityDecl(
    const XML_Char* /* entityName */,
    int             /* is_parameter_entity */,
    const XML_Char* /* value */,
    int             /* value_length */,
    const XML_Char* /* base */,
    const XML_Char* /* systemId */,
    const XML_Char* /* publicId */,
    const XML_Char* /* notationName */)
{
}


/** Default skipped-entity handler: ignores both arguments and does nothing. */
void
expatpp::skippedEntity(const XML_Char* /* entityName */, int /* is_parameter_entity */)
{
}


/** Default CDATA-section-start handler: does nothing. */
void 
expatpp::startCdataSection()
{
}


/** Default DOCTYPE-declaration-start handler: ignores every argument and does nothing. */
void
expatpp::startDoctypeDecl(
    const XML_Char* /* doctypeName */,
    const XML_Char* /* sysid */,
    const XML_Char* /* pubid */,
    int             /* has_internal_subset */)
{
}


/** Default XML-declaration handler: ignores version, encoding and standalone flag and does nothing. */
void
expatpp::xmlDecl(
    const XML_Char* /* version */,
    const XML_Char* /* encoding */,
    int             /* standalone */)
{
}




// -------------------------------------------------------
//      e x p a t p p N e s t i n g
// -------------------------------------------------------
/**
    \param parent can be null in which case this is root parser
    
    \note The handlers set in here MUST be also set in SetupHandlers
    which is a virtual method invoked by expatpp::ResetParser. Otherwise
    you can have subtle bugs with a nested parser not properly returning
    after reusing a parser (nasty and found rapidly only via extensive unit
    tests and plentiful assertions!).
    
    \warning
    The assumption that is not obvious here is that if you want to use 
    nested parsers, then your topmost parser must also be an expatppNesting
    subclass, NOT an expatpp subclass, because we need the 
    nestedStartElementCallback and nestedEndElementCallback
    callbacks to override those in the expatpp ctor.
    
    
    
    \todo go back over code in detail and confirm above warning still valid
    I think if we used expat's functions to invoke the registered callback
    might be safer - the explicit function call we have in nestedEndElementCallback
    certainly assumes the parent type.
*/
expatppNesting::expatppNesting(expatppNesting* parent) :
    expatpp(parent==0),  // only a root (parent-less) parser creates its own expat parser
    mDepth(0),
    mParent(parent),
    mOwnedChild(0),
    mSelfDeleting(true)
{
    if ( !parent )
    {
        // Root: the expatpp constructor has created mParser; swap the plain
        // element callbacks for the depth-tracking nested ones.
        ::XML_SetElementHandler(mParser, nestedStartElementCallback, nestedEndElementCallback);
    }
    else
    {
        // Child: take over the parent's expat parser, then let the parent
        // record us as its owned child.
        RegisterWithParentXMLParser();
        parent->AdoptChild(this);
    }
    assert(mParser);  // set either by expatpp (root) or from the parent (child)
}


/**
    Destroy this parser together with any child it still owns.
    A sub-parser is expected to have gone through returnToParent (which
    clears mParent) before it is destroyed; the assert checks this.
*/
expatppNesting::~expatppNesting()
{
    assert(!mParent);  // if we are a sub-parser, should not delete without calling returnToParent
    DeleteChild();
}


/**
    Register the standard expatpp handlers, then replace the element
    handlers with the nesting-aware callbacks, the same override the
    constructor applies for a root parser.
*/
void
expatppNesting::SetupHandlers()
{
    expatpp::SetupHandlers();
    // must come after the base call, which installs the plain element callbacks
    ::XML_SetElementHandler(mParser, nestedStartElementCallback, nestedEndElementCallback);
}

/**
    Delete the currently owned child parser, if any, and forget it.
    Must use if you have adopted a child parser and want to dispose of it early.
*/
void
expatppNesting::DeleteChild()
{
    delete mOwnedChild;  // deleting a null pointer is a no-op
    mOwnedChild = 0;
}


/**
    Take ownership of a child parser, deleting any different child owned before.

    Invoked as a callback from a child ctor when we pass in a parent pointer,
    OR from switchToNewSubParser, in which case it may be the 2nd time
    we're called for a given child (see scenarios in expatppNesting class comment).
    Adopting the child that is already owned is therefore a no-op.

    \param adoptingChild child parser to own from now on
*/
void
expatppNesting::AdoptChild(expatppNesting* adoptingChild)
{
    if ( mOwnedChild == adoptingChild )
        return;  // already ours - must not delete it

    delete mOwnedChild;
    mOwnedChild = adoptingChild;
}


/**
    Start using the parent's underlying expat parser: copy the parent's
    mParser and make this object the parser's user data, so the static
    callbacks dispatch to this object from now on.
    Requires mParent to be non-null.
*/
void
expatppNesting::RegisterWithParentXMLParser()
{
    mParser = mParent->mParser;
    ::XML_SetUserData(mParser, this);
}


/**
    User code (typically the startElement handler of user parsers derived from expatppNesting) 
    may call 
        switchToNewSubParser( new UserChildParser() );
    to hand off the current document to a child parser that understands the next segment of XML.
    Control will be returned to the original (parent) parser when the end of the child element 
    is reached.
    In its lifetime a 'parent' parser may switch control to several child parsers (one at a time 
    of course) as it moves through the document encountering various types of child element.
    
    A child to which older code (eg: OOFILE) has just switched control by
    new childParser(this) will be self-deleting and will clear our mOwnedChild in its dtor. 
*/
void expatppNesting::switchToNewSubParser( expatppNesting* pAdoptedChild )
{
    assert(pAdoptedChild);
    // take ownership first (this deletes any different, previously owned child)...
    AdoptChild(pAdoptedChild);
    // ...then have the child attach itself to us and stop being self-deleting
    pAdoptedChild->BeAdopted(this);
}


/**
    Hand the shared expat parser back to the parent parser.

    The parser's user data is pointed at the parent again and this object
    drops both its parent link and its parser pointer. A self-deleting child
    additionally tells the parent it is leaving (OwnedChildOrphansItself)
    and then deletes itself.

    If this is root parser, nestedEndElementCallback won't call returnToParent.
    Therefore it is safe to put parsers on the stack.

    \return the former parent (null if this object had none)
    \note when mSelfDeleting is set the parent pointer is dereferenced
    without a null check, so this must not be called on a parent-less
    self-deleting parser.
*/
expatppNesting* 
expatppNesting::returnToParent()
{
    expatppNesting* ret = mParent;  // saved now: members are cleared (and this may be deleted) below
    ::XML_SetUserData(mParser, mParent);
    mParent=0;
    mParser=0;  // prevent parser shutdown by expatpp::~expatpp!!
    if (mSelfDeleting) {
        ret->OwnedChildOrphansItself(this);
        delete this;  // MUST BE LAST THING CALLED IN NON-VIRTUAL FUNCTION, NO MEMBER ACCESS
    }
    return ret;
}


void 
expatppNesting::nestedStartElementCallback(void *userData, const XML_Char* name, const XML_Char** atts)
{
    assert(userData);
    expatppNesting* nestedParser = (expatppNesting*)userData;
    nestedParser->mDepth++;
    nestedParser->startElement(name, atts);  // probably user override
}


/**
    If this is root parser, will never hit nestedEndElementCallback after closing element,
    except for when we call it.
    \param userData should be non-nil except for specific case of ending root
*/
void 
expatppNesting::nestedEndElementCallback(void *userData, const XML_Char* name)
{
    if (!userData)
        return;  //  end tag for root
        
    expatppNesting* nestedParser = (expatppNesting*)userData;
// we don't know until we hit a closing tag 'outside' us that our run is done     
    if (nestedParser->mDepth==0) {
        expatppNesting* parentParser = nestedParser->returnToParent();
        nestedEndElementCallback(parentParser, name);   // callbacks for expatppNesting stay registered, so safe 
        //if we don't invoke their callback, they will not balance their mDepth        
    }
    else {
    // end of an element this parser has started - normal case
        nestedParser->endElement(name);  // probably user override
        nestedParser->mDepth--;
    }
}


/**
    Called by switchToNewSubParser to indicate a newly created child parser
    is now the currently active child for adoptingParent and the child
    isn't expected to be self deleting.
    
    Normal code to create an owned child would be either
        switchToNewSubParser( new UserChildParser(this) );
    where this is the currently active parser and you want to be deleting it, or
        new UserChildParser(this);
    to have a child parser self-delete
    
    \par Important Safety Note
    Copes with the situation of people forgetting to pass 
    in the parent parser (and hence creating a new one by default)
    if invoked by switchToNewSubParser( new UserChildParser() )
    by somewhat wastefully deleting the parser created in expatpp::expatpp
    by us being a root parser.    
*/
void
expatppNesting::BeAdopted(expatppNesting* adoptingParent)
{
    if (!mParent) {
        // Constructed without a parent, so we own a parser of our own:
        // free it and attach to the adopting parent's parser instead.
        ReleaseParser();
        mParent = adoptingParent;
        RegisterWithParentXMLParser();
    }
    else {
        // Already linked by our constructor; it must be to the same parent.
        assert(mParent==adoptingParent);
    }
    mSelfDeleting = false;  // the adopting parent now controls our lifetime
}