XML. (was: Re: [ODE] outputting objects)

Nate W coding at natew.com
Tue Feb 4 13:42:01 2003


On Tue, 4 Feb 2003, Martin C. Martin wrote:

> Yeah, what you really need is a parser generator, like yacc, but that
> generates top down parsers.  Recursive descent parsers would be ideal.

I sorta built a recursive descent parser atop the SAX API.  The
start-element, data, and end-element callbacks are included below.  I'm
really not happy with the speed though.  It could be that I have a
throw/catch for every tag, it could be that there are typically several
string compares per tag.  

I think a big speed increase would be realized by loading the XML data
into memory, hashing each tag to a 32-bit int, and then using int compares
rather than string compares.

Then use some recursive descent through the in-memory version of the data,
but have the object deserialization methods look at the hash-and-data
pairs at the current "descent level" and do reflection there.  More later,
when I have time....


/****************************************************************************/
/** XmlStreamReader::vStartElement 
 **
 ** When a tag opens, the reader asks the object at the top of the stack how 
 ** to handle the data within the tag.  The object at the top of the stack 
 ** pay provide a new XmlStreamable object, or may provide a pointer to a 
 ** primitive data type whose value is to be set by the tag's data.
****************************************************************************/
void StreamReader::vStartElement (const wchar_t *pwchLocalName)
{
	Serializable *pCurrentObject = 0;

	// If the new tag matches the object at the top of the stack, push
	if (!wcscmp (pwchLocalName, m_wszTopTag))
	{
		m_ObjectStack.push (m_pTopObject);
		return;
	}

	// The new data will be taken by the object at the top of the stack
	if (m_ObjectStack.size () > 0)
		pCurrentObject = m_ObjectStack.top ();

	// If there is no current object, something has gone horribly wrong
	if (!pCurrentObject)
		return;

	// Set the state of the stream
	m_Stream.m_eReadState = Stream::rsStartElement;
	m_Stream.m_wszCurrentTag = pwchLocalName;

	// This will be used to indicate whether the current tag was processed 
	bool fCaught = false;

	try
	{
		// Allow the object at the top of the stack to determine what to do next.
		// The object will throw something to indicate what is to be done.
		pCurrentObject->vSerialize (m_Stream);			
	}
	catch (Serializable *pObject)
	{
		// If a new XmlStreamable is thrown, put the new object atop the stack
		m_ObjectStack.push (pObject);
		fCaught = true;
	}
		
	// If a pointer to a primitive data type is thrown, the pointer and 
	// data type are stored for use in the vCharacters callback (see below).
	catch (int *pi)
	{
		m_PrimitivePointer.pInteger = pi;
		m_eCurrentPrimitiveDataType = pdtInteger;
		fCaught = true;
	}
	catch (unsigned int *pu)
	{
		// this 'unsigned' stuff is a bit of a hack, being cast to 'signed' for the moment
		m_PrimitivePointer.pInteger = (int*) pu;
		m_eCurrentPrimitiveDataType = pdtInteger;
		fCaught = true;
	}
	catch (void **pp)
	{
		m_PrimitivePointer.pPointer = pp;
		m_eCurrentPrimitiveDataType = pdtPointer;
		fCaught = true;
	}
	catch (std::string *pstr)
	{
		m_PrimitivePointer.pString = pstr;
		m_eCurrentPrimitiveDataType = pdtString;
		fCaught = true;
	}
	catch (real *pr)
	{
		m_PrimitivePointer.pReal = pr;
		m_eCurrentPrimitiveDataType = pdtReal;
		fCaught = true;
	}
	catch (bool *pf)
	{
		m_PrimitivePointer.pBoolean = pf;
		m_eCurrentPrimitiveDataType = pdtBoolean;
		fCaught = true;
	}

	// if nothing was caught, the tag wasn't recognized
	if (!fCaught)
	{
		// Not sure what's best course of action at this point.  Walk the 
		// stack of XmlStreamable objects and try to find a match?
	}
	else
	{
		// Uncomment this stuff to aid debugging
		// OutputDebugString ("Caught ");
		// OutputDebugStringW (pwchLocalName);
		// OutputDebugString ("\n");
	}
}


/****************************************************************************/
/** StreamReader::vCharacters
 **
 ** Parses the character data of the current tag according to the primitive
 ** type recorded by the previous call to vStartElement, and stores the
 ** result through the recorded pointer.  The expected type is reset to
 ** pdtInvalid afterwards, so each recorded primitive is filled in at most
 ** once.  If no primitive is expected the data is ignored.
 **
 ** _pwchData   - the character data; it is not assumed to be
 **               null-terminated, a terminated copy is made locally
 ** iCharacters - number of wide characters in _pwchData (a negative count
 **               is treated as zero)
****************************************************************************/
void StreamReader::vCharacters (const wchar_t *_pwchData, int iCharacters)
{
	// Nobody is waiting for data: return before paying for the copy
	if (pdtInvalid == m_eCurrentPrimitiveDataType)
		return;

	// A negative count would turn into a huge allocation/copy below
	if (iCharacters < 0)
		iCharacters = 0;

	// Copy the data into a local, null-terminated buffer.  The size is
	// computed from sizeof (wchar_t) because wchar_t is not 2 bytes on
	// every platform.
	const size_t cchData = static_cast<size_t> (iCharacters);
	wchar_t *pwchData = (wchar_t*) alloca ((cchData + 1) * sizeof (wchar_t));
	memcpy (pwchData, _pwchData, cchData * sizeof (wchar_t));
	pwchData[cchData] = 0;

	// Switch on the type of primitive currently expected
	switch (m_eCurrentPrimitiveDataType)
	{
	case pdtInteger:
		*m_PrimitivePointer.pInteger = _wtoi (pwchData);
		break;

	case pdtUnsigned:
		// TODO: this should be handled with something like _wtou (pwchData)...
		// But, there is no "_wtou" as yet.  Unsigned ints are currently treated
		// as signed ints.  So far so good, but this really oughtta be fixed the
		// right way.
		*m_PrimitivePointer.pInteger = 0;
		break;

	case pdtPointer:
		// NOTE(review): only an int's worth of the pointer is written here;
		// presumably fine where pointers and ints have the same size - confirm
		*((int*)(m_PrimitivePointer.pPointer)) = _wtoi (pwchData);
		break;

	case pdtString:
		{
		// Convert the string to 8-bit format.  One wide character may need
		// up to MB_CUR_MAX bytes, so size the buffer for the worst case.
		const size_t cbMax = cchData * static_cast<size_t> (MB_CUR_MAX) + 1;
		char *szTemp = (char*) alloca (cbMax);
		size_t cbWritten = wcstombs (szTemp, pwchData, cbMax);

		// wcstombs returns (size_t)-1 when a character cannot be converted;
		// the buffer contents are then unreliable, so fall back to ""
		if (cbWritten == static_cast<size_t> (-1))
			cbWritten = 0;

		// null-terminate the 8-bit string (cbWritten is always < cbMax)
		szTemp[cbWritten] = 0;

		// String copy can be done via an external function, if the caller wishes to
		// manage the buffer that way.
		if (m_pfnStringAssign)
			m_pfnStringAssign (m_PrimitivePointer.pString, szTemp);
		else
			*m_PrimitivePointer.pString = szTemp;

		break;
		}

	case pdtReal:
		*m_PrimitivePointer.pReal = (real) _wtof (pwchData);
		break;

	case pdtBoolean:
		{
		// Only the first character matters; look at the wide character
		// directly so that no multibyte conversion (which can fail) is needed
		const wchar_t wchFirst = pwchData[0];

		if ((wchFirst == L't') || (wchFirst == L'T') || (wchFirst == L'y') || (wchFirst == L'Y') || (wchFirst == L'1'))
		{
			*m_PrimitivePointer.pBoolean = true;
		}

		if ((wchFirst == L'f') || (wchFirst == L'F') || (wchFirst == L'n') || (wchFirst == L'N') || (wchFirst == L'0'))
		{
			*m_PrimitivePointer.pBoolean = false;
		}

		// Anything else leaves the target untouched
		break;
		}

	case pdtInvalid:
	default:
		break;
	}

	// Reset the expected primitive data type to 'invalid'
	m_eCurrentPrimitiveDataType = pdtInvalid;
}

/****************************************************************************/
/** XmlStreamReader::vEndElement 
****************************************************************************/
void StreamReader::vEndElement (const wchar_t *pwchLocalName)
{
	Serializable *pCurrentObject = 0;

	// Get the top object from the stack
	if (m_ObjectStack.size () > 0)
		pCurrentObject = m_ObjectStack.top ();

	// If it matches the element that just ended, pop it from the stack
	if (pCurrentObject && pCurrentObject->wszGetActualTagName () && !wcscmp (pwchLocalName, pCurrentObject->wszGetActualTagName ()))
		m_ObjectStack.pop ();
}