XML Parsing

This talk provides a quick overview of XML parsing using Java.

1 Parsing

2 Simple API for XML

2.1 SAX Sample Application

import java.io.FileReader;

import org.xml.sax.XMLReader;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.XMLReaderFactory;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Sample SAX application: parses every file named on the command line
 * and prints each SAX event it receives.
 *
 * Our handler must extend DefaultHandler. The same instance is
 * registered as both the content handler and the error handler.
 */
public class MySAXApp extends DefaultHandler
{

  /**
   * Parse each file given on the command line, reporting the SAX
   * events to standard output.
   *
   * @param args The paths of the XML files to parse.
   * @throws Exception If a file cannot be opened or parsed.
   */
  public static void main (String args[])
    throws Exception
  {
    //Create an instance of the XML parser.
    XMLReader xr = XMLReaderFactory.createXMLReader();

    //Create an instance of our handler (see below)
    MySAXApp handler = new MySAXApp();
    xr.setContentHandler(handler);
    xr.setErrorHandler(handler); //does double duty.

    // Parse each *file* provided on the
    // command line.
    for (int i = 0; i < args.length; i++) {
      FileReader r = new FileReader(args[i]);
      try {
        xr.parse(new InputSource(r));
      } finally {
        // Release the file handle even when parsing fails.
        r.close();
      }
    }
  }

  public MySAXApp ()
  {
    super();
  }


  ////////////////////////////////////////////////////////////////////
  // Event handlers.
  ////////////////////////////////////////////////////////////////////


  /** Receive notification of the beginning of the document. */
  public void startDocument ()
  {
    System.out.println("Start document");
  }


  /** Receive notification of the end of the document. */
  public void endDocument ()
  {
    System.out.println("End document");
  }

  /**
   * Receive notification at the beginning of an element.
   *
   * @param uri The Namespace URI, or the empty string if the element
   *   has no Namespace URI or if Namespace processing is not being
   *   performed.
   * @param name The local name (without prefix), or the empty string
   *   if Namespace processing is not being performed.
   * @param qName The qualified name (with prefix), or the empty
   *   string if qualified names are not available, in the form
   *   namespaceprefix:name.
   * @param atts The attributes attached to the element. If there are
   *   no attributes, it shall be an empty Attributes object.
   */
  public void startElement (String uri, String name,
                            String qName, Attributes atts)
  {
    if ("".equals (uri))
      System.out.println("Start element: " + qName);
    else
      System.out.println("Start element: {" + uri + "}" + name);
  }


  /**
   * Receive notification at the end of an element.
   *
   * @param uri The Namespace URI, or the empty string.
   * @param name The local name (without prefix), or the empty string.
   * @param qName The qualified name (with prefix), or the empty string.
   */
  public void endElement (String uri, String name, String qName)
  {
    if ("".equals (uri))
      System.out.println("End element: " + qName);
    else
      System.out.println("End element:   {" + uri + "}" + name);
  }

  /**
   * Receive notification of character data. The characters are printed
   * inside double quotes, with backslash, double quote, newline,
   * carriage return and tab shown as escape sequences.
   *
   * @param ch The characters from the XML document.
   * @param start The start position in the array.
   * @param length The number of characters to read from the array
   */
  public void characters (char ch[], int start, int length)
  {
    System.out.print("Characters:    \"");
    for (int i = start; i < start + length; i++) {
      switch (ch[i]) {
      case '\\':
        System.out.print("\\\\");
        break;
      case '"':
        System.out.print("\\\"");
        break;
      case '\n':
        System.out.print("\\n");
        break;
      case '\r':
        System.out.print("\\r");
        break;
      case '\t':
        System.out.print("\\t");
        break;
      default:
        System.out.print(ch[i]);
        break;
      }
    }
    System.out.print("\"\n");
  }

}

2.2 Application Output

<?xml version="1.0"?>

<poem xmlns="http://www.megginson.com/ns/exp/poetry">
  <title>Roses are Red</title>
  <l>Roses are red,</l>
  <l>Violets are blue;</l>
  <l>Sugar is sweet,</l>
  <l>And I love you.</l>
</poem>

3 Document Object Model

3.1 DOM Structure

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE orders SYSTEM "orders.dtd">
<orders>
  <order>
    <customerid limit="1000">12341</customerid>
    <status>pending</status>
    <item instock="Y" itemid="SA15">
      <name>Silver Show Saddle, 16 inch</name>
      <price>825.00</price>
      <qty>1</qty>
    </item>
    <item instock="N" itemid="C49">
      <name>Premium Cinch</name>
      <price>49.00</price>
      <qty>1</qty>
    </item>
  </order>
  <order>
    <customerid limit="150">251222</customerid>
    <status>pending</status>
    <item instock="Y" itemid="WB78">
      <name>Winter Blanket (78 inch)</name>
      <price>20</price>
      <qty>10</qty>
    </item>
  </order>
</orders>

DOM Tree

3.2 Node Types

3.3 DOM Parser

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import org.w3c.dom.Document;

/**
 * Parses orders.xml into an in-memory DOM Document.
 */
public class OrderProcessor {

  /**
   * Build a Document from orders.xml in the current directory. A
   * failure is reported on standard error with the file and the cause.
   *
   * @param args Not used.
   */
  public static void main (String args[]) {
    File docFile = new File("orders.xml");
    Document doc = null;      
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      
      //A document builder builds specific documents.
      DocumentBuilder db = dbf.newDocumentBuilder();

      //This call parses the file and creates the Document in memory.
      doc = db.parse(docFile);
    } catch (Exception e) {
      //Say which file failed and why instead of hiding the cause.
      System.err.println("Problem parsing the file " + docFile + ": " + e);
    }
  }
}

3.4 Traversing the Document

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import org.w3c.dom.*;

/**
 * Demonstrates walking a DOM tree: the root element, its direct
 * children, and finally every node via recursion.
 */
public class OrderProcessor {
  public static void main (String args[]) {

    //some code here
    
    doc = db.parse(docFile);

    //STEP 1:  Get the root element
    Element root = doc.getDocumentElement();
    System.out.println("The root element is "+root.getNodeName());
    
    //STEP 2:  Get the children
    //Note that this counts only the direct children of the root.
    NodeList children = root.getChildNodes();
    System.out.println("There are "+children.getLength()
                                  +" nodes in this document.");

    //STEP 3:  Step through the children
    for (Node child = root.getFirstChild(); 
         child != null;
         child = child.getNextSibling())
    {
      //The node name is either the tag name or #text
      System.out.println(child.getNodeName()+" = "+child.getNodeValue());   
    }

    //STEP 4:  Recurse this functionality
    stepThrough(root);


  }

  /**
   * Print the name and value of start, then its attributes if it is
   * an element, then do the same for each of its descendants.
   *
   * @param start The node at which to begin the walk.
   */
  private static void stepThrough (Node start)
  {
    System.out.println(start.getNodeName()+" = "+start.getNodeValue());

    //If it's an element, then print the attributes.
    //ELEMENT_NODE is a constant of the Node interface, so it is
    //read through the type rather than through the instance.
    if (start.getNodeType() == Node.ELEMENT_NODE) 
    {   
      NamedNodeMap startAttr = start.getAttributes();
      for (int i = 0; 
           i < startAttr.getLength();
           i++) {
        Node attr = startAttr.item(i);
        System.out.println("  Attribute:  "+ attr.getNodeName()
                           +" = "+attr.getNodeValue());
      }   
    } 
    
    
    for (Node child = start.getFirstChild(); 
         child != null;
         child = child.getNextSibling())
    {
      stepThrough(child);
    }
  }

}

3.5 Modifying a Document

/**
 * Demonstrates modifying a DOM tree: changing text values, removing
 * attributes and nodes, and adding new elements.
 */
public class OrderProcessor {

  /**
   * Change the text of every element named elemName, at or below
   * start, to elemValue.
   *
   * @param start The node at which to begin the search.
   * @param elemName The tag name of the elements to change.
   * @param elemValue The new text value.
   */
  private static void changeOrder (Node start, 
                                   String elemName, 
                                   String elemValue)
  {
    if (start.getNodeName().equals(elemName)) {
      Node text = start.getFirstChild();
      if (text != null) {
        text.setNodeValue(elemValue);
      } else {
        //An empty element has no text node yet, so create one.
        start.appendChild(start.getOwnerDocument().createTextNode(elemValue));
      }
    }
         
    for (Node child = start.getFirstChild(); 
         child != null;
         child = child.getNextSibling())
    {
      changeOrder(child, elemName, elemValue);
    }
  }

  /**
   * Replace an item with a backordered element carrying the same
   * itemid, e.g. <backordered itemid="123"/>. This is the alternative
   * to removing an out-of-stock item outright.
   *
   * @param doc The document that owns the item; used to create the
   *   replacement element.
   * @param item The item element to replace.
   */
  private static void replaceWithBackordered (Document doc, Element item)
  {
    Element backElement = doc.createElement("backordered");

    //setAttribute creates the itemid attribute and sets its value.
    backElement.setAttribute("itemid", item.getAttribute("itemid"));

    item.getParentNode().replaceChild(backElement, item);
  }

  public static void main (String args[]) {

    //doc and root come from parsing the file, as in section 3.4:
    //  doc = db.parse(docFile);
    //  root = doc.getDocumentElement();
        
    // Change text value of node named status to processing.
    changeOrder(root, "status", "processing");

    //Print the new status of every order.
    NodeList statuses = root.getElementsByTagName("status");
    for (int statusNum = 0; 
         statusNum < statuses.getLength(); 
         statusNum++) 
    {
      System.out.println(statuses.item(statusNum).getFirstChild().getNodeValue());
    }

    //Get the list of order elements.
    NodeList orders = root.getElementsByTagName("order");

    for (int orderNum = 0; 
         orderNum < orders.getLength(); 
         orderNum++) 
    {
      Element thisOrder = (Element)orders.item(orderNum);

      //Remove the limit attribute from customer.
      Element customer = (Element)thisOrder.getElementsByTagName("customerid").item(0);
      if (customer != null) {
        customer.removeAttribute("limit");  
      }
      
      NodeList orderItems = thisOrder.getElementsByTagName("item");
      double total = 0;

      //orderItems is a live list: removing an item shrinks it and
      //shifts the later items down. Walking it backwards means a
      //removal never causes an item to be skipped.
      for (int itemNum = orderItems.getLength() - 1;
           itemNum >= 0;
           itemNum--) {
      
        // Total up cost for each item and 
        // add to the order total
        
        //Get this item as an Element
        Element thisOrderItem = (Element)orderItems.item(itemNum);

        //Remove a node.
        //Remove anything with <item instock="N">
        //getAttribute returns "" when the attribute is absent.
        if ("N".equals(thisOrderItem.getAttribute("instock"))) {
          thisOrderItem.getParentNode().removeChild(thisOrderItem);

          //Alternatively, we could have replaced this item with a
          //backordered element instead of removing it:
          //  replaceWithBackordered(doc, thisOrderItem);

          continue;
        } 

        //Get pricing information for this Item
        String thisPrice = thisOrderItem.getElementsByTagName("price").item(0)
          .getFirstChild().getNodeValue();
        double thisPriceDbl = Double.parseDouble(thisPrice);
        
        //Get quantity information for this Item
        String thisQty = thisOrderItem.getElementsByTagName("qty").item(0)
          .getFirstChild().getNodeValue();
        double thisQtyDbl = Double.parseDouble(thisQty);

        double thisItemTotal = thisPriceDbl*thisQtyDbl;
        total = total + thisItemTotal;
      }
      String totalString = Double.toString(total);

      //1234.34
      Node totalNode = doc.createTextNode(totalString);

      //<total></total>
      Element totalElement = doc.createElement("total");

      //<total>1234.34</total>
      totalElement.appendChild(totalNode);

      //Add that element before anyone else.
      thisOrder.insertBefore(totalElement, thisOrder.getFirstChild());

    }
  }
}

3.6 Outputting a Document

//Write the XML declaration, the DOCTYPE (if the document has one) and
//the serialized newRoot element to processedOrders.xml.
try 
{
  File newFile = new File("processedOrders.xml");
  FileWriter newFileStream = new FileWriter(newFile);
  try
  {
    newFileStream.write("<?xml version=\"1.0\"?>");

    //Only write a DOCTYPE if the source document had one.
    if (doc.getDoctype() != null) 
    {
      String publicId = doc.getDoctype().getPublicId();
      String systemId = doc.getDoctype().getSystemId();

      newFileStream.write("<!DOCTYPE "+doc.getDoctype().getName());
      if (publicId != null) 
      {
        //<!DOCTYPE name PUBLIC "publicId" "systemId">
        newFileStream.write(" PUBLIC \""+publicId+"\"");
        if (systemId != null) 
        {
          newFileStream.write(" \""+systemId+"\"");
        }
      }
      else if (systemId != null) 
      {
        //<!DOCTYPE name SYSTEM "systemId">
        newFileStream.write(" SYSTEM \""+systemId+"\"");
      }
      newFileStream.write(">");
    }

    //Node.toString() is not required to return XML markup, so the
    //element is serialized with an identity transformation instead.
    //The XML declaration was already written above.
    javax.xml.transform.Transformer serializer =
      javax.xml.transform.TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(
      javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes");
    serializer.transform(
      new javax.xml.transform.dom.DOMSource(newRoot),
      new javax.xml.transform.stream.StreamResult(newFileStream));
  }
  finally
  {
    //Close the file even if writing fails part way through.
    newFileStream.close();
  }

} catch (IOException e) {
  System.out.println("Can't write new file: "+e.getMessage());   
} catch (javax.xml.transform.TransformerException e) {
  System.out.println("Can't write new file: "+e.getMessage());   
}

4 Validating

4.1 Doctype

4.2 Xerces Validation

  1. Create a parser.
  2. Turn on validation.
  3. Set the error handler.
  4. Parse the document.

4.3 Error Handler

import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.SAXParseException;

/**
 * A simple error handler which just prints each reported problem to
 * standard output. A fatal error also terminates the program.
 */
public class ErrorChecker extends DefaultHandler
{

  /** Print the message of an error and return normally. */
  public void error (SAXParseException e) {
    report("Parsing error:  ", e);
  }

  /** Print the message of a warning and return normally. */
  public void warning (SAXParseException e) {
    report("Parsing problem:  ", e);
  }

  /** Print the message of a fatal error, then exit with status 1. */
  public void fatalError (SAXParseException e) {
    report("Parsing error:  ", e);
    System.out.println("Cannot continue.");
    System.exit(1);
  }

  /** Print one line made of the label followed by the exception message. */
  private static void report (String label, SAXParseException problem) {
    System.out.println(label + problem.getMessage());
  }
}

4.4 Xerces Validator

import org.apache.xerces.parsers.DOMParser;
import java.io.File;
import org.w3c.dom.Document;

/**
 * Validates memory.xml against the XML Schema memory.xsd using the
 * Xerces DOM parser, reporting problems through ErrorChecker.
 */
public class SchemaTest {
  public static void main (String args[]) {
    File docFile = new File("memory.xml");
    
    try {

      DOMParser parser = new DOMParser();
      parser.setFeature("http://xml.org/sax/features/validation", true); 

      //Validation alone checks against a DTD; XML Schema validation
      //has to be switched on separately.
      parser.setFeature("http://apache.org/xml/features/validation/schema", true); 

      //Here we specify the schema location,
      //but we could have used the ones specified in the document.
      parser.setProperty(
                         "http://apache.org/xml/properties/schema/external-noNamespaceSchemaLocation",
                         "memory.xsd"); 
      
      ErrorChecker errors = new ErrorChecker();
      parser.setErrorHandler(errors);

      //Parse the file named above rather than repeating its name.
      parser.parse(docFile.toURI().toString());
    } catch (Exception e) {
      //Say which file failed and why instead of hiding the cause.
      System.err.println("Problem parsing the file " + docFile + ": " + e);
    }   
  }
}

URLs

  1. Validating XML, http://www-106.ibm.com/developerworks/xml/edu/x-dw-xvalid-i.html
  2. SAX Quickstart, http://www.saxproject.org/?selected=quickstart
  3. Understanding DOM, http://www-106.ibm.com/developerworks/xml/edu/x-dw-xudom-i.html
  4. Processing XML with Java, http://www.ibiblio.org/xml/books/xmljava/
  5. jargon:parse, http://jargon.watson-net.com/jargon.asp?w=parse
  6. SAX API (Javadoc), http://www.saxproject.org/apidoc/overview-summary.html
  7. DOM, http://www.w3c.org/DOM/
  8. Level 2, http://www.w3c.org/DOM/DOMTR#dom2

This talk is available at http://jmvidal.cse.sc.edu/talks/xmlparsing/
Copyright © 2009 José M. Vidal. All rights reserved.

18 March 2004, 02:10PM