Transforming XML & the REXML Pull Parser
by James Britt

Listing One

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE office:document-content PUBLIC "-//OpenOffice.org//DTD 
                                 OfficeDocument 1.0//EN" "office.dtd">
<office:document-content 
xmlns:office="http://openoffice.org/2000/office" 
xmlns:style="http://openoffice.org/2000/style" 
xmlns:text="http://openoffice.org/2000/text" 
xmlns:table="http://openoffice.org/2000/table" 
xmlns:draw="http://openoffice.org/2000/drawing" 
xmlns:fo="http://www.w3.org/1999/XSL/Format" 
xmlns:xlink="http://www.w3.org/1999/xlink" 
xmlns:number="http://openoffice.org/2000/datastyle" 
xmlns:svg="http://www.w3.org/2000/svg" 
xmlns:chart="http://openoffice.org/2000/chart"
xmlns:dr3d="http://openoffice.org/2000/dr3d" 
xmlns:math="http://www.w3.org/1998/Math/MathML" 
xmlns:form="http://openoffice.org/2000/form" 
xmlns:script="http://openoffice.org/2000/script" 
office:class="text" office:version="1.0">
  <!-- An example content.xml file from an OpenOffice.org Writer document -->
 <office:script/>
 <office:font-decls>
  <style:font-decl style:name="Arial" fo:font-family="Arial"/>
  <style:font-decl style:name="Baskerville BE Regular"
    fo:font-family="&apos;Baskerville BE Regular&apos;, &apos;Times New Roman&apos;"/>
  <style:font-decl style:name="Lucidasans1" fo:font-family="Lucidasans"/>
  <style:font-decl style:name="Bitstream Vera Sans"
    fo:font-family="&apos;Bitstream Vera Sans&apos;" style:font-pitch="variable"/>
  <style:font-decl style:name="Lucidasans"
    fo:font-family="Lucidasans" style:font-pitch="variable"/>
  <style:font-decl style:name="Mincho" fo:font-family="Mincho"
    style:font-pitch="variable"/>
 </office:font-decls>
 <office:automatic-styles/>
 <office:body>
  <text:sequence-decls>
   <text:sequence-decl text:display-outline-level="0" text:name="Illustration"/>
   <text:sequence-decl text:display-outline-level="0" text:name="Table"/>
   <text:sequence-decl text:display-outline-level="0" text:name="Text"/>
   <text:sequence-decl text:display-outline-level="0" text:name="Drawing"/>
  </text:sequence-decls>
  <text:p text:style-name="Standard">This is a test document for 
     the Ooo4R project</text:p>
  <text:p text:style-name="Standard">This is the second line. It 
     has some <text:span text:style-name="Citation">text</text:span> 
     with special formatting.</text:p>
 </office:body>
 <!-- End of sample -->
</office:document-content>


Listing Two 

#!/usr/bin/env ruby
require 'rexml/parsers/pullparser'

# Read content1.xml into a string and hand it to a REXML pull parser.
document = IO.read( "content1.xml" )
parser   = REXML::Parsers::PullParser.new( document )

# Drain the parser, printing the type of each event as it is pulled.
puts parser.pull.event_type while parser.has_next?


Listing Three

#!/usr/bin/env ruby
require "rexml/parsers/pullparser"
# Walk content1.xml with the pull parser and print the first item of every
# start_element event (for REXML start_element events this is the tag name).
parser = REXML::Parsers::PullParser.new( IO.read( "content1.xml" ) )
while parser.has_next?
  pull_event = parser.pull
  # Every other kind of event (text, end tags, comments, ...) is skipped.
  puts( pull_event[0] ) if pull_event.start_element?
end


Listing Four

#!/usr/bin/env ruby
require "rexml/parsers/pullparser"
# Pull parser over the full text of content1.xml, read into memory up front.
parser = REXML::Parsers::PullParser.new( IO.read( "content1.xml" ) )

# Tells whether +event+ opens a <text:p> element whose text:style-name
# attribute is "Standard".
#
# event - object answering start_element? and [] (index 0 is compared with
#         the tag name, index 1 is used as the attribute hash).
#
# Returns true or false.
def is_standard_text_p?( event )
  return false unless event.start_element? && event[0] == "text:p"
  event[1][ 'text:style-name'] == "Standard"
end

# Dump (via inspect) every event that opens a "Standard"-styled text:p;
# all other events are pulled and discarded.
while parser.has_next?
  event = parser.pull
  next unless is_standard_text_p?( event )
  puts event.inspect
end


Listing Five

#!/usr/bin/env ruby
require "rexml/parsers/pullparser"

# Read content1.xml into memory and wrap it in a pull parser.
document = IO.read( "content1.xml" )
parser   = REXML::Parsers::PullParser.new( document )
# Output buffer for the transformed markup.
results  = ""
# Predicate: is +event+ the start tag of a text:p element styled "Standard"?
#
# event - object answering start_element? and [] (index 0 is compared with
#         the tag name, index 1 is used as the attribute hash).
#
# Returns true or false.
def is_standard_text_p?( event )
  return false unless event.start_element?

  tag_name = event[0]
  return false unless tag_name == "text:p"

  style_name = event[1][ "text:style-name" ]
  style_name == "Standard"
end

# Serialize an attribute collection as markup text.
#
# attrs - name/value pairs (a Hash, or anything whose to_a yields pairs).
#
# Returns "" when attrs is empty; otherwise a string that starts with a
# space and lists each pair as name='value', separated by single spaces.
# Values are inserted as-is, without any escaping.
def attrs_to_s( attrs )
  return "" if attrs.empty?

  pairs = attrs.to_a.map { |name, value| "#{name}='#{value}'" }
  " " + pairs.join( " " )
end

# Names of the currently open output elements, innermost last. The name
# pushed for a start tag is the one popped for its matching end tag, so a
# renamed element is closed with the same (new) name.
tag_stack = []

while parser.has_next?
  event = parser.pull
  fragment =
    case event.event_type
    when :start_element
      # "Standard" paragraphs are emitted as <p>; everything else keeps its name.
      tag_stack.push( is_standard_text_p?( event ) ? "p" : event[0] )
      "<#{tag_stack.last}#{attrs_to_s(event[1])}>"
    when :end_element
      "</#{tag_stack.pop}>"
    when :text
      event[0]
    else
      # Any other event type is recorded as a comment holding its inspect dump.
      "<!-- #{event.inspect} -->"
    end
  results << fragment
end

puts results


Listing Six

#!/usr/bin/env ruby
require "rexml/parsers/pullparser"

# Read content1.xml into memory and wrap it in a pull parser.
document = IO.read( "content1.xml" )
parser   = REXML::Parsers::PullParser.new( document )

# True only for an event that starts a text:p element carrying the
# attribute text:style-name="Standard"; false for everything else.
#
# event - object answering start_element? and [] (index 0 is compared with
#         the tag name, index 1 is used as the attribute hash).
def is_standard_text_p?( event )
  if event.start_element? && event[0] == "text:p"
    event[1][ "text:style-name" ] == "Standard"
  else
    false
  end
end

# Serialize an attribute collection as markup text.
#
# attrs - name/value pairs (a Hash, or anything whose to_a yields pairs).
#
# Returns "" when attrs is empty; otherwise a string that starts with a
# space and lists each pair as name='value', separated by single spaces.
# Values are inserted as-is, without any escaping.
def attrs_to_s( attrs )
  return "" if attrs.empty?
  # The string literal must open with a double quote: without it the line
  # begins with '#' and is parsed as a comment, so every attribute is lost.
  " " + attrs.to_a.map { |name, value| "#{name}='#{value}'" }.join( " " )
end

# Handler for :start_element events.
#
# Chooses the output tag name ("p" for a "Standard" text:p, otherwise the
# event's own name at index 0), pushes it onto the global $tag_stack, and
# returns the opening tag text including the event's attributes (index 1).
def start_element( event )
  tag = is_standard_text_p?( event ) ? "p" : event[0]
  $tag_stack.push tag
  "<#{tag}#{attrs_to_s(event[1])}>"
end

# Handler for :end_element events.
#
# Pops the most recently pushed name off the global $tag_stack and returns
# the closing tag built from it. The event itself is not consulted.
def end_element( event )
  closing_name = $tag_stack.pop
  "</#{closing_name}>"
end

# Handler for :text events: returns the event's first item unchanged.
def text( event )
  character_data = event[0]
  character_data
end

# Output buffer for the transformed markup.
results = ""

# Stack of currently open output tag names, shared by the handler methods.
$tag_stack = []

# Dispatch each event to the method named after its event type. Event types
# with no handler are skipped on purpose; the check is explicit so that a
# NoMethodError raised *inside* a handler is no longer silently swallowed.
while parser.has_next?
  pull_event = parser.pull
  handler = pull_event.event_type.to_s
  # Second argument true: methods defined at the top level are private.
  results << send( handler, pull_event ) if respond_to?( handler, true )
end

puts results


Listing Seven

# Pull one event from @parser and forward it to the method named after its
# event type.
#
# Returns the handler's result, or nil when the parser has nothing left or
# the pulled event reports end_document?.
def dispatch
  if @parser.has_next?
    event = @parser.pull
    send( event.event_type.to_s, event ) unless event.end_document?
  end
end




4


