class HTMLTokenizer

A class to tokenize HTML: it walks an HTML page sequentially and returns its tags and text in document order.

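Example (as the assertions below show, getTag matches tag names case-insensitively and skips tags that appear inside comments):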

page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
 <!-- Here comes the <a href=\"missing.link\">blah</a>
 comment body
  -->
 <BODY>
   <H1>This is the header</H1>
   <P>
     This is the paragraph, it contains
     <a href=\"link.html\">links</a>,
     <img src=\"blah.gif\" optional alt='images
     are
     really cool'>.  Ok, here is some more text and
     <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
   </P>
 </body>
 </HTML>
 "
 toke = HTMLTokenizer.new(page)

 assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
 assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
 assert("links" == toke.getTrimmedText)
 assert(toke.getTag("IMG", "A").attr_hash['optional'])
 assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])