TagIterator (aka Tagiter)
Simple but very useful HTML/XHTML cascading parser.
Quickly iterate through tagged markup documents like HTML and XML. TagIterator is great for quick and dirty web scraping.
Usage
# Sample markup exercised by the walkthrough below. The heredoc body is kept
# flush left because <<-EOF preserves leading whitespace in the content.
html = <<-EOF
<body> This is a test...
<sub> S1 </sub> <sub> S2 </sub>
<DL>
<DT> A1
<DT> A2
<DT> A3
</DL>
<DL>
<DT> B1
<DT> B2
<DT> B3
</DL>
<NEST>
<P ALIGN="R">TOP</P>
<NEST>
<P>SECOND</P>
<OL>
<LI>C1
<LI>C2
<LI>C3
<LI>C4
</OL>
</NEST>
<OL>
<LI>D1
<LI>D2
<LI>D3
<LI>D4
</OL>
</NEST>
</body>
EOF

doc = TagIterator.new(html)

doc.first("body") do |body|
  # Second <DL> list: print each <DT> entry.
  body.nth("dl", 2) do |list|
    list.enumtag("dt") { |item| puts item.text.strip }
  end

  # first/nth return an iterator over the text that follows the matched
  # block, so each chained .next (an alias of first) continues from there.
  body.first("nest") do |outer|
    outer.first("p") do |para|
      print para.text, ' '
      puts para.attributes.collect { |k, v| "#{k}=#{v}" }
    end.next("nest") do |inner|
      inner.first("p") do |para|
        puts para.text
      end.next("ol") do |ol|
        ol.enumtag("li") { |item| puts item.text.strip }
      end
    end.next("ol") do |ol|
      ol.enumtag("li") { |item| puts item.text.strip }
    end
  end
end

# Every <sub> block anywhere in the document.
doc.each_block("sub") do |sub|
  puts sub.text.strip
end
produces
B1
B2
B3
TOP align=R
SECOND
C1
C2
C3
C4
D1
D2
D3
D4
S1
S2
Methods
collect
each_block
enumcollect
enumtag
first
for_this
get_first
get_nth
new
nth
nth_tailer
tagexist?
tagnext
Attributes
| [R] | attributes | |
| [RW] | option | |
| [R] | tag | |
| [R] | text | |
Public Class methods
[ + ]
# File lib/more/facets/tagiterator.rb, line 120
#
# Builds an iterator over +text+.
#
# text::       the markup to iterate over; must be a String.
# tag::        stored as the +tag+ attribute (nil by default).
# attributes:: stored as the +attributes+ attribute (empty Hash by default).
#              A singleton +[]+ is defined on this very object, so the
#              caller's Hash is modified when one is passed in.
#
# Raises RuntimeError ("Only String accepted") when +text+ is not a String.
def initialize(text, tag = nil, attributes = {})
  unless text.is_a?(String)
    raise RuntimeError, "Only String accepted"
  end

  @text       = text
  @option     = "pi"
  @tag        = tag
  @attributes = attributes

  # Lookups downcase the requested name before delegating to the
  # object's regular [] implementation.
  def @attributes.[](aname)
    super(aname.downcase)
  end
end
Public Instance methods
[ + ]
# File lib/more/facets/tagiterator.rb, line 229
#
# Returns an Array of every object that each_block yields. All arguments
# are forwarded unchanged to each_block, so any error it raises propagates.
def collect(*arg)
  blocks = []
  each_block(*arg) { |blk| blocks << blk }
  blocks
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 205
#
# Yields a new iterator (self.class.new) for every block opened by +tag+.
#
# tag::      tag name handed to find_opentag.
# closetag:: optional alternative closing tag name; when given it is passed
#            to find_closetag together with +tag+.
#
# Returns a new iterator over the text that follows the '>' found after the
# last closed block. The result is empty when nothing follows.
#
# Raises RuntimeError when no opening +tag+ is found at all.
def each_block(tag, closetag = nil)
  t = 0
  s, d = find_opentag(tag)
  raise RuntimeError, "tag(#{tag}) not found" unless s

  while s
    e = closetag ? find_closetag(closetag, s, tag) : find_closetag(tag, s)
    e ||= -1 # no closing tag: the block runs to the end of the text
    yield self.class.new(@text[s..e], tag, parse_attribute(d))

    if e >= 0
      # Resume the search at the next '>' after the block, or at the end
      # of the text when there is none.
      t = @text.index('>', e + 1) || @text.length
      s, d = find_opentag(tag, t)
    else
      s = false
    end
  end

  # When t is the text length, the slice starts past the end and is nil;
  # fall back to "" so the constructor is not handed a non-String.
  self.class.new(text[(t + 1)..-1] || "")
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 245
#
# Returns an Array of every object that enumtag yields for +tag+.
def enumcollect(tag)
  items = []
  enumtag(tag) { |item| items << item }
  items
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 235
#
# Yields a new iterator (self.class.new) for each item located by
# find_openenumtag / find_closeenumtag for +tag+. The usage example applies
# it to list entries such as <DT> and <LI>.
#
# Unlike each_block, nothing is raised when no item is found; the block is
# simply never called.
def enumtag(tag)
  start, raw_attrs = find_openenumtag(tag)
  while start
    # A missing end position means the item runs to the end of the text.
    stop = find_closeenumtag(tag, start + 1) || -1
    yield self.class.new(@text[start..stop], tag, parse_attribute(raw_attrs))
    start, raw_attrs = find_openenumtag(tag, start)
  end
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 202
#
# Yields the first block opened by +tag+; shorthand for nth(tag, 1, *arg).
# Any extra arguments are forwarded to nth.
#
# Returns whatever nth returns, which lets calls be chained as
# first(...) { ... }.next(...) { ... }.
def first(tag, *arg)
  nth(tag, 1, *arg) { |blk| yield blk }
end

# +next+ is another name for +first+; it reads naturally when chained onto
# the value returned by a previous first/nth call.
alias next first
[ + ]
# File lib/more/facets/tagiterator.rb, line 251
#
# Yields the receiver itself to the block and returns the block's value.
def for_this
  yield(self)
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 257
#
# Non-block form of first: returns the object that first would yield.
# Arguments are forwarded unchanged to first.
def get_first(*arg)
  found = nil
  first(*arg) { |blk| found = blk }
  found
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 255
#
# Non-block form of nth: returns the object that nth would yield.
# Arguments are forwarded unchanged to nth.
def get_nth(*arg)
  found = nil
  nth(*arg) { |blk| found = blk }
  found
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 179
#
# Yields a new iterator (self.class.new) for the +n+-th block opened by
# +tag+.
#
# tag::      tag name handed to find_opentag.
# n::        which occurrence to select (the usage example counts from 1).
# closetag:: optional alternative closing tag name; when given it is passed
#            to find_closetag together with +tag+.
#
# Returns a new iterator over the text that follows the '>' found after the
# selected block. The result is empty when nothing follows.
#
# Raises RuntimeError when +n+ is nil/false or when fewer than +n+ opening
# tags are found.
def nth(tag, n, closetag = nil)
  raise RuntimeError, "nth: number not specified" unless n

  t = 0
  e = s = 0 # declared out here so the values set inside the block survive it
  d = nil
  1.upto(n) do |i|
    s, d = find_opentag(tag, t)
    raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless s

    e = closetag ? find_closetag(closetag, s, tag) : find_closetag(tag, s)
    e ||= -1 # no closing tag: the block runs to the end of the text
    # Continue from the next '>' after the block, or from the end of the
    # text when there is none.
    t = @text.index('>', e + 1) || @text.length
  end

  yield self.class.new(text[s..e], tag, parse_attribute(d))

  # When t is the text length, the slice starts past the end and is nil;
  # fall back to "" so the constructor is not handed a non-String.
  self.class.new(text[(t + 1)..-1] || "")
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 272
#
# Calls nth(tag, n) with an empty block, discarding the yielded block, and
# returns nth's own return value.
def nth_tailer(tag, n)
  nth(tag, n) {}
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 259
#
# Returns true when find_element(tag, st) reports a match, false otherwise.
#
# tag:: tag name to look for.
# st::  second argument passed to find_element (0 by default); presumably
#       the offset to start searching from.
def tagexist?(tag, st = 0)
  find_element(tag, st) ? true : false
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 264
#
# Returns the name of the first tag in the text: the first run of characters
# between '<' and the following '>' that contains no whitespace or angle
# brackets.
#
# Returns nil when the text has no '<', when that '<' is never closed by a
# '>', or when the brackets enclose no name.
def tagnext
  s = @text.index("<")
  return nil unless s

  e = @text.index(">", s)
  # Check e here, not s: an unterminated tag must not be sliced with a
  # nil range end.
  return nil unless e

  @text[s..e].scan(/[^<>\s]+/)[0]
end