require "../src/myhtml"
# Pick the HTML to process: the file named by the first command-line argument
# when one is given, otherwise the built-in sample document below.
str = if filename = ARGV[0]?
# Read the file as UTF-8, skipping byte sequences that are not valid in that encoding.
File.read(filename, "UTF-8", invalid: :skip)
else
# Built-in sample: two forum-style posts, each made of rules, bold labels and a text <div>.
# NOTE(review): the non-ASCII text in this sample looks mis-encoded (probably
# Cyrillic decoded with the wrong charset) — confirm against the original file.
<<-HTML
<html>
<br />
<hr size="2" width="100%" />
ΠΠ°Π·Π²Π°Π½ΠΈΠ΅: <b>Π§ΡΠΎ Ρ ΡΠ΄Π΅Π»Π°Π»?</b><br />
ΠΡΠ²Π΅ΡΠΈΠ»: <b>Π§ΡΠ΄ΠΈΡΠ΅-ΠΠΌΠ΅ΠΉ</b> Π½Π° <b>21 ΠΠΊΡΡΠ±ΡΡ 2005, 18:11</b>
<hr />
<div style="margin: 0 5ex;">ΠΠ°Π²Π°ΠΉΡΠ΅ Π² ΡΡΠΎΠΉ ΡΠ΅ΠΌΠ΅ Π³ΠΎΠ²ΠΎΡΠΈΡΡ ΠΎ ΡΠΎΠΌ, ΡΡΠΎ ΡΠ΅Π³ΠΎΠ΄Π½Ρ ΠΏΡΠΎΠΈΠ·ΠΎΡΠ»ΠΎ</div>
<br />
<hr size="2" width="100%" />
ΠΠ°Π·Π²Π°Π½ΠΈΠ΅: <b>Π§ΡΠΎ Ρ ΡΠ΄Π΅Π»Π°Π»?</b><br />
ΠΡΠ²Π΅ΡΠΈΠ»: <b>Rostik</b> Π½Π° <b>21 ΠΠΊΡΡΠ±ΡΡ 2005, 18:15</b>
<hr />
<div style="margin: 0 5ex;"><b>Π§ΡΠ΄ΠΈΡΠ΅-ΠΠΌΠ΅ΠΉ</b>, Π° Π³Π΄Π΅ ΠΆ ΡΡ ΡΡΠΏΠ΅Π» ΠΏΠΎΠ»ΡΡΠΈΡΡ, Π΅ΡΠ»ΠΈ ΡΠ²ΠΈΠ»ΡΠ½ΡΠ»?</div>
<br />
</html>
HTML
end
# Reopens Myhtml::Node to add the predicate that `words` uses to filter text nodes.
struct Myhtml::Node
  # True only when the node reports `visible?` and is neither an `object?`
  # node nor an `is_tag_noindex?` node. Checks run in that order and stop at
  # the first one that fails.
  def displayble?
    return false unless visible?
    return false if object?

    !is_tag_noindex?
  end
end
# Extracts the displayable text fragments of a parsed document.
#
# Takes every `:_text` node of *parser*, keeps those whose `parents` all answer
# true to `displayble?`, maps each to its `tag_text`, drops blank texts, then
# strips each remaining text and replaces every run of two or more whitespace
# characters with a single space.
def words(parser)
  text_nodes = parser.nodes(:_text)

  # A text node counts only when every one of its parents is displayable.
  shown_nodes = text_nodes.select do |node|
    node.parents.all? { |parent| parent.displayble? }
  end

  raw_texts = shown_nodes.map { |node| node.tag_text }
  filled_texts = raw_texts.reject { |text| text.blank? }

  # Trim the edges and squeeze inner whitespace runs (2+ characters) to one space.
  filled_texts.map { |text| text.strip.gsub(/\s{2,}/, " ") }
end
# Parse the selected HTML, then print its text fragments on one line joined by " | ".
parser = Myhtml::Parser.new(str)
fragments = words(parser)
puts fragments.join(" | ")