Adds support for .formats and fixes tests

This commit is contained in:
Nemo 2020-04-01 01:33:54 +05:30
parent 98da71989c
commit 2a78be1022
5 changed files with 2558 additions and 7 deletions

View File

@ -17,6 +17,8 @@ describe Muse::Dl::Book do
book.author.should eq "Albert H. Tillson, Jr."
book.date.should eq "2010"
book.publisher.should eq "University of Virginia Press"
book.formats.should contain :pdf
book.formats.should_not contain :html
end
it "should parse the summary" do
@ -50,7 +52,7 @@ describe Muse::Dl::Book do
]
end
it "it should parse the DOI for 68534" do
it "should parse book/68534" do
html = File.new("spec/fixtures/book-68534.html").gets_to_end
book = Muse::Dl::Book.new html
book.info["ISBN"].should eq "9781501737695"
@ -60,5 +62,14 @@ describe Muse::Dl::Book do
book.info["Language"].should eq "English"
book.info["Open Access"].should eq "Yes"
book.info["DOI"].should eq "10.1353/book.68534"
book.formats.should contain :html
book.formats.should_not contain :pdf
end
# Both :pdf and :html must be reported for the book/60322 fixture.
it "should note both formats for book/60322" do
  # File.read opens, reads and closes the fixture in a single call, so no
  # file handle is left dangling the way File.new(...).gets_to_end leaves one.
  html = File.read("spec/fixtures/book-60322.html")
  book = Muse::Dl::Book.new html
  book.formats.should contain :pdf
  book.formats.should contain :html
end
end

2506
spec/fixtures/book-60322.html vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -25,7 +25,7 @@ module Muse::Dl
return
end
url = "https://muse.jhu.edu/chapter/#{chapter_id}"
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
headers = HEADERS.merge({
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
})

View File

@ -1,5 +1,12 @@
require "myhtml"
# https://github.com/kostya/myhtml/issues/19
struct Myhtml::Node
  # Returns the concatenated HTML serialization of this node's children.
  def inner_html
    String.build do |io|
      children.each { |child| child.to_html(io) }
    end
  end
end
module Muse::Dl
class InfoParser
def self.infobox(myhtml : Myhtml::Parser)
@ -16,6 +23,13 @@ module Muse::Dl
return info
end
# Reads the `value` attribute of the `#search_within_book_id` element.
#
# Returns nil when the element is absent or carries no `value` attribute.
def self.id(myhtml : Myhtml::Parser)
  # `first?` yields nil on an empty selection; plain `first` would raise
  # before the nil check could ever run.
  myhtml.css("#search_within_book_id").first?.try &.attribute_by("value")
end
# Returns the stripped text of the first `#book_about_info .title` match.
def self.title(myhtml : Myhtml::Parser)
  title_texts = myhtml.css("#book_about_info .title").map(&.inner_text).to_a
  title_texts[0].strip
end
@ -42,8 +56,24 @@ module Muse::Dl
end
# Returns the inner HTML of the first `#book_about_info .card_summary`
# element, or the string "NA" when the page has no such element.
def self.summary_html(myhtml : Myhtml::Parser)
  # `first?` returns nil for an empty selection, so no exception has to be
  # raised and rescued just to detect a missing summary.
  summary_div = myhtml.css("#book_about_info .card_summary").first?
  summary_div ? summary_div.inner_html : "NA"
end
# Collects the formats hinted at by the page's `img.icon` elements.
#
# An icon whose `src` contains "html" (case-insensitive) contributes :html;
# one whose `src` contains "pdf" contributes :pdf. Icons without a `src`
# attribute are ignored.
def self.formats(myhtml : Myhtml::Parser)
  available = Set(Symbol).new
  myhtml.css("img.icon").each do |img|
    if src = img.attribute_by("src")
      available << :html if src =~ /html/i
      available << :pdf if src =~ /pdf/i
    end
  end
  available
end
end
end

View File

@ -13,22 +13,26 @@ module Muse::Dl
@cover_url : String
@thumbnail_url : String
@h : Myhtml::Parser
@formats : Set(Symbol)
getter :info, :title, :author, :date, :publisher, :summary, :summary_html, :cover_url, :thumbnail_url
getter :info, :title, :author, :date, :publisher, :summary, :summary_html, :cover_url, :thumbnail_url, :formats
private getter :h
# Parses a book page and fills every metadata field from it.
#
# html - the raw HTML source of the page.
def initialize(html : String)
  @h = Myhtml::Parser.new html
  @info = InfoParser.infobox(h)
  # May be nil; it is then interpolated as an empty string in the URLs below.
  id : String | Nil = InfoParser.id(h)
  @title = InfoParser.title(h)
  @author = InfoParser.author(h)
  @date = InfoParser.date(h)
  @publisher = InfoParser.publisher(h)
  @summary = InfoParser.summary(h)
  @summary_html = InfoParser.summary_html(h)
  @formats = InfoParser.formats(h)
  # TODO: Make this work for journals as well
  @cover_url = "https://muse.jhu.edu/book/#{id}/image/front_cover.jpg"
  @thumbnail_url = "https://muse.jhu.edu/book/#{id}/image/front_cover.jpg?format=180"
end
end
end