Adds support for .formats and fixes tests

This commit is contained in:
Nemo 2020-04-01 01:33:54 +05:30
parent 98da71989c
commit 2a78be1022
5 changed files with 2558 additions and 7 deletions

View File

@ -17,6 +17,8 @@ describe Muse::Dl::Book do
book.author.should eq "Albert H. Tillson, Jr." book.author.should eq "Albert H. Tillson, Jr."
book.date.should eq "2010" book.date.should eq "2010"
book.publisher.should eq "University of Virginia Press" book.publisher.should eq "University of Virginia Press"
book.formats.should contain :pdf
book.formats.should_not contain :html
end end
it "should parse the summary" do it "should parse the summary" do
@ -50,7 +52,7 @@ describe Muse::Dl::Book do
] ]
end end
it "it should parse the DOI for 68534" do it "should parse book/68534" do
html = File.new("spec/fixtures/book-68534.html").gets_to_end html = File.new("spec/fixtures/book-68534.html").gets_to_end
book = Muse::Dl::Book.new html book = Muse::Dl::Book.new html
book.info["ISBN"].should eq "9781501737695" book.info["ISBN"].should eq "9781501737695"
@ -60,5 +62,14 @@ describe Muse::Dl::Book do
book.info["Language"].should eq "English" book.info["Language"].should eq "English"
book.info["Open Access"].should eq "Yes" book.info["Open Access"].should eq "Yes"
book.info["DOI"].should eq "10.1353/book.68534" book.info["DOI"].should eq "10.1353/book.68534"
book.formats.should contain :html
book.formats.should_not contain :pdf
end
# Loads the book-60322 fixture and expects Book#formats to contain
# both :pdf and :html.
it "should note both formats for book/60322" do
html = File.new("spec/fixtures/book-60322.html").gets_to_end
book = Muse::Dl::Book.new html
book.formats.should contain :pdf
book.formats.should contain :html
end end
end end

2506
spec/fixtures/book-60322.html vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -25,7 +25,7 @@ module Muse::Dl
return return
end end
url = "https://muse.jhu.edu/chapter/#{chapter_id}" url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
headers = HEADERS.merge({ headers = HEADERS.merge({
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
}) })

View File

@ -1,5 +1,12 @@
require "myhtml" require "myhtml"
# https://github.com/kostya/myhtml/issues/19
struct Myhtml::Node
  # Returns the HTML of this node's children as a single String.
  # Each child is serialized with `to_html` into one shared string builder,
  # in the order `children` yields them.
  def inner_html
    String.build do |io|
      children.each { |child| child.to_html(io) }
    end
  end
end
module Muse::Dl module Muse::Dl
class InfoParser class InfoParser
def self.infobox(myhtml : Myhtml::Parser) def self.infobox(myhtml : Myhtml::Parser)
@ -16,6 +23,13 @@ module Muse::Dl
return info return info
end end
# Returns the book id read from the `value` attribute of the
# `#search_within_book_id` element, or nil when that element (or the
# attribute) is missing from the page.
#
# `first?` is used instead of `first`: `first` raises on an empty match,
# so the previous nil check could never take effect and a page without
# the element crashed instead of yielding nil.
def self.id(myhtml : Myhtml::Parser)
  myhtml.css("#search_within_book_id").first?.try &.attribute_by("value")
end
def self.title(myhtml : Myhtml::Parser) def self.title(myhtml : Myhtml::Parser)
myhtml.css("#book_about_info .title").map(&.inner_text).to_a[0].strip myhtml.css("#book_about_info .title").map(&.inner_text).to_a[0].strip
end end
@ -42,8 +56,24 @@ module Muse::Dl
end end
def self.summary_html(myhtml : Myhtml::Parser) def self.summary_html(myhtml : Myhtml::Parser)
return "TODO" summary_div = myhtml.css("#book_about_info .card_summary")
myhtml.css("#book_about_info .card_summary").map(&.tag_text).to_a[0].strip begin
summary_div.first.inner_html
rescue e : Exception
"NA"
end
end
# Returns a Set(Symbol) of the formats detected from the page's `img.icon`
# elements. For every icon that has a `src` attribute, :html is added when
# the src matches /html/i and :pdf when it matches /pdf/i; icons without a
# `src` are skipped. The set is empty when nothing matches.
def self.formats(myhtml : Myhtml::Parser)
formats = Set(Symbol).new
myhtml.css("img.icon").each do |icon|
url = icon.attribute_by("src")
if url
# A single src may match both patterns; both symbols are added in that case.
formats.add :html if /html/i.match url
formats.add :pdf if /pdf/i.match url
end
end
formats
end end
end end
end end

View File

@ -13,22 +13,26 @@ module Muse::Dl
@cover_url : String @cover_url : String
@thumbnail_url : String @thumbnail_url : String
@h : Myhtml::Parser @h : Myhtml::Parser
@formats : Set(Symbol)
getter :info, :title, :author, :date, :publisher, :summary, :summary_html, :cover_url, :thumbnail_url getter :info, :title, :author, :date, :publisher, :summary, :summary_html, :cover_url, :thumbnail_url, :formats
private getter :h private getter :h
def initialize(html : String) def initialize(html : String)
@h = Myhtml::Parser.new html @h = Myhtml::Parser.new html
@info = InfoParser.infobox(h) @info = InfoParser.infobox(h)
id : String | Nil = InfoParser.id(h)
@title = InfoParser.title(h) @title = InfoParser.title(h)
@author = InfoParser.author(h) @author = InfoParser.author(h)
@date = InfoParser.date(h) @date = InfoParser.date(h)
@publisher = InfoParser.publisher(h) @publisher = InfoParser.publisher(h)
@summary = InfoParser.summary(h) @summary = InfoParser.summary(h)
@summary_html = InfoParser.summary_html(h) @summary_html = InfoParser.summary_html(h)
@cover_url = "TODO" @formats = InfoParser.formats(h)
@thumbnail_url = "TODO" # TODO: Make this work for journals as well
@cover_url = "https://muse.jhu.edu/book/#{id}/image/front_cover.jpg"
@thumbnail_url = "https://muse.jhu.edu/book/#{id}/image/front_cover.jpg?format=180"
end end
end end
end end