Fixes parser for issue HTML

This also adds .journal_title as an attribute to the Issue object
This commit is contained in:
Nemo 2020-06-30 15:19:12 +05:30
parent 870ed3080d
commit 919c8ac43f
5 changed files with 1361 additions and 21 deletions

1263
spec/fixtures/issue-35852.html vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -37,4 +37,25 @@ describe Muse::Dl::Issue do
it "should parse publisher" do it "should parse publisher" do
issue.publisher.should eq "Johns Hopkins University Press" issue.publisher.should eq "Johns Hopkins University Press"
end end
it "should parse the journal title" do
issue.journal_title.should eq "portal: Libraries and the Academy"
end
it "should parse non-numbered issues" do
WebMock.stub(:get, "https://muse.jhu.edu/issue/35852")
.to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end)
issue = Muse::Dl::Issue.new "35852"
issue.parse
issue.volume.should eq "1"
issue.number.should eq "2"
issue.date.should eq "2016"
issue.info["ISSN"].should eq "2474-9419"
issue.info["Print ISSN"].should eq "2474-9427"
issue.info["Launched on MUSE"].should eq "2017-02-21"
issue.info["Open Access"].should eq "Yes"
issue.title.should eq "Volume 1, Issue 2, 2016"
issue.journal_title.should eq "Constitutional Studies"
end
end end

View File

@@ -3,8 +3,8 @@ require "./issue.cr"
module Muse::Dl module Muse::Dl
class Article class Article
@id : String getter id : String
getter :id setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil
def initialize(id : String) def initialize(id : String)
@id = id @id = id

View File

@@ -1,29 +1,26 @@
require "./thing.cr" require "./thing.cr"
require "./fetch.cr" require "./fetch.cr"
require "./article.cr" require "./article.cr"
module Muse::Dl module Muse::Dl
class Issue class Issue
@id : String getter id : String,
@title : String | Nil title : String | Nil,
@articles : Array(Muse::Dl::Article) articles : Array(Muse::Dl::Article),
@url : String url : String,
@info : Hash(String, String) summary : String | Nil,
@summary : String | Nil publisher : String | Nil,
@publisher : String | Nil info : Hash(String, String),
@volume : String | Nil volume : String | Nil,
@number : String | Nil number : String | Nil,
@date : String | Nil date : String | Nil,
@issues : Array(Muse::Dl::Issue) journal_title : String | Nil
getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date
def initialize(id : String) def initialize(id : String)
@id = id @id = id
@url = "https://muse.jhu.edu/issue/#{id}" @url = "https://muse.jhu.edu/issue/#{id}"
@info = Hash(String, String).new @info = Hash(String, String).new
@articles = [] of Muse::Dl::Article @articles = [] of Muse::Dl::Article
@issues = [] of Muse::Dl::Issue
end end
def parse def parse
@@ -42,17 +39,38 @@ module Muse::Dl
unless t.nil? unless t.nil?
@volume = /Volume (\d+)/.match(t).try &.[1] @volume = /Volume (\d+)/.match(t).try &.[1]
@number = /Number (\d+)/.match(t).try &.[1] @number = /Number (\d+)/.match(t).try &.[1]
@number = /Issue (\d+)/.match(t).try &.[1] unless @number
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
@date = /(\d{4})/.match(t).try &.[1] unless @date
end end
end end
def parse_contents(myhtml : Myhtml::Parser) def parse_contents(myhtml : Myhtml::Parser)
myhtml.css("#available_issues_list_text a").each do |a| journal_title_a = myhtml.css("#journal_banner_title a").first
link = a.attribute_by("href").to_s if journal_title_a
@journal_title = journal_title_a.inner_text
end
myhtml.css(".articles_list_text ol").each do |ol|
link = ol.css("li.title a").first
title = link.inner_text
matches = /\/issue\/(\d+)/.match link pages = ol.css("li.pg").first.try &.inner_text
matches = /(\d+)-(\d+)/.match pages
if matches if matches
@issues.push Muse::Dl::Issue.new matches[1] start_page = matches[1].to_i
end_page = matches[2].to_i
end
ol.css("a").each do |l|
url = l.attribute_by("href").to_s
matches = /\/article\/(\d+)\/pdf/.match url
if matches
a = Muse::Dl::Article.new matches[1]
a.title = title
a.start_page = start_page if start_page
a.end_page = end_page if end_page
@articles.push a
end
end end
end end
end end

View File

@@ -71,6 +71,44 @@ module Muse::Dl
# Needed because of https://github.com/crystal-lang/crystal/issues/7777 # Needed because of https://github.com/crystal-lang/crystal/issues/7777
FileUtils.cp source, destination FileUtils.cp source, destination
FileUtils.rm source if parser.cleanup FileUtils.rm source if parser.cleanup
elsif thing.is_a? Muse::Dl::Issue
# Will have no effect if parser has a custom title
parser.output = Util.slug_filename "#{thing.title}.pdf"
# If file exists and we can't clobber
if File.exists?(parser.output) && parser.clobber == false
STDERR.puts "Skipping #{url}, File already exists: #{parser.output}"
return
end
temp_stitched_file = nil
pdf_builder = Pdftk.new(parser.tmp)
# ## TODO till 111
thing.issues.each do |issue|
begin
Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
rescue e : Muse::Dl::Errors::MuseCorruptPDF
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
return
end
end
chapter_ids = thing.chapters.map { |c| c[0] }
# Stitch the PDFs together
temp_stitched_file = pdf_builder.stitch chapter_ids
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
puts "DL: #{url}. Saved final output to #{parser.output}"
# Cleanup the chapter files
if parser.cleanup
thing.chapters.each do |c|
Fetch.cleanup(parser.tmp, c[0])
end
end
####
end end
end end