mirror of https://github.com/captn3m0/muse-dl.git
Fixes parser for issue HTML
This also adds .journal_title as an attribute of the Issue object
This commit is contained in:
parent
870ed3080d
commit
919c8ac43f
File diff suppressed because it is too large
Load Diff
|
@ -37,4 +37,25 @@ describe Muse::Dl::Issue do
|
||||||
it "should parse publisher" do
|
it "should parse publisher" do
|
||||||
issue.publisher.should eq "Johns Hopkins University Press"
|
issue.publisher.should eq "Johns Hopkins University Press"
|
||||||
end
|
end
|
||||||
|
it "should parse the journal title" do
|
||||||
|
issue.journal_title.should eq "portal: Libraries and the Academy"
|
||||||
|
end
|
||||||
|
|
||||||
|
it "should parse non-numbered issues" do
|
||||||
|
WebMock.stub(:get, "https://muse.jhu.edu/issue/35852")
|
||||||
|
.to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end)
|
||||||
|
issue = Muse::Dl::Issue.new "35852"
|
||||||
|
issue.parse
|
||||||
|
|
||||||
|
issue.volume.should eq "1"
|
||||||
|
issue.number.should eq "2"
|
||||||
|
issue.date.should eq "2016"
|
||||||
|
|
||||||
|
issue.info["ISSN"].should eq "2474-9419"
|
||||||
|
issue.info["Print ISSN"].should eq "2474-9427"
|
||||||
|
issue.info["Launched on MUSE"].should eq "2017-02-21"
|
||||||
|
issue.info["Open Access"].should eq "Yes"
|
||||||
|
issue.title.should eq "Volume 1, Issue 2, 2016"
|
||||||
|
issue.journal_title.should eq "Constitutional Studies"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,8 +3,8 @@ require "./issue.cr"
|
||||||
|
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Article
|
class Article
|
||||||
@id : String
|
getter id : String
|
||||||
getter :id
|
setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil
|
||||||
|
|
||||||
def initialize(id : String)
|
def initialize(id : String)
|
||||||
@id = id
|
@id = id
|
||||||
|
|
56
src/issue.cr
56
src/issue.cr
|
@ -1,29 +1,26 @@
|
||||||
require "./thing.cr"
|
require "./thing.cr"
|
||||||
require "./fetch.cr"
|
require "./fetch.cr"
|
||||||
require "./article.cr"
|
require "./article.cr"
|
||||||
|
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Issue
|
class Issue
|
||||||
@id : String
|
getter id : String,
|
||||||
@title : String | Nil
|
title : String | Nil,
|
||||||
@articles : Array(Muse::Dl::Article)
|
articles : Array(Muse::Dl::Article),
|
||||||
@url : String
|
url : String,
|
||||||
@info : Hash(String, String)
|
summary : String | Nil,
|
||||||
@summary : String | Nil
|
publisher : String | Nil,
|
||||||
@publisher : String | Nil
|
info : Hash(String, String),
|
||||||
@volume : String | Nil
|
volume : String | Nil,
|
||||||
@number : String | Nil
|
number : String | Nil,
|
||||||
@date : String | Nil
|
date : String | Nil,
|
||||||
@issues : Array(Muse::Dl::Issue)
|
journal_title : String | Nil
|
||||||
|
|
||||||
getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date
|
|
||||||
|
|
||||||
def initialize(id : String)
|
def initialize(id : String)
|
||||||
@id = id
|
@id = id
|
||||||
@url = "https://muse.jhu.edu/issue/#{id}"
|
@url = "https://muse.jhu.edu/issue/#{id}"
|
||||||
@info = Hash(String, String).new
|
@info = Hash(String, String).new
|
||||||
@articles = [] of Muse::Dl::Article
|
@articles = [] of Muse::Dl::Article
|
||||||
@issues = [] of Muse::Dl::Issue
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse
|
def parse
|
||||||
|
@ -42,17 +39,38 @@ module Muse::Dl
|
||||||
unless t.nil?
|
unless t.nil?
|
||||||
@volume = /Volume (\d+)/.match(t).try &.[1]
|
@volume = /Volume (\d+)/.match(t).try &.[1]
|
||||||
@number = /Number (\d+)/.match(t).try &.[1]
|
@number = /Number (\d+)/.match(t).try &.[1]
|
||||||
|
@number = /Issue (\d+)/.match(t).try &.[1] unless @number
|
||||||
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
|
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
|
||||||
|
@date = /(\d{4})/.match(t).try &.[1] unless @date
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse_contents(myhtml : Myhtml::Parser)
|
def parse_contents(myhtml : Myhtml::Parser)
|
||||||
myhtml.css("#available_issues_list_text a").each do |a|
|
journal_title_a = myhtml.css("#journal_banner_title a").first
|
||||||
link = a.attribute_by("href").to_s
|
if journal_title_a
|
||||||
|
@journal_title = journal_title_a.inner_text
|
||||||
|
end
|
||||||
|
myhtml.css(".articles_list_text ol").each do |ol|
|
||||||
|
link = ol.css("li.title a").first
|
||||||
|
title = link.inner_text
|
||||||
|
|
||||||
matches = /\/issue\/(\d+)/.match link
|
pages = ol.css("li.pg").first.try &.inner_text
|
||||||
|
matches = /(\d+)-(\d+)/.match pages
|
||||||
if matches
|
if matches
|
||||||
@issues.push Muse::Dl::Issue.new matches[1]
|
start_page = matches[1].to_i
|
||||||
|
end_page = matches[2].to_i
|
||||||
|
end
|
||||||
|
|
||||||
|
ol.css("a").each do |l|
|
||||||
|
url = l.attribute_by("href").to_s
|
||||||
|
matches = /\/article\/(\d+)\/pdf/.match url
|
||||||
|
if matches
|
||||||
|
a = Muse::Dl::Article.new matches[1]
|
||||||
|
a.title = title
|
||||||
|
a.start_page = start_page if start_page
|
||||||
|
a.end_page = end_page if end_page
|
||||||
|
@articles.push a
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -71,6 +71,44 @@ module Muse::Dl
|
||||||
# Needed because of https://github.com/crystal-lang/crystal/issues/7777
|
# Needed because of https://github.com/crystal-lang/crystal/issues/7777
|
||||||
FileUtils.cp source, destination
|
FileUtils.cp source, destination
|
||||||
FileUtils.rm source if parser.cleanup
|
FileUtils.rm source if parser.cleanup
|
||||||
|
elsif thing.is_a? Muse::Dl::Issue
|
||||||
|
# Will have no effect if parser has a custom title
|
||||||
|
parser.output = Util.slug_filename "#{thing.title}.pdf"
|
||||||
|
|
||||||
|
# If file exists and we can't clobber
|
||||||
|
if File.exists?(parser.output) && parser.clobber == false
|
||||||
|
STDERR.puts "Skipping #{url}, File already exists: #{parser.output}"
|
||||||
|
return
|
||||||
|
end
|
||||||
|
temp_stitched_file = nil
|
||||||
|
pdf_builder = Pdftk.new(parser.tmp)
|
||||||
|
|
||||||
|
# ## TODO till 111
|
||||||
|
thing.issues.each do |issue|
|
||||||
|
begin
|
||||||
|
Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
|
||||||
|
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
||||||
|
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
||||||
|
return
|
||||||
|
end
|
||||||
|
end
|
||||||
|
chapter_ids = thing.chapters.map { |c| c[0] }
|
||||||
|
|
||||||
|
# Stitch the PDFs together
|
||||||
|
temp_stitched_file = pdf_builder.stitch chapter_ids
|
||||||
|
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||||
|
|
||||||
|
temp_stitched_file.delete if temp_stitched_file
|
||||||
|
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
|
||||||
|
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||||
|
|
||||||
|
# Cleanup the chapter files
|
||||||
|
if parser.cleanup
|
||||||
|
thing.chapters.each do |c|
|
||||||
|
Fetch.cleanup(parser.tmp, c[0])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
####
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue