Finishes support for downloading complete issues

This commit is contained in:
Nemo 2020-06-30 17:36:44 +05:30
parent 38db0dd000
commit 62e6a21c84
7 changed files with 107 additions and 31 deletions

View File

@ -1,4 +0,0 @@
module Muse::Dl::Errors
# Raised by the PDF stitching step when the PDF file for a chapter does not
# exist on disk.
class MissingChapter < Exception
end
end

View File

@ -0,0 +1,4 @@
module Muse::Dl::Errors
# Raised by the PDF stitching steps when the PDF file for a chapter or an
# article does not exist on disk.
class MissingFile < Exception
end
end

View File

@ -132,7 +132,7 @@ module Muse::Dl
when "journal"
return Muse::Dl::Journal.new response
when "issue"
return Muse::Dl::Issue.new response
return Muse::Dl::Issue.new match[2], response
when "article"
return Muse::Dl::Article.new match[2]
end

View File

@ -35,7 +35,11 @@ module Muse::Dl
end
def self.issue_title(myhtml : Myhtml::Parser)
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
begin
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
rescue
nil
end
end
def self.author(myhtml : Myhtml::Parser)

View File

@ -16,15 +16,15 @@ module Muse::Dl
date : String | Nil,
journal_title : String | Nil
def initialize(id : String)
def initialize(id : String, response : String | Nil = nil)
@id = id
@url = "https://muse.jhu.edu/issue/#{id}"
@info = Hash(String, String).new
@articles = [] of Muse::Dl::Article
parse(response) if response
@info = Hash(String, String).new
end
def parse
html = Crest.get(url).to_s
def parse(html : String)
h = Myhtml::Parser.new html
@info = InfoParser.infobox(h)
@title = InfoParser.issue_title(h)

View File

@ -47,7 +47,7 @@ module Muse::Dl
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
puts "DL: #{url}. Saved final output to #{parser.output}"
# Cleanup the chapter files
@ -73,7 +73,7 @@ module Muse::Dl
FileUtils.rm source if parser.cleanup
elsif thing.is_a? Muse::Dl::Issue
# Will have no effect if parser has a custom title
parser.output = Util.slug_filename "#{thing.title}.pdf"
parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
# If file exists and we can't clobber
if File.exists?(parser.output) && parser.clobber == false
@ -84,30 +84,32 @@ module Muse::Dl
pdf_builder = Pdftk.new(parser.tmp)
# ## TODO till 111
thing.issues.each do |issue|
thing.articles.each do |article|
begin
Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
rescue e : Muse::Dl::Errors::MuseCorruptPDF
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
return
end
end
chapter_ids = thing.chapters.map { |c| c[0] }
article_ids = thing.articles.map { |a| a.id }
# Stitch the PDFs together
temp_stitched_file = pdf_builder.stitch chapter_ids
temp_stitched_file = pdf_builder.stitch_articles article_ids
# TODO: Add metadata for each Issue
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
# temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
puts "DL: #{url}. Saved final output to #{parser.output}"
# Cleanup the chapter files
if parser.cleanup
thing.chapters.each do |c|
Fetch.cleanup(parser.tmp, c[0])
end
end
# TODO
# if parser.cleanup
# thing.articles.each do |c|
# Fetch.cleanup(parser.tmp, c[0])
# end
# end
####
end
end

View File

@ -70,7 +70,6 @@ module Muse::Dl
def add_metadata(input_file : File, output_file : String, book : Book)
# First we have to dump the current metadata
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
keywords = "Publisher:#{book.publisher}, Published:#{book.date}"
# Known Info keys, if they are present
@ -80,7 +79,12 @@ module Muse::Dl
end
end
text = <<-EOT
metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author)
write_metadata(input_file, output_file, metadata_text)
end
def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil)
metadata = <<-EOT
InfoBegin
InfoKey: Creator
InfoValue:
@ -89,25 +93,37 @@ module Muse::Dl
InfoValue:
InfoBegin
InfoKey: Title
InfoValue: #{book.title}
InfoValue: #{title}
InfoBegin
InfoKey: Keywords
InfoValue: #{keywords}
InfoBegin
InfoKey: Author
InfoValue: #{book.author}
InfoBegin
InfoKey: Subject
InfoValue: #{book.summary.gsub(/\n\s+/, " ")}
InfoValue: #{subject}
InfoBegin
InfoKey: ModDate
InfoValue:
InfoBegin
InfoKey: CreationDate
InfoValue:
EOT
unless author.nil?
metadata += <<-EOT
InfoBegin
InfoKey: Author
InfoValue: #{author}
EOT
end
return metadata
end
def write_metadata(input_file : File, output_file : String, text)
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
File.write(metadata_text_file.path, text)
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
if !is_success
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
@ -115,11 +131,42 @@ module Muse::Dl
metadata_text_file.delete
end
# Writes PDF Info metadata for a journal issue: builds the metadata text with
# `gen_metadata` and applies it to `input_file` via `write_metadata`, which
# saves the result at `output_file`.
#
# Keywords are assembled from the issue's journal title, date, volume and
# number, followed by each of the known infobox labels that is present in
# `issue.info`. A nil title or summary is recorded as "NA". No author is
# passed along yet (see the TODO below).
def add_metadata(input_file : File, output_file : String, issue : Issue)
  # NOTE: no tempfile is created here; `write_metadata` owns the temporary
  # metadata file, so allocating another one would only leave a stray file.
  keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}"
  ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label|
    if issue.info.has_key? label
      keywords += ", #{label}:#{issue.info[label]}"
    end
  end

  # TODO: Move this to Issue class
  # Each newline followed by whitespace is collapsed into a single space.
  summary = issue.summary.try(&.gsub(/\n\s+/, " ")) || "NA"
  title = issue.title || "NA"

  # TODO: Add support for all authors in the PDF
  metadata = gen_metadata(title, keywords, summary)
  write_metadata(input_file, output_file, metadata)
end
def stitch(chapter_ids : Array(String))
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
# Do some sanity checks on each Chapter PDF
chapter_ids.each do |id|
raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
end
@ -136,5 +183,28 @@ module Muse::Dl
return output_file
end
# TODO: Merge with stitch
# Stitches the downloaded article PDFs into a single temporary PDF by passing
# `<article files...> cat output <tempfile>` to `execute`, and returns that
# tempfile.
#
# Raises `Muse::Dl::Errors::MissingFile` if an article PDF does not exist,
# `Muse::Dl::Errors::CorruptFile` if one has size zero, and
# `Muse::Dl::Errors::PDFOperationError` if `execute` reports failure.
def stitch_articles(article_ids : Array(String))
  # Resolve each file name once; the checks and the command both use it.
  article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) }

  # Do some sanity checks on each article PDF *before* creating the output
  # tempfile, so a failed check does not leave a stray file behind.
  article_files.each do |file|
    raise Muse::Dl::Errors::MissingFile.new unless File.exists? file
    raise Muse::Dl::Errors::CorruptFile.new unless File.size(file) > 0
  end

  # Now let's stitch them together
  output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
  args = article_files + ["cat", "output", output_file.path]
  is_success = execute args
  # TODO: Validate final file here
  if !is_success
    puts args
    # Nobody receives the tempfile once we raise, so remove it here.
    output_file.delete
    raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.")
  end
  return output_file
end
end
end