mirror of https://github.com/captn3m0/muse-dl.git
Finishes support for downloading complete issues
This commit is contained in:
parent
38db0dd000
commit
62e6a21c84
|
@ -1,4 +0,0 @@
|
|||
module Muse::Dl::Errors
|
||||
class MissingChapter < Exception
|
||||
end
|
||||
end
|
|
@ -0,0 +1,4 @@
|
|||
module Muse::Dl::Errors
|
||||
# Raised by the PDF stitching code when a chapter or article PDF that is
# about to be stitched does not exist on disk.
class MissingFile < Exception
|
||||
end
|
||||
end
|
|
@ -132,7 +132,7 @@ module Muse::Dl
|
|||
when "journal"
|
||||
return Muse::Dl::Journal.new response
|
||||
when "issue"
|
||||
return Muse::Dl::Issue.new response
|
||||
return Muse::Dl::Issue.new match[2], response
|
||||
when "article"
|
||||
return Muse::Dl::Article.new match[2]
|
||||
end
|
||||
|
|
|
@ -35,7 +35,11 @@ module Muse::Dl
|
|||
end
|
||||
|
||||
def self.issue_title(myhtml : Myhtml::Parser)
|
||||
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
|
||||
begin
|
||||
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
|
||||
rescue
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
def self.author(myhtml : Myhtml::Parser)
|
||||
|
|
|
@ -16,15 +16,15 @@ module Muse::Dl
|
|||
date : String | Nil,
|
||||
journal_title : String | Nil
|
||||
|
||||
def initialize(id : String)
|
||||
def initialize(id : String, response : String | Nil = nil)
|
||||
@id = id
|
||||
@url = "https://muse.jhu.edu/issue/#{id}"
|
||||
@info = Hash(String, String).new
|
||||
@articles = [] of Muse::Dl::Article
|
||||
parse(response) if response
|
||||
@info = Hash(String, String).new
|
||||
end
|
||||
|
||||
def parse
|
||||
html = Crest.get(url).to_s
|
||||
def parse(html : String)
|
||||
h = Myhtml::Parser.new html
|
||||
@info = InfoParser.infobox(h)
|
||||
@title = InfoParser.issue_title(h)
|
||||
|
|
|
@ -47,7 +47,7 @@ module Muse::Dl
|
|||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||
|
||||
temp_stitched_file.delete if temp_stitched_file
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||
|
||||
# Cleanup the chapter files
|
||||
|
@ -73,7 +73,7 @@ module Muse::Dl
|
|||
FileUtils.rm source if parser.cleanup
|
||||
elsif thing.is_a? Muse::Dl::Issue
|
||||
# Will have no effect if parser has a custom title
|
||||
parser.output = Util.slug_filename "#{thing.title}.pdf"
|
||||
parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
|
||||
|
||||
# If file exists and we can't clobber
|
||||
if File.exists?(parser.output) && parser.clobber == false
|
||||
|
@ -84,30 +84,32 @@ module Muse::Dl
|
|||
pdf_builder = Pdftk.new(parser.tmp)
|
||||
|
||||
# ## TODO till 111
|
||||
thing.issues.each do |issue|
|
||||
thing.articles.each do |article|
|
||||
begin
|
||||
Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
|
||||
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
|
||||
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
||||
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
||||
return
|
||||
end
|
||||
end
|
||||
chapter_ids = thing.chapters.map { |c| c[0] }
|
||||
article_ids = thing.articles.map { |a| a.id }
|
||||
|
||||
# Stitch the PDFs together
|
||||
temp_stitched_file = pdf_builder.stitch chapter_ids
|
||||
temp_stitched_file = pdf_builder.stitch_articles article_ids
|
||||
# TODO: Add metadata for each Issue
|
||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||
|
||||
temp_stitched_file.delete if temp_stitched_file
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
|
||||
# temp_stitched_file.delete if temp_stitched_file
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||
|
||||
# Cleanup the chapter files
|
||||
if parser.cleanup
|
||||
thing.chapters.each do |c|
|
||||
Fetch.cleanup(parser.tmp, c[0])
|
||||
end
|
||||
end
|
||||
# TODO
|
||||
# if parser.cleanup
|
||||
# thing.articles.each do |c|
|
||||
# Fetch.cleanup(parser.tmp, c[0])
|
||||
# end
|
||||
# end
|
||||
####
|
||||
end
|
||||
end
|
||||
|
|
86
src/pdftk.cr
86
src/pdftk.cr
|
@ -70,7 +70,6 @@ module Muse::Dl
|
|||
|
||||
def add_metadata(input_file : File, output_file : String, book : Book)
|
||||
# First we have to dump the current metadata
|
||||
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||
keywords = "Publisher:#{book.publisher}, Published:#{book.date}"
|
||||
|
||||
# Known Info keys, if they are present
|
||||
|
@ -80,7 +79,12 @@ module Muse::Dl
|
|||
end
|
||||
end
|
||||
|
||||
text = <<-EOT
|
||||
metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author)
|
||||
write_metadata(input_file, output_file, metadata_text)
|
||||
end
|
||||
|
||||
def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil)
|
||||
metadata = <<-EOT
|
||||
InfoBegin
|
||||
InfoKey: Creator
|
||||
InfoValue:
|
||||
|
@ -89,25 +93,37 @@ module Muse::Dl
|
|||
InfoValue:
|
||||
InfoBegin
|
||||
InfoKey: Title
|
||||
InfoValue: #{book.title}
|
||||
InfoValue: #{title}
|
||||
InfoBegin
|
||||
InfoKey: Keywords
|
||||
InfoValue: #{keywords}
|
||||
InfoBegin
|
||||
InfoKey: Author
|
||||
InfoValue: #{book.author}
|
||||
InfoBegin
|
||||
InfoKey: Subject
|
||||
InfoValue: #{book.summary.gsub(/\n\s+/, " ")}
|
||||
InfoValue: #{subject}
|
||||
InfoBegin
|
||||
InfoKey: ModDate
|
||||
InfoValue:
|
||||
InfoBegin
|
||||
InfoKey: CreationDate
|
||||
InfoValue:
|
||||
|
||||
EOT
|
||||
|
||||
unless author.nil?
|
||||
metadata += <<-EOT
|
||||
InfoBegin
|
||||
InfoKey: Author
|
||||
InfoValue: #{author}
|
||||
EOT
|
||||
end
|
||||
|
||||
return metadata
|
||||
end
|
||||
|
||||
def write_metadata(input_file : File, output_file : String, text)
|
||||
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||
File.write(metadata_text_file.path, text)
|
||||
|
||||
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
||||
if !is_success
|
||||
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
|
||||
|
@ -115,11 +131,42 @@ module Muse::Dl
|
|||
metadata_text_file.delete
|
||||
end
|
||||
|
||||
# Writes issue-level metadata into a stitched issue PDF.
#
# Builds a keyword string from the issue's journal title, date, volume and
# number, appends every known infobox label that is present in `issue.info`,
# and passes the generated metadata text to `write_metadata`, which produces
# `output_file` from `input_file`.
#
# A missing (nil) title or summary is recorded as "NA". No author is passed
# to `gen_metadata` for issues.
#
# No temp file is created here: `write_metadata` creates and deletes its own
# metadata temp file, so allocating another one in this method would only
# leave a stray file behind.
def add_metadata(input_file : File, output_file : String, issue : Issue)
  keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}"
  ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label|
    if issue.info.has_key? label
      keywords += ", #{label}:#{issue.info[label]}"
    end
  end

  # TODO: Move this to Issue class
  # Collapse line breaks followed by indentation into single spaces.
  summary = issue.summary.try(&.gsub(/\n\s+/, " ")) || "NA"
  title = issue.title || "NA"

  # TODO: Add support for all authors in the PDF
  metadata = gen_metadata(title, keywords, summary)
  write_metadata(input_file, output_file, metadata)
end
|
||||
|
||||
def stitch(chapter_ids : Array(String))
|
||||
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
|
||||
# Do some sanity checks on each Chapter PDF
|
||||
chapter_ids.each do |id|
|
||||
raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||
raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
|
||||
end
|
||||
|
||||
|
@ -136,5 +183,28 @@ module Muse::Dl
|
|||
|
||||
return output_file
|
||||
end
|
||||
|
||||
# TODO: Merge with stitch
|
||||
# Concatenates the downloaded article PDFs, in the order given by
# `article_ids`, into a single temporary PDF and returns that temp file.
#
# Raises Muse::Dl::Errors::MissingFile if an article PDF does not exist,
# Muse::Dl::Errors::CorruptFile if one is empty, and
# Muse::Dl::Errors::PDFOperationError if the stitching command fails.
def stitch_articles(article_ids : Array(String))
  stitched = File.tempfile("muse-dl-stitched-tmp", ".pdf")

  # Resolve each article's PDF path once.
  # NOTE(review): assumes Fetch.article_file_name only builds a path - confirm.
  pdf_paths = article_ids.map { |article_id| Fetch.article_file_name(article_id, @tmp_file_path) }

  # Sanity-check every article PDF before stitching
  pdf_paths.each do |pdf_path|
    raise Muse::Dl::Errors::MissingFile.new unless File.exists?(pdf_path)
    raise Muse::Dl::Errors::CorruptFile.new if File.size(pdf_path) <= 0
  end

  # Inputs first, then the "cat" operation and the output target
  pdftk_args = pdf_paths + ["cat", "output", stitched.path]

  # TODO: Validate final file here
  unless execute(pdftk_args)
    puts pdftk_args
    raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.")
  end

  stitched
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue