mirror of https://github.com/captn3m0/muse-dl.git
Finishes support for downloading complete issues
This commit is contained in:
parent
38db0dd000
commit
62e6a21c84
|
@ -1,4 +0,0 @@
|
||||||
module Muse::Dl::Errors
|
|
||||||
class MissingChapter < Exception
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Muse::Dl::Errors
|
||||||
|
class MissingFile < Exception
|
||||||
|
end
|
||||||
|
end
|
|
@ -132,7 +132,7 @@ module Muse::Dl
|
||||||
when "journal"
|
when "journal"
|
||||||
return Muse::Dl::Journal.new response
|
return Muse::Dl::Journal.new response
|
||||||
when "issue"
|
when "issue"
|
||||||
return Muse::Dl::Issue.new response
|
return Muse::Dl::Issue.new match[2], response
|
||||||
when "article"
|
when "article"
|
||||||
return Muse::Dl::Article.new match[2]
|
return Muse::Dl::Article.new match[2]
|
||||||
end
|
end
|
||||||
|
|
|
@ -35,7 +35,11 @@ module Muse::Dl
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.issue_title(myhtml : Myhtml::Parser)
|
def self.issue_title(myhtml : Myhtml::Parser)
|
||||||
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
|
begin
|
||||||
|
myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
|
||||||
|
rescue
|
||||||
|
nil
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.author(myhtml : Myhtml::Parser)
|
def self.author(myhtml : Myhtml::Parser)
|
||||||
|
|
|
@ -16,15 +16,15 @@ module Muse::Dl
|
||||||
date : String | Nil,
|
date : String | Nil,
|
||||||
journal_title : String | Nil
|
journal_title : String | Nil
|
||||||
|
|
||||||
def initialize(id : String)
|
def initialize(id : String, response : String | Nil = nil)
|
||||||
@id = id
|
@id = id
|
||||||
@url = "https://muse.jhu.edu/issue/#{id}"
|
@url = "https://muse.jhu.edu/issue/#{id}"
|
||||||
@info = Hash(String, String).new
|
|
||||||
@articles = [] of Muse::Dl::Article
|
@articles = [] of Muse::Dl::Article
|
||||||
|
parse(response) if response
|
||||||
|
@info = Hash(String, String).new
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse
|
def parse(html : String)
|
||||||
html = Crest.get(url).to_s
|
|
||||||
h = Myhtml::Parser.new html
|
h = Myhtml::Parser.new html
|
||||||
@info = InfoParser.infobox(h)
|
@info = InfoParser.infobox(h)
|
||||||
@title = InfoParser.issue_title(h)
|
@title = InfoParser.issue_title(h)
|
||||||
|
|
|
@ -47,7 +47,7 @@ module Muse::Dl
|
||||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||||
|
|
||||||
temp_stitched_file.delete if temp_stitched_file
|
temp_stitched_file.delete if temp_stitched_file
|
||||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
|
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||||
|
|
||||||
# Cleanup the chapter files
|
# Cleanup the chapter files
|
||||||
|
@ -73,7 +73,7 @@ module Muse::Dl
|
||||||
FileUtils.rm source if parser.cleanup
|
FileUtils.rm source if parser.cleanup
|
||||||
elsif thing.is_a? Muse::Dl::Issue
|
elsif thing.is_a? Muse::Dl::Issue
|
||||||
# Will have no effect if parser has a custom title
|
# Will have no effect if parser has a custom title
|
||||||
parser.output = Util.slug_filename "#{thing.title}.pdf"
|
parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
|
||||||
|
|
||||||
# If file exists and we can't clobber
|
# If file exists and we can't clobber
|
||||||
if File.exists?(parser.output) && parser.clobber == false
|
if File.exists?(parser.output) && parser.clobber == false
|
||||||
|
@ -84,30 +84,32 @@ module Muse::Dl
|
||||||
pdf_builder = Pdftk.new(parser.tmp)
|
pdf_builder = Pdftk.new(parser.tmp)
|
||||||
|
|
||||||
# ## TODO till 111
|
# ## TODO till 111
|
||||||
thing.issues.each do |issue|
|
thing.articles.each do |article|
|
||||||
begin
|
begin
|
||||||
Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
|
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
|
||||||
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
||||||
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
chapter_ids = thing.chapters.map { |c| c[0] }
|
article_ids = thing.articles.map { |a| a.id }
|
||||||
|
|
||||||
# Stitch the PDFs together
|
# Stitch the PDFs together
|
||||||
temp_stitched_file = pdf_builder.stitch chapter_ids
|
temp_stitched_file = pdf_builder.stitch_articles article_ids
|
||||||
|
# TODO: Add metadata for each Issue
|
||||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||||
|
|
||||||
temp_stitched_file.delete if temp_stitched_file
|
# temp_stitched_file.delete if temp_stitched_file
|
||||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
|
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||||
|
|
||||||
# Cleanup the chapter files
|
# Cleanup the chapter files
|
||||||
if parser.cleanup
|
# TODO
|
||||||
thing.chapters.each do |c|
|
# if parser.cleanup
|
||||||
Fetch.cleanup(parser.tmp, c[0])
|
# thing.articles.each do |c|
|
||||||
end
|
# Fetch.cleanup(parser.tmp, c[0])
|
||||||
end
|
# end
|
||||||
|
# end
|
||||||
####
|
####
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
86
src/pdftk.cr
86
src/pdftk.cr
|
@ -70,7 +70,6 @@ module Muse::Dl
|
||||||
|
|
||||||
def add_metadata(input_file : File, output_file : String, book : Book)
|
def add_metadata(input_file : File, output_file : String, book : Book)
|
||||||
# First we have to dump the current metadata
|
# First we have to dump the current metadata
|
||||||
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
|
||||||
keywords = "Publisher:#{book.publisher}, Published:#{book.date}"
|
keywords = "Publisher:#{book.publisher}, Published:#{book.date}"
|
||||||
|
|
||||||
# Known Info keys, if they are present
|
# Known Info keys, if they are present
|
||||||
|
@ -80,7 +79,12 @@ module Muse::Dl
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
text = <<-EOT
|
metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author)
|
||||||
|
write_metadata(input_file, output_file, metadata_text)
|
||||||
|
end
|
||||||
|
|
||||||
|
def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil)
|
||||||
|
metadata = <<-EOT
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: Creator
|
InfoKey: Creator
|
||||||
InfoValue:
|
InfoValue:
|
||||||
|
@ -89,25 +93,37 @@ module Muse::Dl
|
||||||
InfoValue:
|
InfoValue:
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: Title
|
InfoKey: Title
|
||||||
InfoValue: #{book.title}
|
InfoValue: #{title}
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: Keywords
|
InfoKey: Keywords
|
||||||
InfoValue: #{keywords}
|
InfoValue: #{keywords}
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: Author
|
|
||||||
InfoValue: #{book.author}
|
|
||||||
InfoBegin
|
|
||||||
InfoKey: Subject
|
InfoKey: Subject
|
||||||
InfoValue: #{book.summary.gsub(/\n\s+/, " ")}
|
InfoValue: #{subject}
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: ModDate
|
InfoKey: ModDate
|
||||||
InfoValue:
|
InfoValue:
|
||||||
InfoBegin
|
InfoBegin
|
||||||
InfoKey: CreationDate
|
InfoKey: CreationDate
|
||||||
InfoValue:
|
InfoValue:
|
||||||
|
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
unless author.nil?
|
||||||
|
metadata += <<-EOT
|
||||||
|
InfoBegin
|
||||||
|
InfoKey: Author
|
||||||
|
InfoValue: #{author}
|
||||||
|
EOT
|
||||||
|
end
|
||||||
|
|
||||||
|
return metadata
|
||||||
|
end
|
||||||
|
|
||||||
|
def write_metadata(input_file : File, output_file : String, text)
|
||||||
|
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||||
File.write(metadata_text_file.path, text)
|
File.write(metadata_text_file.path, text)
|
||||||
|
|
||||||
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
||||||
if !is_success
|
if !is_success
|
||||||
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
|
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
|
||||||
|
@ -115,11 +131,42 @@ module Muse::Dl
|
||||||
metadata_text_file.delete
|
metadata_text_file.delete
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def add_metadata(input_file : File, output_file : String, issue : Issue)
|
||||||
|
# First we have to dump the current metadata
|
||||||
|
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||||
|
keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}"
|
||||||
|
["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label|
|
||||||
|
if issue.info.has_key? label
|
||||||
|
keywords += ", #{label}:#{issue.info[label]}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# TODO: Move this to Issue class
|
||||||
|
|
||||||
|
s = issue.summary
|
||||||
|
unless s.nil?
|
||||||
|
summary = s.gsub(/\n\s+/, " ")
|
||||||
|
else
|
||||||
|
summary = "NA"
|
||||||
|
end
|
||||||
|
|
||||||
|
t = issue.title
|
||||||
|
|
||||||
|
unless t.nil?
|
||||||
|
title = t
|
||||||
|
else
|
||||||
|
title = "NA"
|
||||||
|
end
|
||||||
|
# TODO: Add support for all authors in the PDF
|
||||||
|
metadata = gen_metadata(title, keywords, summary)
|
||||||
|
write_metadata(input_file, output_file, metadata)
|
||||||
|
end
|
||||||
|
|
||||||
def stitch(chapter_ids : Array(String))
|
def stitch(chapter_ids : Array(String))
|
||||||
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
|
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
|
||||||
# Do some sanity checks on each Chapter PDF
|
# Do some sanity checks on each Chapter PDF
|
||||||
chapter_ids.each do |id|
|
chapter_ids.each do |id|
|
||||||
raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||||
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
|
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -136,5 +183,28 @@ module Muse::Dl
|
||||||
|
|
||||||
return output_file
|
return output_file
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# TODO: Merge with stitch
|
||||||
|
def stitch_articles(article_ids : Array(String))
|
||||||
|
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
|
||||||
|
# Do some sanity checks on each Chapter PDF
|
||||||
|
article_ids.each do |id|
|
||||||
|
raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.article_file_name(id, @tmp_file_path)
|
||||||
|
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.article_file_name(id, @tmp_file_path)) > 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Now let's stitch them together
|
||||||
|
article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) }
|
||||||
|
args = article_files + ["cat", "output", output_file.path]
|
||||||
|
is_success = execute args
|
||||||
|
|
||||||
|
# TODO: Validate final file here
|
||||||
|
if !is_success
|
||||||
|
puts args
|
||||||
|
raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.")
|
||||||
|
end
|
||||||
|
|
||||||
|
return output_file
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue