diff --git a/src/errors/missing_chapter.cr b/src/errors/missing_chapter.cr deleted file mode 100644 index b33487f..0000000 --- a/src/errors/missing_chapter.cr +++ /dev/null @@ -1,4 +0,0 @@ -module Muse::Dl::Errors - class MissingChapter < Exception - end -end diff --git a/src/errors/missing_file.cr b/src/errors/missing_file.cr new file mode 100644 index 0000000..f11f6fd --- /dev/null +++ b/src/errors/missing_file.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class MissingFile < Exception + end +end diff --git a/src/fetch.cr b/src/fetch.cr index ef15d29..963c894 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -132,7 +132,7 @@ module Muse::Dl when "journal" return Muse::Dl::Journal.new response when "issue" - return Muse::Dl::Issue.new response + return Muse::Dl::Issue.new match[2], response when "article" return Muse::Dl::Article.new match[2] end diff --git a/src/infoparser.cr b/src/infoparser.cr index 836c9e3..8f10d28 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -35,7 +35,11 @@ module Muse::Dl end def self.issue_title(myhtml : Myhtml::Parser) - myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + begin + myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + rescue + nil + end end def self.author(myhtml : Myhtml::Parser) diff --git a/src/issue.cr b/src/issue.cr index 4a0c97a..22c8dae 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -16,15 +16,15 @@ module Muse::Dl date : String | Nil, journal_title : String | Nil - def initialize(id : String) + def initialize(id : String, response : String | Nil = nil) @id = id @url = "https://muse.jhu.edu/issue/#{id}" - @info = Hash(String, String).new @articles = [] of Muse::Dl::Article + parse(response) if response + @info = Hash(String, String).new end - def parse - html = Crest.get(url).to_s + def parse(html : String) h = Myhtml::Parser.new html @info = InfoParser.infobox(h) @title = InfoParser.issue_title(h) diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 656dab1..07e6f25 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -47,7 +47,7 @@ module Muse::Dl pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files @@ -73,7 +73,7 @@ module Muse::Dl FileUtils.rm source if parser.cleanup elsif thing.is_a? Muse::Dl::Issue # Will have no effect if parser has a custom title - parser.output = Util.slug_filename "#{thing.title}.pdf" + parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" # If file exists and we can't clobber if File.exists?(parser.output) && parser.clobber == false @@ -84,30 +84,32 @@ module Muse::Dl pdf_builder = Pdftk.new(parser.tmp) # ## TODO till 111 - thing.issues.each do |issue| + thing.articles.each do |article| begin - Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) + Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) rescue e : Muse::Dl::Errors::MuseCorruptPDF STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" return end end - chapter_ids = thing.chapters.map { |c| c[0] } + article_ids = thing.articles.map { |a| a.id } # Stitch the PDFs together - temp_stitched_file = pdf_builder.stitch chapter_ids + temp_stitched_file = pdf_builder.stitch_articles article_ids + # TODO: Add metadata for each Issue pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) - temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first + # temp_stitched_file.delete if temp_stitched_file + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files - if parser.cleanup - thing.chapters.each do |c| - Fetch.cleanup(parser.tmp, c[0]) - end - end + # TODO + # if parser.cleanup + # thing.articles.each do |c| + # Fetch.cleanup(parser.tmp, c[0]) + # end + # end #### end end diff --git a/src/pdftk.cr b/src/pdftk.cr index 4b5a01c..41ebcf0 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -70,7 +70,6 @@ module Muse::Dl def add_metadata(input_file : File, output_file : String, book : Book) # First we have to dump the current metadata - metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") keywords = "Publisher:#{book.publisher}, Published:#{book.date}" # Known Info keys, if they are present @@ -80,7 +79,12 @@ module Muse::Dl end end - text = <<-EOT + metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author) + write_metadata(input_file, output_file, metadata_text) + end + + def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil) + metadata = <<-EOT InfoBegin InfoKey: Creator InfoValue: @@ -89,25 +93,37 @@ module Muse::Dl InfoValue: InfoBegin InfoKey: Title - InfoValue: #{book.title} + InfoValue: #{title} InfoBegin InfoKey: Keywords InfoValue: #{keywords} InfoBegin - InfoKey: Author - InfoValue: #{book.author} - InfoBegin InfoKey: Subject - InfoValue: #{book.summary.gsub(/\n\s+/, " ")} + InfoValue: #{subject} InfoBegin InfoKey: ModDate InfoValue: InfoBegin InfoKey: CreationDate InfoValue: + EOT + unless author.nil? + metadata += <<-EOT + InfoBegin + InfoKey: Author + InfoValue: #{author} + EOT + end + + return metadata + end + + def write_metadata(input_file : File, output_file : String, text) + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") File.write(metadata_text_file.path, text) + is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] if !is_success raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.") @@ -115,11 +131,42 @@ module Muse::Dl metadata_text_file.delete end + def add_metadata(input_file : File, output_file : String, issue : Issue) + # First we have to dump the current metadata + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") + keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}" + ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label| + if issue.info.has_key? label + keywords += ", #{label}:#{issue.info[label]}" + end + end + + # TODO: Move this to Issue class + + s = issue.summary + unless s.nil? + summary = s.gsub(/\n\s+/, " ") + else + summary = "NA" + end + + t = issue.title + + unless t.nil? + title = t + else + title = "NA" + end + # TODO: Add support for all authors in the PDF + metadata = gen_metadata(title, keywords, summary) + write_metadata(input_file, output_file, metadata) + end + def stitch(chapter_ids : Array(String)) output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") # Do some sanity checks on each Chapter PDF chapter_ids.each do |id| - raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0 end @@ -136,5 +183,28 @@ module Muse::Dl return output_file end + + # TODO: Merge with stitch + def stitch_articles(article_ids : Array(String)) + output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") + # Do some sanity checks on each Chapter PDF + article_ids.each do |id| + raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.article_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.article_file_name(id, @tmp_file_path)) > 0 + end + + # Now let's stitch them together + article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) } + args = article_files + ["cat", "output", output_file.path] + is_success = execute args + + # TODO: Validate final file here + if !is_success + puts args + raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.") + end + + return output_file + end end end