diff --git a/src/article.cr b/src/article.cr index 8da80b6..2d6e6a4 100644 --- a/src/article.cr +++ b/src/article.cr @@ -4,6 +4,7 @@ require "./issue.cr" module Muse::Dl class Article @id : String + getter :id def initialize(id : String) @id = id diff --git a/src/fetch.cr b/src/fetch.cr index acb96ab..ef15d29 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -14,6 +14,10 @@ module Muse::Dl "Connection" => "keep-alive", } + def self.article_file_name(id : String, tmp_path : String) + "#{tmp_path}/article-#{id}.pdf" + end + def self.chapter_file_name(id : String, tmp_path : String) "#{tmp_path}/chapter-#{id}.pdf" end @@ -23,24 +27,20 @@ module Muse::Dl File.delete(fns) if File.exists?(fns) end - def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) - final_pdf_file = chapter_file_name chapter_id, tmp_path - tmp_pdf_file = "#{final_pdf_file}.tmp" - - if File.exists? final_pdf_file - puts "#{chapter_id} already downloaded" + def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) + tmp_pdf_file = "#{file_name}.tmp" + if File.exists? file_name + puts "#{file_name} already downloaded" return end - # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class - url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" uri = URI.parse(url) http_client = HTTP::Client.new(uri) # Raise a IO::TimeoutError after 60 seconds. http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS headers = HEADERS.merge({ - "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", + "Referer" => referer, }) if cookie @@ -52,7 +52,7 @@ module Muse::Dl begin response = request.execute rescue ex : IO::TimeoutError - raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") + raise Muse::Dl::Errors::DownloadError.new("Error downloading #{url}. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") end # TODO: Add validation for the downloaded file (should be PDF) @@ -76,6 +76,7 @@ module Muse::Dl end end end + File.open(tmp_pdf_file, "w") do |file| file << response.body if file.size == 0 @@ -87,16 +88,39 @@ module Muse::Dl pdftk.strip_first_page tmp_pdf_file if strip_first_page - if add_bookmark + if bookmark_title # Run pdftk and add the bookmark to the file - pdftk.add_bookmark tmp_pdf_file, chapter_title.strip + pdftk.add_bookmark tmp_pdf_file, bookmark_title end # Now we can move the file to the proper PDF filename - File.rename tmp_pdf_file, final_pdf_file + File.rename tmp_pdf_file, file_name + end + + def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) + final_pdf_file = chapter_file_name chapter_id, tmp_path + + if File.exists? final_pdf_file + puts "#{chapter_id} already downloaded" + return + end + + # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class + url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" + referer = "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf" + + save_url(url, referer, final_pdf_file, tmp_path, cookie, chapter_title, strip_first_page) + puts "Downloaded #{chapter_id}" end + def self.save_article(tmp_path : String, article_id : String, cookie : String | Nil = nil, article_title = nil, strip_first_page = true) + file_name = article_file_name article_id, tmp_path + url = "https://muse.jhu.edu/article/#{article_id}/pdf" + referer = "https://muse.jhu.edu/article/#{article_id}" + save_url(url, referer, file_name, tmp_path, cookie, article_title, strip_first_page) + end + def self.get_info(url : String) match = /https:\/\/muse.jhu.edu\/(book|journal|issue|article)\/(\d+)/.match url if match diff --git a/src/muse-dl.cr b/src/muse-dl.cr index a0534d8..26fd572 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -4,6 +4,7 @@ require "./fetch.cr" require "./book.cr" require "./journal.cr" require "./util.cr" +require "file_utils" module Muse::Dl VERSION = "1.1.2" @@ -56,7 +57,20 @@ module Muse::Dl end end elsif thing.is_a? Muse::Dl::Article - puts(thing) + # No bookmarks are needed since this is just a single article PDF + begin + Fetch.save_article(parser.tmp, thing.id, parser.cookie, nil, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return + end + + # TODO: Move this code elsewhere + source = Fetch.article_file_name(thing.id, parser.tmp) + destination = "article-#{thing.id}.pdf" + # Needed because of https://github.com/crystal-lang/crystal/issues/7777 + FileUtils.cp source, destination + FileUtils.rm source if parser.cleanup end end