Improve download and pdftk error handling in muse-dl.

What this patch does:
  * adds the DownloadError and PDFOperationError exception classes;
  * downloads a chapter with a read timeout and raises DownloadError on a
    timeout, a non-success HTTP status, or an empty response body;
  * when processing an input list, skips a failing book and backs off
    (the delay doubles after a failure and halves after a success);
  * checks the pdftk exit status and raises PDFOperationError on failure.

Review amendments folded into the added lines of this revision:
  * src/fetch.cr: the HTTP::Client carrying the read timeout is now passed to
    Crest::Request (it was created but never used, so no timeout applied);
  * src/fetch.cr: the empty-download check tests the response body instead of
    File#size on a still-buffered file, and the message reads Content-Length
    from the response headers (the request headers have no such key);
  * src/muse-dl.cr: the back-off delay is halved with integer division so it
    stays an Int32;
  * src/pdftk.cr: add_metadata deletes its temporary metadata file before
    raising, so the file is not leaked when pdftk fails.

Note: the post-image hashes on the "index" lines of src/fetch.cr,
src/muse-dl.cr and src/pdftk.cr predate these amendments.

diff --git a/spec/parser_spec.cr b/spec/parser_spec.cr
index 3a09138..5bc4942 100644
--- a/spec/parser_spec.cr
+++ b/spec/parser_spec.cr
@@ -13,7 +13,6 @@ describe Muse::Dl::Parser do
     parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"])
     parser.bookmarks.should eq true
     parser.cleanup.should eq true
-    parser.tmp.should eq "/tmp"
     parser.output.should eq "tempfilename.pdf"
     parser.url.should eq "https://muse.jhu.edu/book/68534"
   end
diff --git a/src/errors/download_error.cr b/src/errors/download_error.cr
new file mode 100644
index 0000000..1208e96
--- /dev/null
+++ b/src/errors/download_error.cr
@@ -0,0 +1,4 @@
+module Muse::Dl::Errors
+  class DownloadError < Exception
+  end
+end
diff --git a/src/errors/pdf_operation_error.cr b/src/errors/pdf_operation_error.cr
new file mode 100644
index 0000000..6810fd4
--- /dev/null
+++ b/src/errors/pdf_operation_error.cr
@@ -0,0 +1,4 @@
+module Muse::Dl::Errors
+  class PDFOperationError < Exception
+  end
+end
diff --git a/src/fetch.cr b/src/fetch.cr
index fea5fc9..c713b7c 100644
--- a/src/fetch.cr
+++ b/src/fetch.cr
@@ -4,7 +4,8 @@ require "myhtml"
 
 module Muse::Dl
   class Fetch
-    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
+    USER_AGENT            = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
+    DOWNLOAD_TIMEOUT_SECS = 60
 
     HEADERS = {
       "User-Agent" => USER_AGENT,
@@ -33,6 +34,11 @@ module Muse::Dl
 
       # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
       url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
+      uri = URI.parse(url)
+      http_client = HTTP::Client.new(uri)
+      # Raise an IO::TimeoutError after DOWNLOAD_TIMEOUT_SECS seconds.
+      http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
+
       headers = HEADERS.merge({
         "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
       })
@@ -41,25 +47,37 @@ module Muse::Dl
         headers["Cookie"] = cookie
       end
 
+      request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false, http_client: http_client)
+
+      begin
+        response = request.execute
+      rescue ex : IO::TimeoutError
+        raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
+      end
+
       # TODO: Add validation for the downloaded file (should be PDF)
-      Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
-        # puts response.headers["Content-Type"]
-        content_type = response.headers["Content-Type"]
-        if content_type.is_a? String
-          if /html/.match content_type
-            puts response
-            response.body_io.each_line do |line|
-              # https://muse.jhu.edu/chapter/2383438/pdf
-              # https://muse.jhu.edu/book/67393
-              # Errors are Unable to determine page runs / Unable to construct chapter PDF
-              if /Unable to/.match line
-                raise Muse::Dl::Errors::MuseCorruptPDF.new
-              end
+      if !response.success?
+        raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
+      end
+
+      content_type = response.headers["Content-Type"]
+      if content_type.is_a? String
+        if /html/.match content_type
+          puts response
+          response.body.each_line do |line|
+            # https://muse.jhu.edu/chapter/2383438/pdf
+            # https://muse.jhu.edu/book/67393
+            # Errors are Unable to determine page runs / Unable to construct chapter PDF
+            if /Unable to/.match line
+              raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}")
             end
           end
         end
-        File.open(tmp_pdf_file, "w") do |file|
-          IO.copy(response.body_io, file)
+      end
+      File.open(tmp_pdf_file, "w") do |file|
+        file << response.body
+        if response.body.empty?
+          raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{response.headers["Content-Length"]?}")
         end
       end
 
@@ -89,10 +107,10 @@ module Muse::Dl
             return Muse::Dl::Journal.new response
           end
         rescue ex : Crest::NotFound
-          raise Muse::Dl::Errors::InvalidLink.new
+          raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}")
         end
       else
-        raise Muse::Dl::Errors::InvalidLink.new
+        raise Muse::Dl::Errors::InvalidLink.new("Error - url does not match expected pattern: #{url}")
       end
     end
   end
diff --git a/src/muse-dl.cr b/src/muse-dl.cr
index 5ae80e5..c5fa70e 100644
--- a/src/muse-dl.cr
+++ b/src/muse-dl.cr
@@ -65,14 +65,26 @@ module Muse::Dl
 
     def self.run(args : Array(String))
       parser = Parser.new(args)
+      delay_secs = 1
       input_list = parser.input_list
       if input_list
         File.each_line input_list do |url|
-          # TODO: Change this to nil
-          parser.reset_output_file
-          parser.url = url.strip
-          # Ask the download process to not quit the process, and return instead
-          Main.dl parser
+          begin
+            # TODO: Change this to nil
+            parser.reset_output_file
+            parser.url = url.strip
+            # Ask the download process to not quit the process, and return instead
+            Main.dl parser
+            if delay_secs >= 2
+              delay_secs //= 2
+            end
+          rescue ex
+            puts ex.message
+            puts ex.backtrace.join("\n ")
+            puts "Error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing."
+            sleep(delay_secs)
+            delay_secs *= 2
+          end
         end
       elsif parser.url
         Main.dl parser
diff --git a/src/pdftk.cr b/src/pdftk.cr
index d62c500..6833b5f 100644
--- a/src/pdftk.cr
+++ b/src/pdftk.cr
@@ -28,14 +28,22 @@ module Muse::Dl
     def execute(args : Array(String))
       binary = @binary
       if binary
-        Process.run(binary, args)
+        status = Process.run(binary, args, output: STDOUT, error: STDERR)
+        if !status.success?
+          puts "pdftk command failed: #{binary} #{args.join(" ")}"
+        end
+        return status.success?
       end
     end
 
     def strip_first_page(input_file : String)
       output_pdf = File.tempfile("muse-dl-temp", ".pdf")
-      execute [input_file, "cat", "2-end", "output", output_pdf.path]
-      File.rename output_pdf.path, input_file
+      is_success = execute [input_file, "cat", "2-end", "output", output_pdf.path]
+      if is_success
+        File.rename output_pdf.path, input_file
+      else
+        raise Muse::Dl::Errors::PDFOperationError.new("Error stripping first page of chapter.")
+      end
     end
 
     def add_bookmark(input_file : String, title : String)
@@ -48,11 +56,15 @@ module Muse::Dl
       BookmarkPageNumber: 1
       END
       File.write(bookmark_text_file.path, bookmark_text)
-      execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
+      is_success = execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
 
       # Cleanup
       bookmark_text_file.delete
-      File.rename output_pdf.path, input_file
+      if is_success
+        File.rename output_pdf.path, input_file
+      else
+        raise Muse::Dl::Errors::PDFOperationError.new("Error adding bookmark metadata to chapter.")
+      end
     end
 
     def add_metadata(input_file : File, output_file : String, book : Book)
@@ -95,7 +107,10 @@ module Muse::Dl
       EOT
 
       File.write(metadata_text_file.path, text)
-      execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
+      is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
       metadata_text_file.delete
+      if !is_success
+        raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
+      end
     end
 
@@ -111,9 +126,12 @@ module Muse::Dl
 
       chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }
       args = chapter_files + ["cat", "output", output_file.path]
-      execute args
+      is_success = execute args
 
       # TODO: Validate final file here
+      if !is_success
+        raise Muse::Dl::Errors::PDFOperationError.new("Error stitching chapters together.")
+      end
       return output_file
     end
 