From 483f838d24b9dc9df5415f748b21af8049b64e1c Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 01:58:20 -0400 Subject: [PATCH 1/9] Report pdftk and download errors. Add exponential backoff to downloading after download failures. Add top-level rescue block to improve forward progress. --- src/fetch.cr | 12 ++++++++++-- src/muse-dl.cr | 26 +++++++++++++++++++++----- src/pdftk.cr | 32 +++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index 58b69a0..dd80dee 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -43,7 +43,10 @@ module Muse::Dl # TODO: Add validation for the downloaded file (should be PDF) Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| - # puts response.headers["Content-Type"] + if !response.success? + raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}") + end + content_type = response.headers["Content-Type"] if content_type.is_a? String if /html/.match content_type @@ -59,7 +62,12 @@ module Muse::Dl end end File.open(tmp_pdf_file, "w") do |file| - IO.copy(response.body_io, file) + response_str = response.body + file << response_str + if file.size == 0 + # puts response.headers + raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response size was #{response_str.bytesize}") + end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 5ae80e5..fcce75d 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -65,14 +65,30 @@ module Muse::Dl def self.run(args : Array(String)) parser = Parser.new(args) + delay_secs = 1 input_list = parser.input_list if input_list File.each_line input_list do |url| - # TODO: Change this to nil - parser.reset_output_file - parser.url = url.strip - # Ask the download process to not quit the process, and return instead - Main.dl parser + begin + # TODO: Change this to nil + parser.reset_output_file + parser.url = url.strip + # Ask the download process to not quit the process, and return instead + Main.dl parser + if delay_secs >= 2 + delay_secs /= 2 + end + rescue ex : Muse::Dl::Errors::DownloadError + puts ex + puts "Download error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing." + # Sleep to prevent hammering the server. + sleep(delay_secs) + delay_secs *= 2 + rescue ex + puts ex + puts "Non-download error. Skipping book: #{url}." + sleep(1) + end end elsif parser.url Main.dl parser diff --git a/src/pdftk.cr b/src/pdftk.cr index d62c500..6833b5f 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -28,14 +28,22 @@ module Muse::Dl def execute(args : Array(String)) binary = @binary if binary - Process.run(binary, args) + status = Process.run(binary, args, output: STDOUT, error: STDERR) + if !status.success? + puts "pdftk command failed: #{binary} #{args.join(" ")}" + end + return status.success? end end def strip_first_page(input_file : String) output_pdf = File.tempfile("muse-dl-temp", ".pdf") - execute [input_file, "cat", "2-end", "output", output_pdf.path] - File.rename output_pdf.path, input_file + is_success = execute [input_file, "cat", "2-end", "output", output_pdf.path] + if is_success + File.rename output_pdf.path, input_file + else + raise Muse::Dl::Errors::PDFOperationError.new("Error stripping first page of chapter.") + end end def add_bookmark(input_file : String, title : String) @@ -48,11 +56,15 @@ module Muse::Dl BookmarkPageNumber: 1 END File.write(bookmark_text_file.path, bookmark_text) - execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path] + is_success = execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path] # Cleanup bookmark_text_file.delete - File.rename output_pdf.path, input_file + if is_success + File.rename output_pdf.path, input_file + else + raise Muse::Dl::Errors::PDFOperationError.new("Error adding bookmark metadata to chapter.") + end end def add_metadata(input_file : File, output_file : String, book : Book) @@ -95,7 +107,10 @@ module Muse::Dl EOT File.write(metadata_text_file.path, text) - execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] + is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] + if !is_success + raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.") + end metadata_text_file.delete end @@ -111,9 +126,12 @@ module Muse::Dl chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) } args = chapter_files + ["cat", "output", output_file.path] - execute args + is_success = execute args # TODO: Validate final file here + if !is_success + raise Muse::Dl::Errors::PDFOperationError.new("Error stitching chapters together.") + end return output_file end From 5d9d951c9afa366e09fea2484cb760f959b7eb76 Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 02:24:09 -0400 Subject: [PATCH 2/9] Write backtrace in top-level rescue blocks. --- src/muse-dl.cr | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/muse-dl.cr b/src/muse-dl.cr index fcce75d..ad9f816 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -79,13 +79,14 @@ module Muse::Dl delay_secs /= 2 end rescue ex : Muse::Dl::Errors::DownloadError - puts ex + puts ex.backtrace.join("\n ") puts "Download error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing." # Sleep to prevent hammering the server. sleep(delay_secs) delay_secs *= 2 rescue ex - puts ex + puts ex.backtrace.join("\n ") + pp ex puts "Non-download error. Skipping book: #{url}." sleep(1) end From 26d96d3f7d2432fee36b1ea08f0a9c2dcea62e8e Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 02:40:42 -0400 Subject: [PATCH 3/9] Remove assert that temp path be tmp. It has been changed to an actual random temp path so we can't test for it easily. --- spec/parser_spec.cr | 1 - 1 file changed, 1 deletion(-) diff --git a/spec/parser_spec.cr b/spec/parser_spec.cr index 3a09138..5bc4942 100644 --- a/spec/parser_spec.cr +++ b/spec/parser_spec.cr @@ -13,7 +13,6 @@ describe Muse::Dl::Parser do parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"]) parser.bookmarks.should eq true parser.cleanup.should eq true - parser.tmp.should eq "/tmp" parser.output.should eq "tempfilename.pdf" parser.url.should eq "https://muse.jhu.edu/book/68534" end From 1d2f53bad04ca1bc14a49795eb33d1e3148526af Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 14:46:26 -0400 Subject: [PATCH 4/9] forgot to git-add new error files --- src/errors/download_error.cr | 4 ++++ src/errors/pdf_operation_error.cr | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 src/errors/download_error.cr create mode 100644 src/errors/pdf_operation_error.cr diff --git a/src/errors/download_error.cr b/src/errors/download_error.cr new file mode 100644 index 0000000..1208e96 --- /dev/null +++ b/src/errors/download_error.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class DownloadError < Exception + end +end diff --git a/src/errors/pdf_operation_error.cr b/src/errors/pdf_operation_error.cr new file mode 100644 index 0000000..6810fd4 --- /dev/null +++ b/src/errors/pdf_operation_error.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class PDFOperationError < Exception + end +end From db2d86c1a84e61e3abd6451cfe1c985bfeb5b708 Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 14:49:41 -0400 Subject: [PATCH 5/9] Also add exception message to top-level rescue --- src/muse-dl.cr | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/muse-dl.cr b/src/muse-dl.cr index ad9f816..6723c59 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -79,12 +79,14 @@ module Muse::Dl delay_secs /= 2 end rescue ex : Muse::Dl::Errors::DownloadError + puts ex.message puts ex.backtrace.join("\n ") puts "Download error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing." # Sleep to prevent hammering the server. sleep(delay_secs) delay_secs *= 2 rescue ex + puts ex.message puts ex.backtrace.join("\n ") pp ex puts "Non-download error. Skipping book: #{url}." From 77201bda85e7c344f2721cf03746cadf378a4ec1 Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 15:00:59 -0400 Subject: [PATCH 6/9] Fix download issue - revert to using body_io --- src/fetch.cr | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index dd80dee..6d970f4 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -62,11 +62,9 @@ module Muse::Dl end end File.open(tmp_pdf_file, "w") do |file| - response_str = response.body - file << response_str + IO.copy(response.body_io, file) if file.size == 0 - # puts response.headers - raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response size was #{response_str.bytesize}") + raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") end end end From 762164e22377cafe97050cef2ece226ee80e0019 Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 15:18:05 -0400 Subject: [PATCH 7/9] more descriptive error messages --- src/fetch.cr | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index 6d970f4..aefe540 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -56,7 +56,7 @@ module Muse::Dl # https://muse.jhu.edu/book/67393 # Errors are Unable to determine page runs / Unable to construct chapter PDF if /Unable to/.match line - raise Muse::Dl::Errors::MuseCorruptPDF.new + raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}") end end end @@ -95,10 +95,10 @@ module Muse::Dl return Muse::Dl::Journal.new response end rescue ex : Crest::NotFound - raise Muse::Dl::Errors::InvalidLink.new + raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}") end else - raise Muse::Dl::Errors::InvalidLink.new + raise Muse::Dl::Errors::InvalidLink.new("Error - url does not match expected pattern: #{url}") end end end From 4e435dd3abffd1b140f29eae454f9bb5bbf014ec Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 17:27:54 -0400 Subject: [PATCH 8/9] Add 60s timeout to downloads. Do backoff for all errors. --- src/fetch.cr | 56 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index aefe540..1bd557e 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -4,7 +4,8 @@ require "myhtml" module Muse::Dl class Fetch - USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" + USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" + DOWNLOAD_TIMEOUT_SECS = 60 HEADERS = { "User-Agent" => USER_AGENT, @@ -33,6 +34,11 @@ module Muse::Dl # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" + uri = URI.parse(url) + http_client = HTTP::Client.new(uri) + # Raise a IO::TimeoutError after 60 seconds. + http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS + headers = HEADERS.merge({ "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", }) @@ -41,31 +47,37 @@ module Muse::Dl headers["Cookie"] = cookie end - # TODO: Add validation for the downloaded file (should be PDF) - Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| - if !response.success? - raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}") - end + request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false) - content_type = response.headers["Content-Type"] - if content_type.is_a? String - if /html/.match content_type - puts response - response.body_io.each_line do |line| - # https://muse.jhu.edu/chapter/2383438/pdf - # https://muse.jhu.edu/book/67393 - # Errors are Unable to determine page runs / Unable to construct chapter PDF - if /Unable to/.match line - raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}") - end + begin + response = request.execute + rescue ex : IO::TimeoutError + raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") + end + + # TODO: Add validation for the downloaded file (should be PDF) + if !response.success? + raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}") + end + + content_type = response.headers["Content-Type"] + if content_type.is_a? String + if /html/.match content_type + puts response + response.body_io.each_line do |line| + # https://muse.jhu.edu/chapter/2383438/pdf + # https://muse.jhu.edu/book/67393 + # Errors are Unable to determine page runs / Unable to construct chapter PDF + if /Unable to/.match line + raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}") end end end - File.open(tmp_pdf_file, "w") do |file| - IO.copy(response.body_io, file) - if file.size == 0 - raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") - end + end + File.open(tmp_pdf_file, "w") do |file| + IO.copy(response.body_io, file) + if file.size == 0 + raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") end end From 2206c41228cb26a6800c4ac7c2f5d8efa41f6774 Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 17:50:06 -0400 Subject: [PATCH 9/9] Use response.body, not response.body_io, which is nil when you pass in HTTPClient for some reason. --- src/fetch.cr | 4 ++-- src/muse-dl.cr | 13 +++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index 1bd557e..696b113 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -64,7 +64,7 @@ module Muse::Dl if content_type.is_a? String if /html/.match content_type puts response - response.body_io.each_line do |line| + response.body.each_line do |line| # https://muse.jhu.edu/chapter/2383438/pdf # https://muse.jhu.edu/book/67393 # Errors are Unable to determine page runs / Unable to construct chapter PDF @@ -75,7 +75,7 @@ module Muse::Dl end end File.open(tmp_pdf_file, "w") do |file| - IO.copy(response.body_io, file) + file << response.body if file.size == 0 raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 6723c59..c5fa70e 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -78,19 +78,12 @@ module Muse::Dl if delay_secs >= 2 delay_secs /= 2 end - rescue ex : Muse::Dl::Errors::DownloadError - puts ex.message - puts ex.backtrace.join("\n ") - puts "Download error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing." - # Sleep to prevent hammering the server. - sleep(delay_secs) - delay_secs *= 2 rescue ex puts ex.message puts ex.backtrace.join("\n ") - pp ex - puts "Non-download error. Skipping book: #{url}." - sleep(1) + puts "Error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing." + sleep(delay_secs) + delay_secs *= 2 end end elsif parser.url