Merge pull request #12 from pradn/better-errors

Improve error handling
This commit is contained in:
Nemo 2020-04-20 03:23:24 +05:30 committed by GitHub
commit ebf1b57e22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 86 additions and 31 deletions

View File

@ -13,7 +13,6 @@ describe Muse::Dl::Parser do
parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"]) parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"])
parser.bookmarks.should eq true parser.bookmarks.should eq true
parser.cleanup.should eq true parser.cleanup.should eq true
parser.tmp.should eq "/tmp"
parser.output.should eq "tempfilename.pdf" parser.output.should eq "tempfilename.pdf"
parser.url.should eq "https://muse.jhu.edu/book/68534" parser.url.should eq "https://muse.jhu.edu/book/68534"
end end

View File

@ -0,0 +1,4 @@
# Error type for failed chapter downloads.
module Muse::Dl::Errors
# Raised by the chapter download code in this commit when the request times out,
# when the HTTP response is not successful, or when the written file has zero size.
class DownloadError < Exception
end
end

View File

@ -0,0 +1,4 @@
# Error type for failed PDF manipulation steps.
module Muse::Dl::Errors
# Raised in this commit when the external PDF command reports failure while
# stripping the first page, adding bookmark metadata, adding book metadata,
# or stitching chapters together.
class PDFOperationError < Exception
end
end

View File

@ -4,7 +4,8 @@ require "myhtml"
module Muse::Dl module Muse::Dl
class Fetch class Fetch
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
DOWNLOAD_TIMEOUT_SECS = 60
HEADERS = { HEADERS = {
"User-Agent" => USER_AGENT, "User-Agent" => USER_AGENT,
@ -33,6 +34,11 @@ module Muse::Dl
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
uri = URI.parse(url)
http_client = HTTP::Client.new(uri)
# Raise an IO::TimeoutError after 60 seconds.
http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
headers = HEADERS.merge({ headers = HEADERS.merge({
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
}) })
@ -41,25 +47,37 @@ module Muse::Dl
headers["Cookie"] = cookie headers["Cookie"] = cookie
end end
request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false)
begin
response = request.execute
rescue ex : IO::TimeoutError
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
end
# TODO: Add validation for the downloaded file (should be PDF) # TODO: Add validation for the downloaded file (should be PDF)
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| if !response.success?
# puts response.headers["Content-Type"] raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
content_type = response.headers["Content-Type"] end
if content_type.is_a? String
if /html/.match content_type content_type = response.headers["Content-Type"]
puts response if content_type.is_a? String
response.body_io.each_line do |line| if /html/.match content_type
# https://muse.jhu.edu/chapter/2383438/pdf puts response
# https://muse.jhu.edu/book/67393 response.body.each_line do |line|
# Errors are Unable to determine page runs / Unable to construct chapter PDF # https://muse.jhu.edu/chapter/2383438/pdf
if /Unable to/.match line # https://muse.jhu.edu/book/67393
raise Muse::Dl::Errors::MuseCorruptPDF.new # Errors are Unable to determine page runs / Unable to construct chapter PDF
end if /Unable to/.match line
raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}")
end end
end end
end end
File.open(tmp_pdf_file, "w") do |file| end
IO.copy(response.body_io, file) File.open(tmp_pdf_file, "w") do |file|
file << response.body
if file.size == 0
raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}")
end end
end end
@ -89,10 +107,10 @@ module Muse::Dl
return Muse::Dl::Journal.new response return Muse::Dl::Journal.new response
end end
rescue ex : Crest::NotFound rescue ex : Crest::NotFound
raise Muse::Dl::Errors::InvalidLink.new raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}")
end end
else else
raise Muse::Dl::Errors::InvalidLink.new raise Muse::Dl::Errors::InvalidLink.new("Error - url does not match expected pattern: #{url}")
end end
end end
end end

View File

@ -65,14 +65,26 @@ module Muse::Dl
def self.run(args : Array(String)) def self.run(args : Array(String))
parser = Parser.new(args) parser = Parser.new(args)
delay_secs = 1
input_list = parser.input_list input_list = parser.input_list
if input_list if input_list
File.each_line input_list do |url| File.each_line input_list do |url|
# TODO: Change this to nil begin
parser.reset_output_file # TODO: Change this to nil
parser.url = url.strip parser.reset_output_file
# Ask the download process to not quit the process, and return instead parser.url = url.strip
Main.dl parser # Ask the download process to not quit the process, and return instead
Main.dl parser
if delay_secs >= 2
delay_secs /= 2
end
rescue ex
puts ex.message
puts ex.backtrace.join("\n ")
puts "Error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing."
sleep(delay_secs)
delay_secs *= 2
end
end end
elsif parser.url elsif parser.url
Main.dl parser Main.dl parser

View File

@ -28,14 +28,22 @@ module Muse::Dl
def execute(args : Array(String)) def execute(args : Array(String))
binary = @binary binary = @binary
if binary if binary
Process.run(binary, args) status = Process.run(binary, args, output: STDOUT, error: STDERR)
if !status.success?
puts "pdftk command failed: #{binary} #{args.join(" ")}"
end
return status.success?
end end
end end
def strip_first_page(input_file : String) def strip_first_page(input_file : String)
output_pdf = File.tempfile("muse-dl-temp", ".pdf") output_pdf = File.tempfile("muse-dl-temp", ".pdf")
execute [input_file, "cat", "2-end", "output", output_pdf.path] is_success = execute [input_file, "cat", "2-end", "output", output_pdf.path]
File.rename output_pdf.path, input_file if is_success
File.rename output_pdf.path, input_file
else
raise Muse::Dl::Errors::PDFOperationError.new("Error stripping first page of chapter.")
end
end end
def add_bookmark(input_file : String, title : String) def add_bookmark(input_file : String, title : String)
@ -48,11 +56,15 @@ module Muse::Dl
BookmarkPageNumber: 1 BookmarkPageNumber: 1
END END
File.write(bookmark_text_file.path, bookmark_text) File.write(bookmark_text_file.path, bookmark_text)
execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path] is_success = execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
# Cleanup # Cleanup
bookmark_text_file.delete bookmark_text_file.delete
File.rename output_pdf.path, input_file if is_success
File.rename output_pdf.path, input_file
else
raise Muse::Dl::Errors::PDFOperationError.new("Error adding bookmark metadata to chapter.")
end
end end
def add_metadata(input_file : File, output_file : String, book : Book) def add_metadata(input_file : File, output_file : String, book : Book)
@ -95,7 +107,10 @@ module Muse::Dl
EOT EOT
File.write(metadata_text_file.path, text) File.write(metadata_text_file.path, text)
execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
if !is_success
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
end
metadata_text_file.delete metadata_text_file.delete
end end
@ -111,9 +126,12 @@ module Muse::Dl
chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) } chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }
args = chapter_files + ["cat", "output", output_file.path] args = chapter_files + ["cat", "output", output_file.path]
execute args is_success = execute args
# TODO: Validate final file here # TODO: Validate final file here
if !is_success
raise Muse::Dl::Errors::PDFOperationError.new("Error stitching chapters together.")
end
return output_file return output_file
end end