Add 60s timeout to downloads. Do backoff for all errors.

This commit is contained in:
Prad Nelluru 2020-04-19 17:27:54 -04:00
parent 762164e223
commit 4e435dd3ab
1 changed file with 34 additions and 22 deletions

View File

@ -5,6 +5,7 @@ require "myhtml"
module Muse::Dl
class Fetch
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
DOWNLOAD_TIMEOUT_SECS = 60
HEADERS = {
"User-Agent" => USER_AGENT,
@ -33,6 +34,11 @@ module Muse::Dl
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
uri = URI.parse(url)
http_client = HTTP::Client.new(uri)
# Raise an IO::TimeoutError if a read waits longer than DOWNLOAD_TIMEOUT_SECS (60) seconds.
http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
headers = HEADERS.merge({
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
})
@ -41,8 +47,15 @@ module Muse::Dl
headers["Cookie"] = cookie
end
request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false)
begin
response = request.execute
rescue ex : IO::TimeoutError
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
end
# TODO: Add validation for the downloaded file (should be PDF)
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
if !response.success?
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
end
@ -67,7 +80,6 @@ module Muse::Dl
raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}")
end
end
end
pdftk = Muse::Dl::Pdftk.new tmp_path