From 4e435dd3abffd1b140f29eae454f9bb5bbf014ec Mon Sep 17 00:00:00 2001 From: Prad Nelluru Date: Sun, 19 Apr 2020 17:27:54 -0400 Subject: [PATCH] Add 60s timeout to downloads. Do backoff for all errors. --- src/fetch.cr | 56 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index aefe540..1bd557e 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -4,7 +4,8 @@ require "myhtml" module Muse::Dl class Fetch - USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" + USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" + DOWNLOAD_TIMEOUT_SECS = 60 HEADERS = { "User-Agent" => USER_AGENT, @@ -33,6 +34,11 @@ module Muse::Dl # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" + uri = URI.parse(url) + http_client = HTTP::Client.new(uri) + # Raise an IO::TimeoutError after 60 seconds. + http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS + headers = HEADERS.merge({ "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", }) @@ -41,31 +47,37 @@ module Muse::Dl headers["Cookie"] = cookie end - # TODO: Add validation for the downloaded file (should be PDF) - Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| - if !response.success? - raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}") - end + request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false) - content_type = response.headers["Content-Type"] - if content_type.is_a? 
String - if /html/.match content_type - puts response - response.body_io.each_line do |line| - # https://muse.jhu.edu/chapter/2383438/pdf - # https://muse.jhu.edu/book/67393 - # Errors are Unable to determine page runs / Unable to construct chapter PDF - if /Unable to/.match line - raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}") - end + begin + response = request.execute + rescue ex : IO::TimeoutError + raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") + end + + # TODO: Add validation for the downloaded file (should be PDF) + if !response.success? + raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}") + end + + content_type = response.headers["Content-Type"] + if content_type.is_a? String + if /html/.match content_type + puts response + response.body_io.each_line do |line| + # https://muse.jhu.edu/chapter/2383438/pdf + # https://muse.jhu.edu/book/67393 + # Errors are Unable to determine page runs / Unable to construct chapter PDF + if /Unable to/.match line + raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}") end end end - File.open(tmp_pdf_file, "w") do |file| - IO.copy(response.body_io, file) - if file.size == 0 - raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") - end + end + File.open(tmp_pdf_file, "w") do |file| + IO.copy(response.body_io, file) + if file.size == 0 + raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}") end end