mirror of
https://github.com/captn3m0/muse-dl.git
synced 2024-09-21 00:57:09 +00:00
Add 60s timeout to downloads. Do backoff for all errors.
This commit is contained in:
parent
762164e223
commit
4e435dd3ab
56
src/fetch.cr
56
src/fetch.cr
@ -4,7 +4,8 @@ require "myhtml"
|
||||
|
||||
module Muse::Dl
|
||||
class Fetch
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||
DOWNLOAD_TIMEOUT_SECS = 60
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent" => USER_AGENT,
|
||||
@ -33,6 +34,11 @@ module Muse::Dl
|
||||
|
||||
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
|
||||
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
|
||||
uri = URI.parse(url)
|
||||
http_client = HTTP::Client.new(uri)
|
||||
# Raise an IO::TimeoutError after 60 seconds.
|
||||
http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
|
||||
|
||||
headers = HEADERS.merge({
|
||||
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
||||
})
|
||||
@ -41,31 +47,37 @@ module Muse::Dl
|
||||
headers["Cookie"] = cookie
|
||||
end
|
||||
|
||||
# TODO: Add validation for the downloaded file (should be PDF)
|
||||
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
|
||||
if !response.success?
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
|
||||
end
|
||||
request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false)
|
||||
|
||||
content_type = response.headers["Content-Type"]
|
||||
if content_type.is_a? String
|
||||
if /html/.match content_type
|
||||
puts response
|
||||
response.body_io.each_line do |line|
|
||||
# https://muse.jhu.edu/chapter/2383438/pdf
|
||||
# https://muse.jhu.edu/book/67393
|
||||
# Errors are Unable to determine page runs / Unable to construct chapter PDF
|
||||
if /Unable to/.match line
|
||||
raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}")
|
||||
end
|
||||
begin
|
||||
response = request.execute
|
||||
rescue ex : IO::TimeoutError
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
|
||||
end
|
||||
|
||||
# TODO: Add validation for the downloaded file (should be PDF)
|
||||
if !response.success?
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
|
||||
end
|
||||
|
||||
content_type = response.headers["Content-Type"]
|
||||
if content_type.is_a? String
|
||||
if /html/.match content_type
|
||||
puts response
|
||||
response.body_io.each_line do |line|
|
||||
# https://muse.jhu.edu/chapter/2383438/pdf
|
||||
# https://muse.jhu.edu/book/67393
|
||||
# Errors are Unable to determine page runs / Unable to construct chapter PDF
|
||||
if /Unable to/.match line
|
||||
raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}")
|
||||
end
|
||||
end
|
||||
end
|
||||
File.open(tmp_pdf_file, "w") do |file|
|
||||
IO.copy(response.body_io, file)
|
||||
if file.size == 0
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}")
|
||||
end
|
||||
end
|
||||
File.open(tmp_pdf_file, "w") do |file|
|
||||
IO.copy(response.body_io, file)
|
||||
if file.size == 0
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}")
|
||||
end
|
||||
end
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user