mirror of https://github.com/captn3m0/muse-dl.git
commit
ebf1b57e22
|
@ -13,7 +13,6 @@ describe Muse::Dl::Parser do
|
||||||
parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"])
|
parser = Muse::Dl::Parser.new(["https://muse.jhu.edu/book/68534"])
|
||||||
parser.bookmarks.should eq true
|
parser.bookmarks.should eq true
|
||||||
parser.cleanup.should eq true
|
parser.cleanup.should eq true
|
||||||
parser.tmp.should eq "/tmp"
|
|
||||||
parser.output.should eq "tempfilename.pdf"
|
parser.output.should eq "tempfilename.pdf"
|
||||||
parser.url.should eq "https://muse.jhu.edu/book/68534"
|
parser.url.should eq "https://muse.jhu.edu/book/68534"
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Muse::Dl::Errors
|
||||||
|
class DownloadError < Exception
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Muse::Dl::Errors
|
||||||
|
class PDFOperationError < Exception
|
||||||
|
end
|
||||||
|
end
|
54
src/fetch.cr
54
src/fetch.cr
|
@ -4,7 +4,8 @@ require "myhtml"
|
||||||
|
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Fetch
|
class Fetch
|
||||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||||
|
DOWNLOAD_TIMEOUT_SECS = 60
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent" => USER_AGENT,
|
"User-Agent" => USER_AGENT,
|
||||||
|
@ -33,6 +34,11 @@ module Muse::Dl
|
||||||
|
|
||||||
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
|
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
|
||||||
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
|
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
|
||||||
|
uri = URI.parse(url)
|
||||||
|
http_client = HTTP::Client.new(uri)
|
||||||
|
# Raise a IO::TimeoutError after 60 seconds.
|
||||||
|
http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
|
||||||
|
|
||||||
headers = HEADERS.merge({
|
headers = HEADERS.merge({
|
||||||
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
||||||
})
|
})
|
||||||
|
@ -41,25 +47,37 @@ module Muse::Dl
|
||||||
headers["Cookie"] = cookie
|
headers["Cookie"] = cookie
|
||||||
end
|
end
|
||||||
|
|
||||||
|
request = Crest::Request.new(:get, url, headers: headers, max_redirects: 0, handle_errors: false)
|
||||||
|
|
||||||
|
begin
|
||||||
|
response = request.execute
|
||||||
|
rescue ex : IO::TimeoutError
|
||||||
|
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
|
||||||
|
end
|
||||||
|
|
||||||
# TODO: Add validation for the downloaded file (should be PDF)
|
# TODO: Add validation for the downloaded file (should be PDF)
|
||||||
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
|
if !response.success?
|
||||||
# puts response.headers["Content-Type"]
|
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. HTTP response code: #{response.status}")
|
||||||
content_type = response.headers["Content-Type"]
|
end
|
||||||
if content_type.is_a? String
|
|
||||||
if /html/.match content_type
|
content_type = response.headers["Content-Type"]
|
||||||
puts response
|
if content_type.is_a? String
|
||||||
response.body_io.each_line do |line|
|
if /html/.match content_type
|
||||||
# https://muse.jhu.edu/chapter/2383438/pdf
|
puts response
|
||||||
# https://muse.jhu.edu/book/67393
|
response.body.each_line do |line|
|
||||||
# Errors are Unable to determine page runs / Unable to construct chapter PDF
|
# https://muse.jhu.edu/chapter/2383438/pdf
|
||||||
if /Unable to/.match line
|
# https://muse.jhu.edu/book/67393
|
||||||
raise Muse::Dl::Errors::MuseCorruptPDF.new
|
# Errors are Unable to determine page runs / Unable to construct chapter PDF
|
||||||
end
|
if /Unable to/.match line
|
||||||
|
raise Muse::Dl::Errors::MuseCorruptPDF.new("Error: MUSE is unable to generate PDF for #{url}")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
File.open(tmp_pdf_file, "w") do |file|
|
end
|
||||||
IO.copy(response.body_io, file)
|
File.open(tmp_pdf_file, "w") do |file|
|
||||||
|
file << response.body
|
||||||
|
if file.size == 0
|
||||||
|
raise Muse::Dl::Errors::DownloadError.new("Error: downloaded chapter file size is zero. Response Content-Length header was #{headers["Content-Length"]}")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -89,10 +107,10 @@ module Muse::Dl
|
||||||
return Muse::Dl::Journal.new response
|
return Muse::Dl::Journal.new response
|
||||||
end
|
end
|
||||||
rescue ex : Crest::NotFound
|
rescue ex : Crest::NotFound
|
||||||
raise Muse::Dl::Errors::InvalidLink.new
|
raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}")
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
raise Muse::Dl::Errors::InvalidLink.new
|
raise Muse::Dl::Errors::InvalidLink.new("Error - url does not match expected pattern: #{url}")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -65,14 +65,26 @@ module Muse::Dl
|
||||||
def self.run(args : Array(String))
|
def self.run(args : Array(String))
|
||||||
parser = Parser.new(args)
|
parser = Parser.new(args)
|
||||||
|
|
||||||
|
delay_secs = 1
|
||||||
input_list = parser.input_list
|
input_list = parser.input_list
|
||||||
if input_list
|
if input_list
|
||||||
File.each_line input_list do |url|
|
File.each_line input_list do |url|
|
||||||
# TODO: Change this to nil
|
begin
|
||||||
parser.reset_output_file
|
# TODO: Change this to nil
|
||||||
parser.url = url.strip
|
parser.reset_output_file
|
||||||
# Ask the download process to not quit the process, and return instead
|
parser.url = url.strip
|
||||||
Main.dl parser
|
# Ask the download process to not quit the process, and return instead
|
||||||
|
Main.dl parser
|
||||||
|
if delay_secs >= 2
|
||||||
|
delay_secs /= 2
|
||||||
|
end
|
||||||
|
rescue ex
|
||||||
|
puts ex.message
|
||||||
|
puts ex.backtrace.join("\n ")
|
||||||
|
puts "Error. Skipping book: #{url}. Waiting for #{delay_secs} seconds before continuing."
|
||||||
|
sleep(delay_secs)
|
||||||
|
delay_secs *= 2
|
||||||
|
end
|
||||||
end
|
end
|
||||||
elsif parser.url
|
elsif parser.url
|
||||||
Main.dl parser
|
Main.dl parser
|
||||||
|
|
32
src/pdftk.cr
32
src/pdftk.cr
|
@ -28,14 +28,22 @@ module Muse::Dl
|
||||||
def execute(args : Array(String))
|
def execute(args : Array(String))
|
||||||
binary = @binary
|
binary = @binary
|
||||||
if binary
|
if binary
|
||||||
Process.run(binary, args)
|
status = Process.run(binary, args, output: STDOUT, error: STDERR)
|
||||||
|
if !status.success?
|
||||||
|
puts "pdftk command failed: #{binary} #{args.join(" ")}"
|
||||||
|
end
|
||||||
|
return status.success?
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def strip_first_page(input_file : String)
|
def strip_first_page(input_file : String)
|
||||||
output_pdf = File.tempfile("muse-dl-temp", ".pdf")
|
output_pdf = File.tempfile("muse-dl-temp", ".pdf")
|
||||||
execute [input_file, "cat", "2-end", "output", output_pdf.path]
|
is_success = execute [input_file, "cat", "2-end", "output", output_pdf.path]
|
||||||
File.rename output_pdf.path, input_file
|
if is_success
|
||||||
|
File.rename output_pdf.path, input_file
|
||||||
|
else
|
||||||
|
raise Muse::Dl::Errors::PDFOperationError.new("Error stripping first page of chapter.")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def add_bookmark(input_file : String, title : String)
|
def add_bookmark(input_file : String, title : String)
|
||||||
|
@ -48,11 +56,15 @@ module Muse::Dl
|
||||||
BookmarkPageNumber: 1
|
BookmarkPageNumber: 1
|
||||||
END
|
END
|
||||||
File.write(bookmark_text_file.path, bookmark_text)
|
File.write(bookmark_text_file.path, bookmark_text)
|
||||||
execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
|
is_success = execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
bookmark_text_file.delete
|
bookmark_text_file.delete
|
||||||
File.rename output_pdf.path, input_file
|
if is_success
|
||||||
|
File.rename output_pdf.path, input_file
|
||||||
|
else
|
||||||
|
raise Muse::Dl::Errors::PDFOperationError.new("Error adding bookmark metadata to chapter.")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def add_metadata(input_file : File, output_file : String, book : Book)
|
def add_metadata(input_file : File, output_file : String, book : Book)
|
||||||
|
@ -95,7 +107,10 @@ module Muse::Dl
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
File.write(metadata_text_file.path, text)
|
File.write(metadata_text_file.path, text)
|
||||||
execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
||||||
|
if !is_success
|
||||||
|
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
|
||||||
|
end
|
||||||
metadata_text_file.delete
|
metadata_text_file.delete
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -111,9 +126,12 @@ module Muse::Dl
|
||||||
|
|
||||||
chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }
|
chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }
|
||||||
args = chapter_files + ["cat", "output", output_file.path]
|
args = chapter_files + ["cat", "output", output_file.path]
|
||||||
execute args
|
is_success = execute args
|
||||||
|
|
||||||
# TODO: Validate final file here
|
# TODO: Validate final file here
|
||||||
|
if !is_success
|
||||||
|
raise Muse::Dl::Errors::PDFOperationError.new("Error stitching chapters together.")
|
||||||
|
end
|
||||||
|
|
||||||
return output_file
|
return output_file
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue