muse-dl/src/fetch.cr

76 lines
2.5 KiB
Crystal
Raw Normal View History

2020-03-28 19:29:47 +00:00
require "crest"
2020-03-28 21:07:14 +00:00
require "./errors/*"
2020-03-28 19:29:47 +00:00
module Muse::Dl
class Fetch
2020-03-28 22:22:57 +00:00
# Browser identification string sent with every request made by this class.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

# Base request headers shared by all requests; `save_chapter` merges a
# per-chapter "Referer" on top of these.
#
# NOTE(review): the "Cookie" value carries two hard-coded `session=` entries;
# confirm whether these are still accepted by the server.
HEADERS = {
  "User-Agent"                => USER_AGENT,
  "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language"           => "en-US,en;q=0.5",
  "DNT"                       => "1",
  "Cookie"                    => "session=124.123.104.8.1585420925750331; session=25719682.5a1ef8cb90ec8",
  "Connection"                => "keep-alive",
  "Upgrade-Insecure-Requests" => "1",
  "Cache-Control"             => "max-age=0",
}
2020-03-29 12:21:01 +00:00
# Builds the path of the finished PDF for chapter *id* inside *tmp_path*.
#
# The result has the form `<tmp_path>/chapter-<id>.pdf`; *tmp_path* is used
# verbatim (no separator normalisation is performed).
def self.chapter_file_name(id : String, tmp_path : String)
  base_name = "chapter-#{id}.pdf"
  return "#{tmp_path}/#{base_name}"
end
# Downloads one chapter PDF into *tmp_path* and post-processes it with pdftk.
#
# * *tmp_path* - directory that receives the file (also handed to `Pdftk.new`).
# * *chapter_id* - chapter identifier used in the URL and in the file name.
# * *chapter_title* - bookmark text, used only when *add_bookmark* is true.
# * *add_bookmark* - when true, `Pdftk#add_bookmark` is run on the download.
#
# If the final file (see `chapter_file_name`) already exists, a message is
# printed and nothing is downloaded. Otherwise the body is streamed to
# `<final>.tmp`, processed in place, and only then renamed to the final name.
#
# NOTE(review): the request is made with `handle_errors: false` and
# `max_redirects: 0`, and the response status is not inspected here, so
# whatever body comes back is written to the temp file - confirm intended.
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, add_bookmark = true)
  target_path = chapter_file_name chapter_id, tmp_path
  if File.exists? target_path
    puts "#{chapter_id} already downloaded"
    return
  end

  partial_path = "#{target_path}.tmp"
  referer = "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf"
  request_headers = HEADERS.merge({"Referer" => referer})
  chapter_url = "https://muse.jhu.edu/chapter/#{chapter_id}"

  # Stream the response body straight to disk instead of buffering it.
  Crest.get(chapter_url, max_redirects: 0, handle_errors: false, headers: request_headers) do |response|
    File.open(partial_path, "w") { |out| IO.copy(response.body_io, out) }
  end

  pdftk = Muse::Dl::Pdftk.new tmp_path
  pdftk.strip_first_page partial_path
  # Run pdftk and add the bookmark to the file
  pdftk.add_bookmark(partial_path, chapter_title) if add_bookmark

  # Renaming last means the final name only ever appears after post-processing.
  File.rename partial_path, target_path
  puts "Downloaded #{chapter_id}"
end
# Fetches a book or journal page and wraps the response in the matching model.
#
# *url* must begin with `https://muse.jhu.edu/book/<digits>` or
# `https://muse.jhu.edu/journal/<digits>`. The pattern escapes the dots in the
# host name and is anchored at the start of the string, so look-alike hosts
# (e.g. `musexjhu.edu`) and URLs that merely contain a MUSE link somewhere
# after another host are rejected before any request is made.
#
# Returns a `Muse::Dl::Book` or `Muse::Dl::Journal` constructed from the
# string form (`to_s`) of the Crest response.
#
# Raises `Muse::Dl::Errors::InvalidLink` when *url* does not match the
# pattern or when Crest raises `Crest::NotFound` for it.
def self.get_info(url : String) : Muse::Dl::Thing | Nil
  match = %r{\Ahttps://muse\.jhu\.edu/(book|journal)/(\d+)}.match url
  raise Muse::Dl::Errors::InvalidLink.new unless match

  begin
    response = Crest.get(url).to_s
    # match[1] is the first capture group: "book" or "journal".
    case match[1]
    when "book"
      return Muse::Dl::Book.new response
    when "journal"
      return Muse::Dl::Journal.new response
    end
  rescue Crest::NotFound
    raise Muse::Dl::Errors::InvalidLink.new
  end
end
end
end