mirror of https://github.com/captn3m0/muse-dl.git
Adds chapter stitching and first-page stripping
This commit is contained in:
parent
376adf28cb
commit
d981000d14
|
@ -3,3 +3,4 @@
|
|||
/bin/
|
||||
/.shards/
|
||||
*.dwarf
|
||||
*.pdf
|
|
@ -2,7 +2,7 @@ version: 1.0
|
|||
shards:
|
||||
crest:
|
||||
github: mamantoha/crest
|
||||
version: 0.24.0
|
||||
version: 0.24.1
|
||||
|
||||
http-client-digest_auth:
|
||||
github: mamantoha/http-client-digest_auth
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
module Muse::Dl::Errors
  # Raised when a downloaded chapter PDF is present on disk but unusable.
  # `Pdftk#stitch` raises it for a chapter file whose size is not greater
  # than zero.
  class CorruptFile < Exception
  end
end
|
|
@ -0,0 +1,4 @@
|
|||
module Muse::Dl::Errors
  # Raised when a chapter PDF that should already have been downloaded is
  # not found on disk. `Pdftk#stitch` raises it before merging chapters.
  class MissingChapter < Exception
  end
end
|
47
src/fetch.cr
47
src/fetch.cr
|
@ -4,9 +4,7 @@ require "./errors/*"
|
|||
module Muse::Dl
|
||||
class Fetch
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||
# TODO: Add support for cookies?
|
||||
# "Cookie" => "session=124.123.104.8.1585388207021325",
|
||||
HEADERS = {
|
||||
HEADERS = {
|
||||
"User-Agent" => USER_AGENT,
|
||||
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||
"Accept-Language" => "en-US,en;q=0.5",
|
||||
|
@ -17,27 +15,42 @@ module Muse::Dl
|
|||
"Cache-Control" => "max-age=0",
|
||||
}
|
||||
|
||||
def self.save_chapter(tmp_path : String, chapter_id : String, add_bookmark = true)
|
||||
# Builds the path of the PDF that holds chapter *id* inside the working
# directory *tmp_path*: "<tmp_path>/chapter-<id>.pdf".
def self.chapter_file_name(id : String, tmp_path : String)
  basename = "chapter-#{id}.pdf"
  [tmp_path, basename].join("/")
end
|
||||
|
||||
# Downloads one chapter PDF into *tmp_path* and post-processes it.
#
# The response body is first written to "<final name>.tmp". The first page
# is then stripped with pdftk and, when *add_bookmark* is true, a bookmark
# titled *chapter_title* is added. Only after those steps is the file
# renamed to the name returned by `chapter_file_name`, so the final name
# appears only once post-processing has run.
#
# Returns early (after printing a notice) when the final file already
# exists. Any exception raised while downloading is reported and re-raised.
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, add_bookmark = true)
  final_pdf_file = chapter_file_name chapter_id, tmp_path
  tmp_pdf_file = "#{final_pdf_file}.tmp"

  if File.exists? final_pdf_file
    puts "#{chapter_id} already downloaded"
    return
  end

  url = "https://muse.jhu.edu/chapter/#{chapter_id}"
  headers = HEADERS.merge({
    "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
  })

  # We catch a temporary redirect
  # https://github.com/mamantoha/crest/blob/29a690726902c71884f9c80f0f9565256e74b7fd/src/crest/exceptions.cr#L20-L28
  # NOTE(review): with `handle_errors: false` the HTTP status is not checked
  # here - confirm that a redirect or error body cannot end up saved as the PDF.
  Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
    File.open(tmp_pdf_file, "w") do |file|
      IO.copy(response.body_io, file)
    end
  rescue e : Exception
    # Do not leave a partial download behind, and say what actually failed.
    File.delete(tmp_pdf_file) if File.exists?(tmp_pdf_file)
    puts "Failed to download chapter #{chapter_id}: #{e.message}"
    raise e
  end

  pdftk = Muse::Dl::Pdftk.new tmp_path

  pdftk.strip_first_page tmp_pdf_file

  if add_bookmark
    # Run pdftk and add the bookmark to the file
    pdftk.add_bookmark tmp_pdf_file, chapter_title
  end

  # Now we can move the file to the proper PDF filename
  File.rename tmp_pdf_file, final_pdf_file
  puts "Downloaded #{chapter_id}"
end
|
||||
|
||||
def self.get_info(url : String) : Muse::Dl::Thing | Nil
|
||||
|
|
|
@ -16,8 +16,12 @@ module Muse::Dl
|
|||
|
||||
if thing.is_a? Muse::Dl::Book
|
||||
thing.chapters.each do |chapter|
|
||||
Fetch.save_chapter(parser.tmp, chapter[0])
|
||||
Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.bookmarks)
|
||||
end
|
||||
chapter_ids = thing.chapters.map { |c| c[0] }
|
||||
pdf_builder = Pdftk.new(parser.tmp)
|
||||
pdf_builder.stitch(parser.output, chapter_ids)
|
||||
puts "Saved final output to #{parser.output}"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
59
src/pdftk.cr
59
src/pdftk.cr
|
@ -1,18 +1,69 @@
|
|||
require "process"
|
||||
require "file"
|
||||
require "./fetch"
|
||||
require "./errors/*"
|
||||
|
||||
module Muse::Dl
  # Wrapper around the `pdftk` command-line tool, used to post-process the
  # downloaded chapter PDFs (strip the first page, add a bookmark) and to
  # merge them into a single output file.
  class Pdftk
    PDFTK_BINARY_NAME = "pdftk"
    # Default location; `initialize` replaces it with the path found by
    # `Process.find_executable`.
    @binary = "/usr/sbin/pdftk"
    # Directory that holds the per-chapter PDFs (see `Fetch.chapter_file_name`).
    @tmp_file_path : String

    getter :binary

    # Remembers *tmp_file_path* and locates the pdftk executable.
    # Prints a message and exits the process with status 1 when pdftk
    # cannot be found.
    def initialize(tmp_file_path : String)
      @tmp_file_path = tmp_file_path

      possible_binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME)
      if possible_binary
        @binary = possible_binary
      else
        puts "Could not find pdftk binary, exiting"
        Process.exit(1)
      end
    end

    # Runs pdftk with *args* and returns the value of `Process.run`
    # (the process status).
    def execute(args : Array(String))
      Process.run(@binary, args)
    end

    # Removes the first page of *input_file* in place (pdftk `cat 2-end`).
    #
    # Raises `Muse::Dl::Errors::CorruptFile` when pdftk fails; in that case
    # *input_file* is left untouched.
    def strip_first_page(input_file : String)
      rewrite_in_place input_file, ["cat", "2-end"], "pdftk could not strip the first page of #{input_file}"
    end

    # Adds a level-1 bookmark named *title*, pointing at page 1, to
    # *input_file* in place (pdftk `update_info`).
    #
    # Raises `Muse::Dl::Errors::CorruptFile` when pdftk fails; in that case
    # *input_file* is left untouched.
    def add_bookmark(input_file : String, title : String)
      bookmark_text_file = File.tempfile("muse-dl-chapter-tmp", ".txt")
      # Only the path is needed; the content is written via File.write below.
      bookmark_text_file.close

      # The info file is line-oriented, so a line break inside the title
      # would be read as extra (bogus) fields.
      single_line_title = title.gsub(/[\r\n]+/, " ")
      bookmark_text = <<-END
        BookmarkBegin
        BookmarkTitle: #{single_line_title}
        BookmarkLevel: 1
        BookmarkPageNumber: 1
        END

      begin
        File.write(bookmark_text_file.path, bookmark_text)
        rewrite_in_place input_file, ["update_info", bookmark_text_file.path], "pdftk could not add a bookmark to #{input_file}"
      ensure
        # Cleanup
        bookmark_text_file.delete
      end
    end

    # Merges the chapter PDFs for *chapter_ids*, in the given order, into
    # *output_file*.
    #
    # Raises `Muse::Dl::Errors::MissingChapter` when a chapter file does not
    # exist, and `Muse::Dl::Errors::CorruptFile` when a chapter file is empty
    # or pdftk fails to produce the merged output.
    def stitch(output_file : String, chapter_ids : Array(String))
      # Do some sanity checks on each Chapter PDF; all chapters are checked
      # before pdftk is started.
      chapter_files = chapter_ids.map do |id|
        path = Fetch.chapter_file_name(id, @tmp_file_path)
        raise Muse::Dl::Errors::MissingChapter.new("Chapter #{id} is missing (expected #{path})") unless File.exists? path
        raise Muse::Dl::Errors::CorruptFile.new("Chapter #{id} is empty (#{path})") unless File.size(path) > 0
        path
      end

      # Now let's stitch them together
      status = execute chapter_files + ["cat", "output", output_file]
      raise Muse::Dl::Errors::CorruptFile.new("pdftk could not stitch chapters into #{output_file}") unless status.success?

      # TODO: Validate final file here
    end

    # Runs `pdftk <input_file> <operation...> output <temp file>` and, when
    # pdftk succeeds, replaces *input_file* with the temp file.
    #
    # The temp file is created next to *input_file* so that the final rename
    # stays on one filesystem. On failure the temp file is removed and
    # `Muse::Dl::Errors::CorruptFile` is raised with *failure_message*.
    private def rewrite_in_place(input_file : String, operation : Array(String), failure_message : String)
      output_pdf = File.tempfile("muse-dl-temp", ".pdf", dir: File.dirname(input_file))
      # pdftk writes through the path; our own handle is not needed.
      output_pdf.close

      status = execute [input_file] + operation + ["output", output_pdf.path]
      unless status.success?
        output_pdf.delete
        raise Muse::Dl::Errors::CorruptFile.new(failure_message)
      end

      File.rename output_pdf.path, input_file
    end
  end
end
|
||||
|
|
Loading…
Reference in New Issue