mirror of https://github.com/captn3m0/muse-dl.git
Adds PDF stitching and first-page stripping
This commit is contained in:
parent
376adf28cb
commit
d981000d14
|
@ -3,3 +3,4 @@
|
||||||
/bin/
|
/bin/
|
||||||
/.shards/
|
/.shards/
|
||||||
*.dwarf
|
*.dwarf
|
||||||
|
*.pdf
|
|
@ -2,7 +2,7 @@ version: 1.0
|
||||||
shards:
|
shards:
|
||||||
crest:
|
crest:
|
||||||
github: mamantoha/crest
|
github: mamantoha/crest
|
||||||
version: 0.24.0
|
version: 0.24.1
|
||||||
|
|
||||||
http-client-digest_auth:
|
http-client-digest_auth:
|
||||||
github: mamantoha/http-client-digest_auth
|
github: mamantoha/http-client-digest_auth
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Muse::Dl::Errors
|
||||||
|
class CorruptFile < Exception
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Muse::Dl::Errors
|
||||||
|
class MissingChapter < Exception
|
||||||
|
end
|
||||||
|
end
|
47
src/fetch.cr
47
src/fetch.cr
|
@ -4,9 +4,7 @@ require "./errors/*"
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Fetch
|
class Fetch
|
||||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||||
# TODO: Add support for cookies?
|
HEADERS = {
|
||||||
# "Cookie" => "session=124.123.104.8.1585388207021325",
|
|
||||||
HEADERS = {
|
|
||||||
"User-Agent" => USER_AGENT,
|
"User-Agent" => USER_AGENT,
|
||||||
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||||
"Accept-Language" => "en-US,en;q=0.5",
|
"Accept-Language" => "en-US,en;q=0.5",
|
||||||
|
@ -17,27 +15,42 @@ module Muse::Dl
|
||||||
"Cache-Control" => "max-age=0",
|
"Cache-Control" => "max-age=0",
|
||||||
}
|
}
|
||||||
|
|
||||||
def self.save_chapter(tmp_path : String, chapter_id : String, add_bookmark = true)
|
def self.chapter_file_name(id : String, tmp_path : String)
|
||||||
|
"#{tmp_path}/chapter-#{id}.pdf"
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, add_bookmark = true)
|
||||||
|
final_pdf_file = chapter_file_name chapter_id, tmp_path
|
||||||
|
tmp_pdf_file = "#{final_pdf_file}.tmp"
|
||||||
|
|
||||||
|
if File.exists? final_pdf_file
|
||||||
|
puts "#{chapter_id} already downloaded"
|
||||||
|
return
|
||||||
|
end
|
||||||
|
|
||||||
url = "https://muse.jhu.edu/chapter/#{chapter_id}"
|
url = "https://muse.jhu.edu/chapter/#{chapter_id}"
|
||||||
headers = HEADERS.merge({
|
headers = HEADERS.merge({
|
||||||
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
||||||
})
|
})
|
||||||
|
|
||||||
begin
|
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
|
||||||
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
|
File.open(tmp_pdf_file, "w") do |file|
|
||||||
File.open("#{tmp_path}/chapter-#{chapter_id}.pdf", "w") do |file|
|
IO.copy(response.body_io, file)
|
||||||
IO.copy(response.body_io, file)
|
|
||||||
end
|
|
||||||
rescue e : Exception
|
|
||||||
puts e.message
|
|
||||||
raise e
|
|
||||||
# We catch a temporary redirect
|
|
||||||
# https://github.com/mamantoha/crest/blob/29a690726902c71884f9c80f0f9565256e74b7fd/src/crest/exceptions.cr#L20-L28
|
|
||||||
end
|
end
|
||||||
rescue e : Exception
|
|
||||||
puts "FICK"
|
|
||||||
raise e
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
pdftk = Muse::Dl::Pdftk.new tmp_path
|
||||||
|
|
||||||
|
pdftk.strip_first_page tmp_pdf_file
|
||||||
|
|
||||||
|
if add_bookmark
|
||||||
|
# Run pdftk and add the bookmark to the file
|
||||||
|
pdftk.add_bookmark tmp_pdf_file, chapter_title
|
||||||
|
end
|
||||||
|
|
||||||
|
# Now we can move the file to the proper PDF filename
|
||||||
|
File.rename tmp_pdf_file, final_pdf_file
|
||||||
|
puts "Downloaded #{chapter_id}"
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.get_info(url : String) : Muse::Dl::Thing | Nil
|
def self.get_info(url : String) : Muse::Dl::Thing | Nil
|
||||||
|
|
|
@ -16,8 +16,12 @@ module Muse::Dl
|
||||||
|
|
||||||
if thing.is_a? Muse::Dl::Book
|
if thing.is_a? Muse::Dl::Book
|
||||||
thing.chapters.each do |chapter|
|
thing.chapters.each do |chapter|
|
||||||
Fetch.save_chapter(parser.tmp, chapter[0])
|
Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.bookmarks)
|
||||||
end
|
end
|
||||||
|
chapter_ids = thing.chapters.map { |c| c[0] }
|
||||||
|
pdf_builder = Pdftk.new(parser.tmp)
|
||||||
|
pdf_builder.stitch(parser.output, chapter_ids)
|
||||||
|
puts "Saved final output to #{parser.output}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
59
src/pdftk.cr
59
src/pdftk.cr
|
@ -1,18 +1,69 @@
|
||||||
require "process"
|
require "process"
|
||||||
|
require "file"
|
||||||
|
require "./fetch"
|
||||||
|
require "./errors/*"
|
||||||
|
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Pdftk
|
class Pdftk
|
||||||
PDFTK_BINARY_NAME = "pdftk"
|
PDFTK_BINARY_NAME = "pdftk"
|
||||||
@binary : String | Nil
|
@binary = "/usr/sbin/pdftk"
|
||||||
|
@tmp_file_path : String
|
||||||
|
|
||||||
getter :binary
|
getter :binary
|
||||||
|
|
||||||
def initialize
|
def initialize(tmp_file_path : String)
|
||||||
@binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME)
|
@tmp_file_path = tmp_file_path
|
||||||
if !@binary
|
|
||||||
|
possible_binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME)
|
||||||
|
if possible_binary
|
||||||
|
@binary = possible_binary
|
||||||
|
else
|
||||||
puts "Could not find pdftk binary, exiting"
|
puts "Could not find pdftk binary, exiting"
|
||||||
Process.exit(1)
|
Process.exit(1)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def execute(args : Array(String))
|
||||||
|
Process.run(@binary, args)
|
||||||
|
end
|
||||||
|
|
||||||
|
def strip_first_page(input_file : String)
|
||||||
|
output_pdf = File.tempfile("muse-dl-temp", ".pdf")
|
||||||
|
execute [input_file, "cat", "2-end", "output", output_pdf.path]
|
||||||
|
File.rename output_pdf.path, input_file
|
||||||
|
end
|
||||||
|
|
||||||
|
def add_bookmark(input_file : String, title : String)
|
||||||
|
output_pdf = File.tempfile("muse-dl-temp", ".pdf")
|
||||||
|
bookmark_text_file = File.tempfile("muse-dl-chapter-tmp", ".txt")
|
||||||
|
bookmark_text = <<-END
|
||||||
|
BookmarkBegin
|
||||||
|
BookmarkTitle: #{title}
|
||||||
|
BookmarkLevel: 1
|
||||||
|
BookmarkPageNumber: 1
|
||||||
|
END
|
||||||
|
File.write(bookmark_text_file.path, bookmark_text)
|
||||||
|
execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
bookmark_text_file.delete
|
||||||
|
File.rename output_pdf.path, input_file
|
||||||
|
end
|
||||||
|
|
||||||
|
def stitch(output_file : String, chapter_ids : Array(String))
|
||||||
|
# Do some sanity checks on each Chapter PDF
|
||||||
|
chapter_ids.each do |id|
|
||||||
|
raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||||
|
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Now let's stitch them together
|
||||||
|
|
||||||
|
chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }
|
||||||
|
args = chapter_files + ["cat", "output", output_file]
|
||||||
|
execute args
|
||||||
|
|
||||||
|
# TODO: Validate final file here
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue