Adds stitching and first page strip

This commit is contained in:
Nemo 2020-03-29 17:51:01 +05:30
parent 376adf28cb
commit d981000d14
7 changed files with 100 additions and 23 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@
/bin/
/.shards/
*.dwarf
*.pdf

View File

@ -2,7 +2,7 @@ version: 1.0
shards:
crest:
github: mamantoha/crest
version: 0.24.0
version: 0.24.1
http-client-digest_auth:
github: mamantoha/http-client-digest_auth

View File

@ -0,0 +1,4 @@
module Muse::Dl::Errors
# Raised by Pdftk#stitch when a chapter PDF exists on disk but its size is zero.
class CorruptFile < Exception
end
end

View File

@ -0,0 +1,4 @@
module Muse::Dl::Errors
# Raised by Pdftk#stitch when the PDF file for a requested chapter id does not exist.
class MissingChapter < Exception
end
end

View File

@ -4,9 +4,7 @@ require "./errors/*"
module Muse::Dl
class Fetch
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
# TODO: Add support for cookies?
# "Cookie" => "session=124.123.104.8.1585388207021325",
HEADERS = {
HEADERS = {
"User-Agent" => USER_AGENT,
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language" => "en-US,en;q=0.5",
@ -17,27 +15,42 @@ module Muse::Dl
"Cache-Control" => "max-age=0",
}
def self.save_chapter(tmp_path : String, chapter_id : String, add_bookmark = true)
# Returns the path of the finished PDF for chapter `id` inside `tmp_path`,
# in the form "<tmp_path>/chapter-<id>.pdf".
# Both save_chapter and Pdftk#stitch use this to agree on where a chapter lives.
def self.chapter_file_name(id : String, tmp_path : String)
"#{tmp_path}/chapter-#{id}.pdf"
end
# Downloads the PDF for one chapter into `tmp_path`, strips its first page,
# optionally adds a bookmark titled `chapter_title`, and finally renames the
# result to the path given by chapter_file_name.
# Returns early (after printing a notice) when that final file already exists.
# Any exception raised during the download is re-raised to the caller.
# NOTE(review): this listing is a rendered diff; inside the begin/rescue section
# below, lines of the previous implementation are interleaved with the new ones.
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, add_bookmark = true)
final_pdf_file = chapter_file_name chapter_id, tmp_path
# Work on a ".tmp" sibling; it only receives the final name after post-processing (see the rename below).
tmp_pdf_file = "#{final_pdf_file}.tmp"
# The final file is used as the "already done" marker, so re-runs skip finished chapters.
if File.exists? final_pdf_file
puts "#{chapter_id} already downloaded"
return
end
url = "https://muse.jhu.edu/chapter/#{chapter_id}"
# Request headers are the shared HEADERS plus a per-chapter Referer.
headers = HEADERS.merge({
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
})
begin
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
File.open("#{tmp_path}/chapter-#{chapter_id}.pdf", "w") do |file|
IO.copy(response.body_io, file)
end
rescue e : Exception
puts e.message
raise e
# We catch a temporary redirect
# https://github.com/mamantoha/crest/blob/29a690726902c71884f9c80f0f9565256e74b7fd/src/crest/exceptions.cr#L20-L28
Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response|
# Stream the response body straight into the temporary file.
File.open(tmp_pdf_file, "w") do |file|
IO.copy(response.body_io, file)
end
rescue e : Exception
# NOTE(review): "FICK" looks like a leftover debug placeholder; the previous
# version printed e.message here. Confirm the intended message.
puts "FICK"
raise e
end
pdftk = Muse::Dl::Pdftk.new tmp_path
pdftk.strip_first_page tmp_pdf_file
if add_bookmark
# Run pdftk and add the bookmark to the file
pdftk.add_bookmark tmp_pdf_file, chapter_title
end
# Now we can move the file to the proper PDF filename
File.rename tmp_pdf_file, final_pdf_file
puts "Downloaded #{chapter_id}"
end
def self.get_info(url : String) : Muse::Dl::Thing | Nil

View File

@ -16,8 +16,12 @@ module Muse::Dl
if thing.is_a? Muse::Dl::Book
thing.chapters.each do |chapter|
Fetch.save_chapter(parser.tmp, chapter[0])
Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.bookmarks)
end
chapter_ids = thing.chapters.map { |c| c[0] }
pdf_builder = Pdftk.new(parser.tmp)
pdf_builder.stitch(parser.output, chapter_ids)
puts "Saved final output to #{parser.output}"
end
end
end

View File

@ -1,18 +1,69 @@
require "process"
require "file"
require "./fetch"
require "./errors/*"
module Muse::Dl
class Pdftk
PDFTK_BINARY_NAME = "pdftk"
@binary : String | Nil
@binary = "/usr/sbin/pdftk"
@tmp_file_path : String
getter :binary
def initialize
@binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME)
if !@binary
# Creates a pdftk wrapper that works on chapter files stored under `tmp_file_path`.
#
# Looks up the pdftk executable by name; when it cannot be found, prints a
# message and terminates the process with exit status 1.
def initialize(tmp_file_path : String)
  @tmp_file_path = tmp_file_path
  located = Process.find_executable(Pdftk::PDFTK_BINARY_NAME)
  unless located
    puts "Could not find pdftk binary, exiting"
    Process.exit(1)
  end
  # Only reached when the lookup succeeded, because Process.exit does not return.
  @binary = located
end
# Runs the pdftk binary stored in @binary with the given command-line arguments
# and returns whatever Process.run returns for that invocation.
def execute(args : Array(String))
Process.run(@binary, args)
end
# Removes the first page of the PDF at `input_file`, rewriting the file in place.
#
# pdftk writes pages 2..end to a temporary file, which is then renamed over
# `input_file`. Raises when pdftk exits unsuccessfully; in that case
# `input_file` is left untouched and the temporary file is removed.
#
# NOTE(review): File.rename may fail if the system temp directory and
# `input_file` are on different filesystems. Confirm for your deployment.
def strip_first_page(input_file : String)
  output_pdf = File.tempfile("muse-dl-temp", ".pdf")
  # Only the path is needed (pdftk writes the file itself), so release our handle.
  output_pdf.close
  status = execute [input_file, "cat", "2-end", "output", output_pdf.path]
  unless status.success?
    # Never replace the input with an empty or partial temp file.
    output_pdf.delete
    raise "pdftk failed to strip the first page of #{input_file} (exit code #{status.exit_code})"
  end
  File.rename output_pdf.path, input_file
end
# Adds a single top-level bookmark named `title`, pointing at page 1, to the
# PDF at `input_file`, rewriting the file in place.
#
# The bookmark record is written to a temporary text file and applied with
# pdftk's `update_info` operation; the result is renamed over `input_file`.
# Raises when pdftk exits unsuccessfully; `input_file` is then left untouched.
#
# NOTE(review): pdftk's `update_info` may not handle non-ASCII titles the way
# `update_info_utf8` does. Verify with a title containing accented characters.
def add_bookmark(input_file : String, title : String)
  output_pdf = File.tempfile("muse-dl-temp", ".pdf")
  bookmark_text_file = File.tempfile("muse-dl-chapter-tmp", ".txt")
  # Only the paths are used below, so release both handles right away.
  output_pdf.close
  bookmark_text_file.close

  # The info file is line-oriented: a line break inside the title would end
  # the BookmarkTitle field early and corrupt the record.
  single_line_title = title.gsub(/\s*[\r\n]+\s*/, " ")
  bookmark_text = <<-END
    BookmarkBegin
    BookmarkTitle: #{single_line_title}
    BookmarkLevel: 1
    BookmarkPageNumber: 1
    END
  File.write(bookmark_text_file.path, bookmark_text)
  status = execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path]
  # Cleanup
  bookmark_text_file.delete
  unless status.success?
    # Never replace the input with an empty or partial temp file.
    output_pdf.delete
    raise "pdftk failed to add a bookmark to #{input_file} (exit code #{status.exit_code})"
  end
  File.rename output_pdf.path, input_file
end
# Concatenates the chapter PDFs for `chapter_ids`, in the given order, into
# `output_file` using pdftk's `cat` operation.
#
# Raises Muse::Dl::Errors::MissingChapter if a chapter file does not exist,
# Muse::Dl::Errors::CorruptFile if a chapter file is empty, and a generic
# exception if pdftk itself exits unsuccessfully.
# Returns the status of the pdftk run on success.
def stitch(output_file : String, chapter_ids : Array(String))
  # Resolve every chapter path once; the checks and the pdftk call share the list.
  chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) }

  # Do some sanity checks on each Chapter PDF
  chapter_files.each do |chapter_file|
    raise Muse::Dl::Errors::MissingChapter.new unless File.exists? chapter_file
    raise Muse::Dl::Errors::CorruptFile.new unless File.size(chapter_file) > 0
  end

  # Now let's stitch them together
  status = execute chapter_files + ["cat", "output", output_file]
  # Without this check the caller would report success even though no usable
  # output file was produced.
  unless status.success?
    raise "pdftk failed to stitch chapters into #{output_file} (exit code #{status.exit_code})"
  end
  # TODO: Validate final file here
  status
end
end
end