From d981000d14b4dc32756206918fd745a214862203 Mon Sep 17 00:00:00 2001 From: Nemo Date: Sun, 29 Mar 2020 17:51:01 +0530 Subject: [PATCH] Adds stitching and first page strip --- .gitignore | 1 + shard.lock | 2 +- src/errors/corrupt_file.cr | 4 +++ src/errors/missing_chapter.cr | 4 +++ src/fetch.cr | 47 ++++++++++++++++++---------- src/muse-dl.cr | 6 +++- src/pdftk.cr | 59 ++++++++++++++++++++++++++++++++--- 7 files changed, 100 insertions(+), 23 deletions(-) create mode 100644 src/errors/corrupt_file.cr create mode 100644 src/errors/missing_chapter.cr diff --git a/.gitignore b/.gitignore index 0bb75ea..dd0aa52 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ /bin/ /.shards/ *.dwarf +*.pdf \ No newline at end of file diff --git a/shard.lock b/shard.lock index 087059c..9984f5f 100644 --- a/shard.lock +++ b/shard.lock @@ -2,7 +2,7 @@ version: 1.0 shards: crest: github: mamantoha/crest - version: 0.24.0 + version: 0.24.1 http-client-digest_auth: github: mamantoha/http-client-digest_auth diff --git a/src/errors/corrupt_file.cr b/src/errors/corrupt_file.cr new file mode 100644 index 0000000..a13477d --- /dev/null +++ b/src/errors/corrupt_file.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class CorruptFile < Exception + end +end diff --git a/src/errors/missing_chapter.cr b/src/errors/missing_chapter.cr new file mode 100644 index 0000000..b33487f --- /dev/null +++ b/src/errors/missing_chapter.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class MissingChapter < Exception + end +end diff --git a/src/fetch.cr b/src/fetch.cr index 7ddc636..840a4c8 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -4,9 +4,7 @@ require "./errors/*" module Muse::Dl class Fetch USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" - # TODO: Add support for cookies? - # "Cookie" => "session=124.123.104.8.1585388207021325", - HEADERS = { + HEADERS = { "User-Agent" => USER_AGENT, "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language" => "en-US,en;q=0.5", @@ -17,27 +15,42 @@ module Muse::Dl "Cache-Control" => "max-age=0", } - def self.save_chapter(tmp_path : String, chapter_id : String, add_bookmark = true) + def self.chapter_file_name(id : String, tmp_path : String) + "#{tmp_path}/chapter-#{id}.pdf" + end + + def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, add_bookmark = true) + final_pdf_file = chapter_file_name chapter_id, tmp_path + tmp_pdf_file = "#{final_pdf_file}.tmp" + + if File.exists? final_pdf_file + puts "#{chapter_id} already downloaded" + return + end + url = "https://muse.jhu.edu/chapter/#{chapter_id}" headers = HEADERS.merge({ "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", }) - begin - Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| - File.open("#{tmp_path}/chapter-#{chapter_id}.pdf", "w") do |file| - IO.copy(response.body_io, file) - end - rescue e : Exception - puts e.message - raise e - # We catch a temporary redirect - # https://github.com/mamantoha/crest/blob/29a690726902c71884f9c80f0f9565256e74b7fd/src/crest/exceptions.cr#L20-L28 + Crest.get(url, max_redirects: 0, handle_errors: false, headers: headers) do |response| + File.open(tmp_pdf_file, "w") do |file| + IO.copy(response.body_io, file) end - rescue e : Exception - puts "FICK" - raise e end + + pdftk = Muse::Dl::Pdftk.new tmp_path + + pdftk.strip_first_page tmp_pdf_file + + if add_bookmark + # Run pdftk and add the bookmark to the file + pdftk.add_bookmark tmp_pdf_file, chapter_title + end + + # Now we can move the file to the proper PDF filename + File.rename tmp_pdf_file, final_pdf_file + puts "Downloaded #{chapter_id}" end def self.get_info(url : String) : Muse::Dl::Thing | Nil diff --git a/src/muse-dl.cr b/src/muse-dl.cr index a3f3807..51e63d7 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -16,8 +16,12 @@ module Muse::Dl if thing.is_a? Muse::Dl::Book thing.chapters.each do |chapter| - Fetch.save_chapter(parser.tmp, chapter[0]) + Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.bookmarks) end + chapter_ids = thing.chapters.map { |c| c[0] } + pdf_builder = Pdftk.new(parser.tmp) + pdf_builder.stitch(parser.output, chapter_ids) + puts "Saved final output to #{parser.output}" end end end diff --git a/src/pdftk.cr b/src/pdftk.cr index 7eadd73..f4fe62a 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -1,18 +1,69 @@ require "process" +require "file" +require "./fetch" +require "./errors/*" module Muse::Dl class Pdftk PDFTK_BINARY_NAME = "pdftk" - @binary : String | Nil + @binary = "/usr/sbin/pdftk" + @tmp_file_path : String getter :binary - def initialize - @binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME) - if !@binary + def initialize(tmp_file_path : String) + @tmp_file_path = tmp_file_path + + possible_binary = Process.find_executable(Pdftk::PDFTK_BINARY_NAME) + if possible_binary + @binary = possible_binary + else puts "Could not find pdftk binary, exiting" Process.exit(1) end end + + def execute(args : Array(String)) + Process.run(@binary, args) + end + + def strip_first_page(input_file : String) + output_pdf = File.tempfile("muse-dl-temp", ".pdf") + execute [input_file, "cat", "2-end", "output", output_pdf.path] + File.rename output_pdf.path, input_file + end + + def add_bookmark(input_file : String, title : String) + output_pdf = File.tempfile("muse-dl-temp", ".pdf") + bookmark_text_file = File.tempfile("muse-dl-chapter-tmp", ".txt") + bookmark_text = <<-END +BookmarkBegin +BookmarkTitle: #{title} +BookmarkLevel: 1 +BookmarkPageNumber: 1 +END + File.write(bookmark_text_file.path, bookmark_text) + execute [input_file, "update_info", bookmark_text_file.path, "output", output_pdf.path] + + # Cleanup + bookmark_text_file.delete + File.rename output_pdf.path, input_file + end + + def stitch(output_file : String, chapter_ids : Array(String)) + # Do some sanity checks on each Chapter PDF + chapter_ids.each do |id| + raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0 + end + + # Now let's stitch them together + + chapter_files = chapter_ids.map { |id| Fetch.chapter_file_name(id, @tmp_file_path) } + args = chapter_files + ["cat", "output", output_file] + execute args + + # TODO: Validate final file here + end end end