🏡 index : github.com/captn3m0/muse-dl.git

author Nemo <me@captnemo.in> 2020-06-30 17:36:44.0 +05:30:00
committer Nemo <me@captnemo.in> 2020-06-30 17:36:44.0 +05:30:00
commit
62e6a21c84695786e64f7aa8ab51866e2e5c99a7 [patch]
tree
658a219341aeb6e7cd54c42f4ba7008e6cfe81d2
parent
38db0dd000fd62b77be6fdc0b66014bf3e238ebd
download
62e6a21c84695786e64f7aa8ab51866e2e5c99a7.tar.gz

Finishes support for downloading complete issues



Diff

 src/fetch.cr                  |  2 +-
 src/infoparser.cr             |  6 +++++-
 src/issue.cr                  |  8 ++++----
 src/muse-dl.cr                | 28 ++++++++++++++++++----------
 src/pdftk.cr                  | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 src/errors/missing_chapter.cr |  4 ----
 src/errors/missing_file.cr    |  4 ++++
 7 files changed, 107 insertions(+), 31 deletions(-)

diff --git a/src/fetch.cr b/src/fetch.cr
index ef15d29..963c894 100644
--- a/src/fetch.cr
+++ a/src/fetch.cr
@@ -132,7 +132,7 @@
          when "journal"
            return Muse::Dl::Journal.new response
          when "issue"
            return Muse::Dl::Issue.new response
            return Muse::Dl::Issue.new match[2], response
          when "article"
            return Muse::Dl::Article.new match[2]
          end
diff --git a/src/infoparser.cr b/src/infoparser.cr
index 836c9e3..8f10d28 100644
--- a/src/infoparser.cr
+++ a/src/infoparser.cr
@@ -35,7 +35,11 @@
    end

    def self.issue_title(myhtml : Myhtml::Parser)
      myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
      begin
        myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip
      rescue
        nil
      end
    end

    def self.author(myhtml : Myhtml::Parser)
diff --git a/src/issue.cr b/src/issue.cr
index 4a0c97a..22c8dae 100644
--- a/src/issue.cr
+++ a/src/issue.cr
@@ -16,15 +16,15 @@
      date : String | Nil,
      journal_title : String | Nil

    def initialize(id : String)
    def initialize(id : String, response : String | Nil = nil)
      @id = id
      @url = "https://muse.jhu.edu/issue/#{id}"
      @info = Hash(String, String).new
      @articles = [] of Muse::Dl::Article
      parse(response) if response
      @info = Hash(String, String).new
    end

    def parse
      html = Crest.get(url).to_s
    def parse(html : String)
      h = Myhtml::Parser.new html
      @info = InfoParser.infobox(h)
      @title = InfoParser.issue_title(h)
diff --git a/src/muse-dl.cr b/src/muse-dl.cr
index 656dab1..07e6f25 100644
--- a/src/muse-dl.cr
+++ a/src/muse-dl.cr
@@ -47,7 +47,7 @@
        pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)

        temp_stitched_file.delete if temp_stitched_file
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
        puts "DL: #{url}. Saved final output to #{parser.output}"

        # Cleanup the chapter files
@@ -73,7 +73,7 @@
        FileUtils.rm source if parser.cleanup
      elsif thing.is_a? Muse::Dl::Issue
        # Will have no effect if parser has a custom title
        parser.output = Util.slug_filename "#{thing.title}.pdf"
        parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"

        # If file exists and we can't clobber
        if File.exists?(parser.output) && parser.clobber == false
@@ -84,30 +84,32 @@
        pdf_builder = Pdftk.new(parser.tmp)

        # ## TODO till 111
        thing.issues.each do |issue|
        thing.articles.each do |article|
          begin
            Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
            Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
          rescue e : Muse::Dl::Errors::MuseCorruptPDF
            STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
            return
          end
        end
        chapter_ids = thing.chapters.map { |c| c[0] }
        article_ids = thing.articles.map { |a| a.id }

        # Stitch the PDFs together
        temp_stitched_file = pdf_builder.stitch chapter_ids
        temp_stitched_file = pdf_builder.stitch_articles article_ids
        # TODO: Add metadata for each Issue
        pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)

        temp_stitched_file.delete if temp_stitched_file
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
        # temp_stitched_file.delete if temp_stitched_file
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
        puts "DL: #{url}. Saved final output to #{parser.output}"

        # Cleanup the chapter files
        if parser.cleanup
          thing.chapters.each do |c|
            Fetch.cleanup(parser.tmp, c[0])
          end
        end
        # TODO
        # if parser.cleanup
        #   thing.articles.each do |c|
        #     Fetch.cleanup(parser.tmp, c[0])
        #   end
        # end
        ####
      end
    end
diff --git a/src/pdftk.cr b/src/pdftk.cr
index 4b5a01c..41ebcf0 100644
--- a/src/pdftk.cr
+++ a/src/pdftk.cr
@@ -70,7 +70,6 @@

    def add_metadata(input_file : File, output_file : String, book : Book)
      # First we have to dump the current metadata
      metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
      keywords = "Publisher:#{book.publisher}, Published:#{book.date}"

      # Known Info keys, if they are present
@@ -79,8 +78,13 @@
          keywords += ", #{label}:#{book.info[label]}"
        end
      end

      metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author)
      write_metadata(input_file, output_file, metadata_text)
    end

      text = <<-EOT

    def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil)
      metadata = <<-EOT

      InfoBegin
      InfoKey: Creator
      InfoValue:
@@ -89,37 +93,80 @@
      InfoValue:
      InfoBegin
      InfoKey: Title
      InfoValue: #{book.title}
      InfoValue: #{title}
      InfoBegin
      InfoKey: Keywords
      InfoValue: #{keywords}
      InfoBegin
      InfoKey: Author
      InfoValue: #{book.author}
      InfoBegin
      InfoKey: Subject
      InfoValue: #{book.summary.gsub(/\n\s+/, " ")}
      InfoValue: #{subject}
      InfoBegin
      InfoKey: ModDate
      InfoValue:
      InfoBegin
      InfoKey: CreationDate
      InfoValue:

      EOT

      unless author.nil?
        metadata += <<-EOT

        InfoBegin
        InfoKey: Author
        InfoValue: #{author}
        EOT
      end

      return metadata
    end

    # Applies a metadata description to a PDF.
    #
    # `text` is written to a temporary ".txt" file, which is then handed to
    # `execute` together with the `update_info_utf8` operation so that the
    # result is written to `output_file`.
    #
    # input_file  - the PDF whose metadata should be updated
    # output_file - path of the PDF to produce
    # text        - metadata description (see `gen_metadata`)
    #
    # Raises Muse::Dl::Errors::PDFOperationError when `execute` reports failure.
    def write_metadata(input_file : File, output_file : String, text)
      metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
      begin
        File.write(metadata_text_file.path, text)

        is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
        unless is_success
          raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
        end
      ensure
        # Always remove the scratch file, including when the operation above
        # failed and we are about to propagate the error.
        metadata_text_file.delete
      end
    end

    # Builds the metadata for a journal issue and writes it into the PDF.
    #
    # The keywords string carries the journal title, publication date, volume
    # and number, followed by any of the known info labels present in
    # `issue.info`. A missing summary or title is replaced by "NA".
    #
    # input_file  - the stitched PDF to annotate
    # output_file - path of the final PDF to produce
    # issue       - the issue whose details are embedded
    #
    # The temporary metadata file is created (and removed) by `write_metadata`,
    # so none is allocated here.
    def add_metadata(input_file : File, output_file : String, issue : Issue)
      keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}"
      ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label|
        if issue.info.has_key? label
          keywords += ", #{label}:#{issue.info[label]}"
        end
      end

      # TODO: Move this to Issue class
      # Collapse line breaks (and the indentation following them) into single
      # spaces; fall back to "NA" when the issue has no summary.
      summary = issue.summary.try(&.gsub(/\n\s+/, " ")) || "NA"
      title = issue.title || "NA"

      # TODO: Add support for all authors in the PDF
      metadata = gen_metadata(title, keywords, summary)
      write_metadata(input_file, output_file, metadata)
    end

    def stitch(chapter_ids : Array(String))
      output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
      # Do some sanity checks on each Chapter PDF
      chapter_ids.each do |id|
        raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
        raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
        raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
      end

@@ -132,6 +179,29 @@
      # TODO: Validate final file here
      if !is_success
        raise Muse::Dl::Errors::PDFOperationError.new("Error stitching chapters together.")
      end

      return output_file
    end

    # TODO: Merge with stitch
    def stitch_articles(article_ids : Array(String))
      output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
      # Do some sanity checks on each Chapter PDF
      article_ids.each do |id|
        raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.article_file_name(id, @tmp_file_path)
        raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.article_file_name(id, @tmp_file_path)) > 0
      end

      # Now let's stitch them together
      article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) }
      args = article_files + ["cat", "output", output_file.path]
      is_success = execute args

      # TODO: Validate final file here
      if !is_success
        puts args
        raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.")
      end

      return output_file
diff --git a/src/errors/missing_chapter.cr b/src/errors/missing_chapter.cr
deleted file mode 100644
index b33487f..0000000 100644
--- a/src/errors/missing_chapter.cr
+++ /dev/null
@@ -1,4 +1,0 @@
module Muse::Dl::Errors
  class MissingChapter < Exception
  end
end
diff --git a/src/errors/missing_file.cr b/src/errors/missing_file.cr
new file mode 100644
index 0000000..f11f6fd 100644
--- /dev/null
+++ a/src/errors/missing_file.cr
@@ -1,0 +1,4 @@
module Muse::Dl::Errors
  # Raised during PDF stitching when a chapter or article file that should
  # have been downloaded does not exist on disk.
  class MissingFile < Exception
  end
end