🏡 index : github.com/captn3m0/muse-dl.git

author Nemo <me@captnemo.in> 2020-06-30 18:36:01.0 +05:30:00
committer Nemo <me@captnemo.in> 2020-06-30 18:36:01.0 +05:30:00
commit
03fccde754606a5d520e9bba6d215c7450898cd4 [patch]
tree
049ad7a918a214ed5c1add95beb2ea5762300650
parent
3a2d45fb6ee2f727e5f839f1323bbff53a81691f
download
03fccde754606a5d520e9bba6d215c7450898cd4.tar.gz

Adds support for final journal downloads



Diff

 src/fetch.cr      |  5 +++++
 src/infoparser.cr |  4 ++++
 src/issue.cr      | 30 +++++++++++++++++++++++++++---
 src/journal.cr    |  8 ++++++--
 src/muse-dl.cr    | 26 ++++++++++++++++++++------
 src/parser.cr     |  4 ++++
 6 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/src/fetch.cr b/src/fetch.cr
index 963c894..213bdc5 100644
--- a/src/fetch.cr
+++ a/src/fetch.cr
@@ -27,6 +27,11 @@
      File.delete(fns) if File.exists?(fns)
    end

    # Deletes the file whose path is returned by
    # `article_file_name(id, tmp_path)`, if that file exists.
    # Does nothing when the file is absent.
    def self.cleanup_articles(tmp_path : String, id : String)
      article_path = article_file_name(id, tmp_path)
      return unless File.exists?(article_path)
      File.delete(article_path)
    end

    def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true)
      tmp_pdf_file = "#{file_name}.tmp"
      if File.exists? file_name
diff --git a/src/infoparser.cr b/src/infoparser.cr
index 8f10d28..7e331db 100644
--- a/src/infoparser.cr
+++ a/src/infoparser.cr
@@ -42,6 +42,10 @@
      end
    end

    # Returns the whitespace-stripped inner text of the first node matching
    # the CSS selector "#journal_about_info .title".
    # The first match is read unconditionally, so an empty result set raises
    # (Array#[] with an out-of-range index).
    def self.journal_title(myhtml : Myhtml::Parser)
      titles = myhtml.css("#journal_about_info .title").map(&.inner_text).to_a
      titles[0].strip
    end

    # Returns the inner text of the first node matching
    # "#book_about_info .author", stripped of surrounding whitespace, with
    # every literal "<BR>" replaced by ", " and every newline by a space.
    # The first match is read unconditionally, so an empty result set raises.
    def self.author(myhtml : Myhtml::Parser)
      authors = myhtml.css("#book_about_info .author").map(&.inner_text).to_a
      raw_author = authors[0].strip
      raw_author.gsub("<BR>", ", ").gsub("\n", " ")
    end
diff --git a/src/issue.cr b/src/issue.cr
index 8f759d2..73a83fb 100644
--- a/src/issue.cr
+++ a/src/issue.cr
@@ -16,6 +16,8 @@
      date : String | Nil,
      journal_title : String | Nil

    setter :journal_title

    def initialize(id : String, response : String | Nil = nil)
      @id = id
      @url = "https://muse.jhu.edu/issue/#{id}"
@@ -29,6 +31,11 @@
        return @info["Open Access"] == "Yes"
      end
      false
    end

    # Fetches @url with Crest, converts the response to a String and hands
    # it to the `parse(html : String)` overload.
    def parse
      parse(Crest.get(@url).to_s)
    end

    def parse(html : String)
@@ -47,25 +54,30 @@
        @volume = /Volume (\d+)/.match(t).try &.[1]
        @number = /Number (\d+)/.match(t).try &.[1]
        @number = /Issue (\d+)/.match(t).try &.[1] unless @number
        @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
        @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Sring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1]
        @date = /(\d{4})/.match(t).try &.[1] unless @date
      end
    end

    def parse_contents(myhtml : Myhtml::Parser)
      journal_title_a = myhtml.css("#journal_banner_title a").first
      if journal_title_a
        @journal_title = journal_title_a.inner_text
      unless @journal_title
        journal_title_a = myhtml.css("#journal_banner_title a").first
        if journal_title_a
          @journal_title = journal_title_a.inner_text
        end
      end
      myhtml.css(".articles_list_text ol").each do |ol|
        link = ol.css("li.title a").first
        title = link.inner_text

        pages = ol.css("li.pg").first.try &.inner_text
        matches = /(\d+)-(\d+)/.match pages
        if matches
          start_page = matches[1].to_i
          end_page = matches[2].to_i
        pages = ol.css("li.pg")
        if pages.size > 0
          p = pages.first.try &.inner_text
          matches = /(\d+)-(\d+)/.match p
          if matches
            start_page = matches[1].to_i
            end_page = matches[2].to_i
          end
        end

        ol.css("a").each do |l|
diff --git a/src/journal.cr b/src/journal.cr
index 45b6214..d431824 100644
--- a/src/journal.cr
+++ a/src/journal.cr
@@ -1,13 +1,14 @@
require "./infoparser.cr"
require "./issue.cr"

module Muse::Dl
  class Journal
    getter :info, :summary, :publisher, :issues
    getter :info, :summary, :publisher, :issues, :title
    @info = Hash(String, String).new
    @summary : String
    @publisher : String
    @issues = [] of Muse::Dl::Issue
    @title : String

    private getter :h

@@ -16,6 +17,7 @@
      @info = InfoParser.infobox(h)
      @summary = InfoParser.summary(h)
      @publisher = InfoParser.journal_publisher(h)
      @title = InfoParser.journal_title(h)
      parse_volumes(h)
    end

@@ -32,7 +34,9 @@

        matches = /\/issue\/(\d+)/.match link
        if matches
          @issues.push Muse::Dl::Issue.new matches[1]
          issue = Muse::Dl::Issue.new matches[1]
          issue.journal_title = @title
          @issues.push issue
        end
      end
    end
diff --git a/src/muse-dl.cr b/src/muse-dl.cr
index 47095ab..19693a8 100644
--- a/src/muse-dl.cr
+++ a/src/muse-dl.cr
@@ -12,6 +12,7 @@
  class Main
    def self.dl(parser : Parser)
      url = parser.url
      puts "Downloading #{url}"
      thing = Fetch.get_info(url) if url
      return unless thing

@@ -78,7 +79,7 @@
        FileUtils.rm source if parser.cleanup
      elsif thing.is_a? Muse::Dl::Issue
        # Will have no effect if parser has a custom title
        parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
        parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"

        # If file exists and we can't clobber
        if File.exists?(parser.output) && parser.clobber == false
@@ -88,7 +89,6 @@
        temp_stitched_file = nil
        pdf_builder = Pdftk.new(parser.tmp)

        # ## TODO till 111
        thing.articles.each do |article|
          begin
            Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
@@ -101,21 +101,25 @@

        # Stitch the PDFs together
        temp_stitched_file = pdf_builder.stitch_articles article_ids
        # TODO: Add metadata for each Issue
        pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)

        # temp_stitched_file.delete if temp_stitched_file
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
        puts "DL: #{url}. Saved final output to #{parser.output}"

        # Cleanup the chapter files
        # TODO
        # if parser.cleanup
        #   thing.articles.each do |c|
        #     Fetch.cleanup(parser.tmp, c[0])
        #   end
        # end
        ####
        # Cleanup the issue files
        if parser.cleanup
          thing.articles.each do |a|
            Fetch.cleanup_articles(parser.tmp, a.id)
          end
        end
      elsif thing.is_a? Muse::Dl::Journal
        thing.issues.each do |issue|
          # Update the issue
          issue.parse
          parser.url = issue.url
          Main.dl parser
        end
      end
    end

diff --git a/src/parser.cr b/src/parser.cr
index a98d27e..e462445 100644
--- a/src/parser.cr
+++ a/src/parser.cr
@@ -26,6 +26,10 @@
      @output = output_file unless @output != DEFAULT_FILE_NAME
    end

    # Sets @output to `output_file` unconditionally; no comparison against
    # DEFAULT_FILE_NAME is made, so any existing value is overwritten.
    def force_set_output(output_file : String)
      @output = output_file
    end

    # Sets @output back to DEFAULT_FILE_NAME.
    def reset_output_file
      @output = DEFAULT_FILE_NAME
    end