Fixes parser for issue HTML

This also adds .journal_title as an attribute to the Issue object
2020-06-30 15:19:12 +05:30 · 2020-06-30 15:19:12 +05:30 · 919c8ac43f
parent 870ed3080d
commit 919c8ac43f
5 changed files with 1361 additions and 21 deletions
--- a/spec/fixtures/issue-35852.html
+++ b/spec/fixtures/issue-35852.html
--- a/spec/issue_spec.cr
+++ b/spec/issue_spec.cr
@ -37,4 +37,25 @@ describe Muse::Dl::Issue do
  it "should parse publisher" do
    issue.publisher.should eq "Johns Hopkins University Press"
  end
  # Checks the journal_title reader on the `issue` object that is set up
  # outside this hunk.
  it "should parse the journal title" do
    issue.journal_title.should eq "portal: Libraries and the Academy"
  end
  # Covers an issue whose title reads "Volume 1, Issue 2, 2016": the number is
  # expected to come from "Issue 2" and the date to be the bare year.
  it "should parse non-numbered issues" do
    # Answer the issue URL with the saved fixture body.
    WebMock.stub(:get, "https://muse.jhu.edu/issue/35852")
      .to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end)
    issue = Muse::Dl::Issue.new "35852"
    issue.parse
    # Fields derived from the title.
    issue.volume.should eq "1"
    issue.number.should eq "2"
    issue.date.should eq "2016"
    # Key/value metadata exposed through the info hash.
    issue.info["ISSN"].should eq "2474-9419"
    issue.info["Print ISSN"].should eq "2474-9427"
    issue.info["Launched on MUSE"].should eq "2017-02-21"
    issue.info["Open Access"].should eq "Yes"
    issue.title.should eq "Volume 1, Issue 2, 2016"
    issue.journal_title.should eq "Constitutional Studies"
  end
 end
--- a/src/article.cr
+++ b/src/article.cr
@ -3,8 +3,8 @@ require "./issue.cr"
 module Muse::Dl
  class Article
-    @id : String
+    getter id : String
-    getter :id
+    setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil
    def initialize(id : String)
      @id = id
--- a/src/issue.cr
+++ b/src/issue.cr
@ -1,29 +1,26 @@
-require "./thing.cr"
+"./thing.cr"
 require "./fetch.cr"
 require "./article.cr"
 module Muse::Dl
  class Issue
-    @id : String
+    getter id : String,
-    @title : String | Nil
+      title : String | Nil,
-    @articles : Array(Muse::Dl::Article)
+      articles : Array(Muse::Dl::Article),
-    @url : String
+      url : String,
-    @info : Hash(String, String)
+      summary : String | Nil,
-    @summary : String | Nil
+      publisher : String | Nil,
-    @publisher : String | Nil
+      info : Hash(String, String),
-    @volume : String | Nil
+      volume : String | Nil,
-    @number : String | Nil
+      number : String | Nil,
-    @date : String | Nil
+      date : String | Nil,
-    @issues : Array(Muse::Dl::Issue)
+      journal_title : String | Nil
    # NOTE(review): every name listed here is already covered by the typed
    # getter declaration directly above (which additionally declares
    # journal_title), so this line looks redundant — confirm and drop.
    getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date
    # Creates an Issue for the given id.
    #
    # Stores the id, builds the issue URL from it, and starts with an empty
    # info hash and empty articles / issues arrays.
    def initialize(id : String)
      @id = id
      @url = "https://muse.jhu.edu/issue/#{id}"
      @info = Hash(String, String).new
      @articles = [] of Muse::Dl::Article
      # NOTE(review): this commit removes the explicit @issues declaration and
      # the code in parse_contents that pushed to it, and no getter for it is
      # visible, yet muse-dl.cr calls thing.issues — confirm what is intended.
      @issues = [] of Muse::Dl::Issue
    end
    def parse
@ -42,17 +39,38 @@ module Muse::Dl
      unless t.nil?
        @volume = /Volume (\d+)/.match(t).try &.[1]
        @number = /Number (\d+)/.match(t).try &.[1]
        @number = /Issue (\d+)/.match(t).try &.[1] unless @number
        @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
        @date = /(\d{4})/.match(t).try &.[1] unless @date
      end
    end
    # Reads the journal title and the article list out of a parsed issue page.
    #
    # myhtml - the parsed HTML document of the issue page.
    #
    # Sets @journal_title from the "#journal_banner_title a" link, and appends
    # one Article to @articles for every link inside an ".articles_list_text ol"
    # entry whose href matches /article/<digits>/pdf.
    def parse_contents(myhtml : Myhtml::Parser)
      # NOTE(review): if css(...) returns a standard Crystal Enumerable/Iterator,
      # `.first` raises on an empty result instead of returning nil, which would
      # make the nil check on journal_title_a (and the `.try` on li.pg below)
      # ineffective — confirm whether `first?` was intended.
-      myhtml.css("#available_issues_list_text a").each do |a|
+      journal_title_a = myhtml.css("#journal_banner_title a").first
-        link = a.attribute_by("href").to_s
+      if journal_title_a
        @journal_title = journal_title_a.inner_text
      end
      # Each <ol> is handled as one entry: a title, an optional page range, and links.
      myhtml.css(".articles_list_text ol").each do |ol|
        link = ol.css("li.title a").first
        title = link.inner_text
-        matches = /\/issue\/(\d+)/.match link
+        pages = ol.css("li.pg").first.try &.inner_text
        # Page range is expected as "<start>-<end>"; start_page / end_page are
        # only assigned when the text matches.
        matches = /(\d+)-(\d+)/.match pages
        if matches
-          @issues.push Muse::Dl::Issue.new matches[1]
+          start_page = matches[1].to_i
          end_page = matches[2].to_i
        end
        # Only links pointing at an article PDF produce an Article; the title and
        # page range found above are applied to each of them.
        ol.css("a").each do |l|
          url = l.attribute_by("href").to_s
          # `matches` is reused here for the href pattern; group 1 is the article id.
          matches = /\/article\/(\d+)\/pdf/.match url
          if matches
            a = Muse::Dl::Article.new matches[1]
            a.title = title
            a.start_page = start_page if start_page
            a.end_page = end_page if end_page
            @articles.push a
          end
        end
      end
    end
--- a/src/muse-dl.cr
+++ b/src/muse-dl.cr
@ -71,6 +71,44 @@ module Muse::Dl
        # Needed because of https://github.com/crystal-lang/crystal/issues/7777
        FileUtils.cp source, destination
        FileUtils.rm source if parser.cleanup
      elsif thing.is_a? Muse::Dl::Issue
        # Will have no effect if parser has a custom title
        parser.output = Util.slug_filename "#{thing.title}.pdf"
        # If file exists and we can't clobber
        if File.exists?(parser.output) && parser.clobber == false
          STDERR.puts "Skipping #{url}, File already exists: #{parser.output}"
          return
        end
        temp_stitched_file = nil
        pdf_builder = Pdftk.new(parser.tmp)
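        # TODO (till 111): unfinished — the code down to the "####" marker iterates thing.issues but still refers to chapter / thing.chapters; confirm and port before relying on it.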
        thing.issues.each do |issue|
          begin
            Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first)
          rescue e : Muse::Dl::Errors::MuseCorruptPDF
            STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
            return
          end
        end
        chapter_ids = thing.chapters.map { |c| c[0] }
        # Stitch the PDFs together
        temp_stitched_file = pdf_builder.stitch chapter_ids
        pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
        temp_stitched_file.delete if temp_stitched_file
        puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
        puts "DL: #{url}. Saved final output to #{parser.output}"
        # Cleanup the chapter files
        if parser.cleanup
          thing.chapters.each do |c|
            Fetch.cleanup(parser.tmp, c[0])
          end
        end
        ####
      end
    end