Minor fixes, parse contents for issues

This commit is contained in:
Nemo 2020-06-30 14:08:28 +05:30
parent aa392eaa64
commit 04a2fe52ec
4 changed files with 16 additions and 2 deletions

View File

@ -1,4 +1,5 @@
require "./spec_helper"
require "webmock"
# require "errors/muse_corrupt_pdf.cr"
describe Muse::Dl::Book do

View File

@ -63,7 +63,6 @@ module Muse::Dl
content_type = response.headers["Content-Type"]
if content_type.is_a? String
if /html/.match content_type
puts response
response.body.each_line do |line|
# https://muse.jhu.edu/chapter/2383438/pdf
# https://muse.jhu.edu/book/67393

View File

@ -14,6 +14,7 @@ module Muse::Dl
@volume : String | Nil
@number : String | Nil
@date : String | Nil
@issues : Array(Muse::Dl::Issue)
getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date
@ -22,6 +23,7 @@ module Muse::Dl
@url = "https://muse.jhu.edu/issue/#{id}"
@info = Hash(String, String).new
@articles = [] of Muse::Dl::Article
@issues = [] of Muse::Dl::Issue
end
def parse
@ -32,6 +34,7 @@ module Muse::Dl
@summary = InfoParser.summary(h)
@publisher = InfoParser.journal_publisher(h)
parse_title
parse_contents(h)
end
def parse_title
@ -42,5 +45,16 @@ module Muse::Dl
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
end
end
# Collects the issues linked from the page's "available issues" list.
#
# Iterates over every anchor selected by `#available_issues_list_text a`,
# stringifies its `href` attribute, and looks for a `/issue/<digits>` segment.
# For each anchor that has one, a `Muse::Dl::Issue` built from the captured
# digits is appended to `@issues`; anchors without a match are skipped.
def parse_contents(myhtml : Myhtml::Parser)
  myhtml.css("#available_issues_list_text a").each do |anchor|
    href = anchor.attribute_by("href").to_s
    # Capture group 1 holds the numeric issue id; nil when the href is not an issue link.
    issue_id = /\/issue\/(\d+)/.match(href).try &.[1]
    next unless issue_id
    @issues << Muse::Dl::Issue.new(issue_id)
  end
end
end
end

View File

@ -51,7 +51,7 @@ module Muse::Dl
end
temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors."
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first
puts "DL: #{url}. Saved final output to #{parser.output}"
# Cleanup the chapter files