Adds support for final journal downloads

This commit is contained in:
Nemo 2020-06-30 18:36:01 +05:30
parent 3a2d45fb6e
commit 03fccde754
6 changed files with 55 additions and 22 deletions

View File

@ -27,6 +27,11 @@ module Muse::Dl
File.delete(fns) if File.exists?(fns) File.delete(fns) if File.exists?(fns)
end end
# Removes the temporary file belonging to a single article.
#
# The path is resolved through `article_file_name(id, tmp_path)`; the file
# is deleted only when it is present, so calling this for an article whose
# file is already gone is a no-op.
def self.cleanup_articles(tmp_path : String, id : String)
  article_path = article_file_name(id, tmp_path)
  return unless File.exists?(article_path)

  File.delete(article_path)
end
def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true)
tmp_pdf_file = "#{file_name}.tmp" tmp_pdf_file = "#{file_name}.tmp"
if File.exists? file_name if File.exists? file_name

View File

@ -42,6 +42,10 @@ module Muse::Dl
end end
end end
# Returns the stripped text of the first node matching
# `#journal_about_info .title` in the parsed document.
#
# NOTE: the first match is taken with `[0]`, so a page without any matching
# node raises instead of returning nil (same pattern as `author`).
def self.journal_title(myhtml : Myhtml::Parser)
  title_nodes = myhtml.css("#journal_about_info .title")
  title_texts = title_nodes.map { |node| node.inner_text }.to_a
  title_texts[0].strip
end
def self.author(myhtml : Myhtml::Parser) def self.author(myhtml : Myhtml::Parser)
myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("<BR>", ", ").gsub("\n", " ") myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("<BR>", ", ").gsub("\n", " ")
end end

View File

@ -16,6 +16,8 @@ module Muse::Dl
date : String | Nil, date : String | Nil,
journal_title : String | Nil journal_title : String | Nil
setter :journal_title
def initialize(id : String, response : String | Nil = nil) def initialize(id : String, response : String | Nil = nil)
@id = id @id = id
@url = "https://muse.jhu.edu/issue/#{id}" @url = "https://muse.jhu.edu/issue/#{id}"
@ -31,6 +33,11 @@ module Muse::Dl
false false
end end
# Fetches `@url` with Crest, converts the response to a String and hands it
# to the `parse(html : String)` overload.
def parse
  parse(Crest.get(@url).to_s)
end
def parse(html : String) def parse(html : String)
h = Myhtml::Parser.new html h = Myhtml::Parser.new html
@info = InfoParser.infobox(h) @info = InfoParser.infobox(h)
@ -47,25 +54,30 @@ module Muse::Dl
@volume = /Volume (\d+)/.match(t).try &.[1] @volume = /Volume (\d+)/.match(t).try &.[1]
@number = /Number (\d+)/.match(t).try &.[1] @number = /Number (\d+)/.match(t).try &.[1]
@number = /Issue (\d+)/.match(t).try &.[1] unless @number @number = /Issue (\d+)/.match(t).try &.[1] unless @number
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Spring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1]
@date = /(\d{4})/.match(t).try &.[1] unless @date @date = /(\d{4})/.match(t).try &.[1] unless @date
end end
end end
def parse_contents(myhtml : Myhtml::Parser) def parse_contents(myhtml : Myhtml::Parser)
journal_title_a = myhtml.css("#journal_banner_title a").first unless @journal_title
if journal_title_a journal_title_a = myhtml.css("#journal_banner_title a").first
@journal_title = journal_title_a.inner_text if journal_title_a
@journal_title = journal_title_a.inner_text
end
end end
myhtml.css(".articles_list_text ol").each do |ol| myhtml.css(".articles_list_text ol").each do |ol|
link = ol.css("li.title a").first link = ol.css("li.title a").first
title = link.inner_text title = link.inner_text
pages = ol.css("li.pg").first.try &.inner_text pages = ol.css("li.pg")
matches = /(\d+)-(\d+)/.match pages if pages.size > 0
if matches p = pages.first.try &.inner_text
start_page = matches[1].to_i matches = /(\d+)-(\d+)/.match p
end_page = matches[2].to_i if matches
start_page = matches[1].to_i
end_page = matches[2].to_i
end
end end
ol.css("a").each do |l| ol.css("a").each do |l|

View File

@ -3,11 +3,12 @@ require "./issue.cr"
module Muse::Dl module Muse::Dl
class Journal class Journal
getter :info, :summary, :publisher, :issues getter :info, :summary, :publisher, :issues, :title
@info = Hash(String, String).new @info = Hash(String, String).new
@summary : String @summary : String
@publisher : String @publisher : String
@issues = [] of Muse::Dl::Issue @issues = [] of Muse::Dl::Issue
@title : String
private getter :h private getter :h
@ -16,6 +17,7 @@ module Muse::Dl
@info = InfoParser.infobox(h) @info = InfoParser.infobox(h)
@summary = InfoParser.summary(h) @summary = InfoParser.summary(h)
@publisher = InfoParser.journal_publisher(h) @publisher = InfoParser.journal_publisher(h)
@title = InfoParser.journal_title(h)
parse_volumes(h) parse_volumes(h)
end end
@ -32,7 +34,9 @@ module Muse::Dl
matches = /\/issue\/(\d+)/.match link matches = /\/issue\/(\d+)/.match link
if matches if matches
@issues.push Muse::Dl::Issue.new matches[1] issue = Muse::Dl::Issue.new matches[1]
issue.journal_title = @title
@issues.push issue
end end
end end
end end

View File

@ -12,6 +12,7 @@ module Muse::Dl
class Main class Main
def self.dl(parser : Parser) def self.dl(parser : Parser)
url = parser.url url = parser.url
puts "Downloading #{url}"
thing = Fetch.get_info(url) if url thing = Fetch.get_info(url) if url
return unless thing return unless thing
@ -78,7 +79,7 @@ module Muse::Dl
FileUtils.rm source if parser.cleanup FileUtils.rm source if parser.cleanup
elsif thing.is_a? Muse::Dl::Issue elsif thing.is_a? Muse::Dl::Issue
# Will have no effect if parser has a custom title # Will have no effect if parser has a custom title
parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
# If file exists and we can't clobber # If file exists and we can't clobber
if File.exists?(parser.output) && parser.clobber == false if File.exists?(parser.output) && parser.clobber == false
@ -88,7 +89,6 @@ module Muse::Dl
temp_stitched_file = nil temp_stitched_file = nil
pdf_builder = Pdftk.new(parser.tmp) pdf_builder = Pdftk.new(parser.tmp)
# ## TODO till 111
thing.articles.each do |article| thing.articles.each do |article|
begin begin
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
@ -101,21 +101,25 @@ module Muse::Dl
# Stitch the PDFs together # Stitch the PDFs together
temp_stitched_file = pdf_builder.stitch_articles article_ids temp_stitched_file = pdf_builder.stitch_articles article_ids
# TODO: Add metadata for each Issue
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
# temp_stitched_file.delete if temp_stitched_file # temp_stitched_file.delete if temp_stitched_file
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
puts "DL: #{url}. Saved final output to #{parser.output}" puts "DL: #{url}. Saved final output to #{parser.output}"
# Cleanup the chapter files # Cleanup the issue files
# TODO if parser.cleanup
# if parser.cleanup thing.articles.each do |a|
# thing.articles.each do |c| Fetch.cleanup_articles(parser.tmp, a.id)
# Fetch.cleanup(parser.tmp, c[0]) end
# end end
# end elsif thing.is_a? Muse::Dl::Journal
#### thing.issues.each do |issue|
# Update the issue
issue.parse
parser.url = issue.url
Main.dl parser
end
end end
end end

View File

@ -26,6 +26,10 @@ module Muse::Dl
@output = output_file unless @output != DEFAULT_FILE_NAME @output = output_file unless @output != DEFAULT_FILE_NAME
end end
# Unconditionally replaces the output file name with *output_file*.
#
# Unlike the guarded assignment above, which only takes effect while
# `@output` still equals DEFAULT_FILE_NAME, this always overwrites the
# current value.
def force_set_output(output_file : String)
@output = output_file
end
def reset_output_file def reset_output_file
@output = DEFAULT_FILE_NAME @output = DEFAULT_FILE_NAME
end end