mirror of https://github.com/captn3m0/muse-dl.git
Adds support for final journal downloads
This commit is contained in:
parent
3a2d45fb6e
commit
03fccde754
|
@ -27,6 +27,11 @@ module Muse::Dl
|
||||||
File.delete(fns) if File.exists?(fns)
|
File.delete(fns) if File.exists?(fns)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Removes the temporary file saved for a single article, if one exists.
#
# The path is resolved through `article_file_name(id, tmp_path)`; when no
# file is present at that path nothing is deleted.
def self.cleanup_articles(tmp_path : String, id : String)
  article_path = article_file_name(id, tmp_path)
  if File.exists?(article_path)
    File.delete(article_path)
  end
end
|
||||||
|
|
||||||
def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true)
|
def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true)
|
||||||
tmp_pdf_file = "#{file_name}.tmp"
|
tmp_pdf_file = "#{file_name}.tmp"
|
||||||
if File.exists? file_name
|
if File.exists? file_name
|
||||||
|
|
|
@ -42,6 +42,10 @@ module Muse::Dl
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Extracts the journal title: the stripped inner text of the first node
# matching `#journal_about_info .title`.
#
# NOTE(review): the `[0]` lookup is not nil-safe, so a page without that
# node raises instead of returning nil.
def self.journal_title(myhtml : Myhtml::Parser)
  title_texts = myhtml.css("#journal_about_info .title").map(&.inner_text).to_a
  title_texts[0].strip
end
|
||||||
|
|
||||||
# Extracts the author line: the inner text of the first node matching
# `#book_about_info .author`, stripped, with literal "<BR>" markers turned
# into ", " and newlines turned into single spaces.
#
# NOTE(review): the `[0]` lookup is not nil-safe, so a page without that
# node raises instead of returning nil.
def self.author(myhtml : Myhtml::Parser)
  author_texts = myhtml.css("#book_about_info .author").map(&.inner_text).to_a
  raw_author = author_texts[0].strip
  raw_author.gsub("<BR>", ", ").gsub("\n", " ")
end
|
||||||
|
|
30
src/issue.cr
30
src/issue.cr
|
@ -16,6 +16,8 @@ module Muse::Dl
|
||||||
date : String | Nil,
|
date : String | Nil,
|
||||||
journal_title : String | Nil
|
journal_title : String | Nil
|
||||||
|
|
||||||
|
setter :journal_title
|
||||||
|
|
||||||
def initialize(id : String, response : String | Nil = nil)
|
def initialize(id : String, response : String | Nil = nil)
|
||||||
@id = id
|
@id = id
|
||||||
@url = "https://muse.jhu.edu/issue/#{id}"
|
@url = "https://muse.jhu.edu/issue/#{id}"
|
||||||
|
@ -31,6 +33,11 @@ module Muse::Dl
|
||||||
false
|
false
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Fetches `@url` with Crest, converts the response with `to_s`, and hands
# the resulting string to the `parse(html : String)` overload.
def parse
  parse(Crest.get(@url).to_s)
end
|
||||||
|
|
||||||
def parse(html : String)
|
def parse(html : String)
|
||||||
h = Myhtml::Parser.new html
|
h = Myhtml::Parser.new html
|
||||||
@info = InfoParser.infobox(h)
|
@info = InfoParser.infobox(h)
|
||||||
|
@ -47,25 +54,30 @@ module Muse::Dl
|
||||||
@volume = /Volume (\d+)/.match(t).try &.[1]
|
@volume = /Volume (\d+)/.match(t).try &.[1]
|
||||||
@number = /Number (\d+)/.match(t).try &.[1]
|
@number = /Number (\d+)/.match(t).try &.[1]
|
||||||
@number = /Issue (\d+)/.match(t).try &.[1] unless @number
|
@number = /Issue (\d+)/.match(t).try &.[1] unless @number
|
||||||
@date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1]
|
@date = /((January|February|March|April|May|June|July|August|September|October|November|December|Spring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1]
|
||||||
@date = /(\d{4})/.match(t).try &.[1] unless @date
|
@date = /(\d{4})/.match(t).try &.[1] unless @date
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse_contents(myhtml : Myhtml::Parser)
|
def parse_contents(myhtml : Myhtml::Parser)
|
||||||
journal_title_a = myhtml.css("#journal_banner_title a").first
|
unless @journal_title
|
||||||
if journal_title_a
|
journal_title_a = myhtml.css("#journal_banner_title a").first
|
||||||
@journal_title = journal_title_a.inner_text
|
if journal_title_a
|
||||||
|
@journal_title = journal_title_a.inner_text
|
||||||
|
end
|
||||||
end
|
end
|
||||||
myhtml.css(".articles_list_text ol").each do |ol|
|
myhtml.css(".articles_list_text ol").each do |ol|
|
||||||
link = ol.css("li.title a").first
|
link = ol.css("li.title a").first
|
||||||
title = link.inner_text
|
title = link.inner_text
|
||||||
|
|
||||||
pages = ol.css("li.pg").first.try &.inner_text
|
pages = ol.css("li.pg")
|
||||||
matches = /(\d+)-(\d+)/.match pages
|
if pages.size > 0
|
||||||
if matches
|
p = pages.first.try &.inner_text
|
||||||
start_page = matches[1].to_i
|
matches = /(\d+)-(\d+)/.match p
|
||||||
end_page = matches[2].to_i
|
if matches
|
||||||
|
start_page = matches[1].to_i
|
||||||
|
end_page = matches[2].to_i
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
ol.css("a").each do |l|
|
ol.css("a").each do |l|
|
||||||
|
|
|
@ -3,11 +3,12 @@ require "./issue.cr"
|
||||||
|
|
||||||
module Muse::Dl
|
module Muse::Dl
|
||||||
class Journal
|
class Journal
|
||||||
getter :info, :summary, :publisher, :issues
|
getter :info, :summary, :publisher, :issues, :title
|
||||||
@info = Hash(String, String).new
|
@info = Hash(String, String).new
|
||||||
@summary : String
|
@summary : String
|
||||||
@publisher : String
|
@publisher : String
|
||||||
@issues = [] of Muse::Dl::Issue
|
@issues = [] of Muse::Dl::Issue
|
||||||
|
@title : String
|
||||||
|
|
||||||
private getter :h
|
private getter :h
|
||||||
|
|
||||||
|
@ -16,6 +17,7 @@ module Muse::Dl
|
||||||
@info = InfoParser.infobox(h)
|
@info = InfoParser.infobox(h)
|
||||||
@summary = InfoParser.summary(h)
|
@summary = InfoParser.summary(h)
|
||||||
@publisher = InfoParser.journal_publisher(h)
|
@publisher = InfoParser.journal_publisher(h)
|
||||||
|
@title = InfoParser.journal_title(h)
|
||||||
parse_volumes(h)
|
parse_volumes(h)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -32,7 +34,9 @@ module Muse::Dl
|
||||||
|
|
||||||
matches = /\/issue\/(\d+)/.match link
|
matches = /\/issue\/(\d+)/.match link
|
||||||
if matches
|
if matches
|
||||||
@issues.push Muse::Dl::Issue.new matches[1]
|
issue = Muse::Dl::Issue.new matches[1]
|
||||||
|
issue.journal_title = @title
|
||||||
|
@issues.push issue
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -12,6 +12,7 @@ module Muse::Dl
|
||||||
class Main
|
class Main
|
||||||
def self.dl(parser : Parser)
|
def self.dl(parser : Parser)
|
||||||
url = parser.url
|
url = parser.url
|
||||||
|
puts "Downloading #{url}"
|
||||||
thing = Fetch.get_info(url) if url
|
thing = Fetch.get_info(url) if url
|
||||||
return unless thing
|
return unless thing
|
||||||
|
|
||||||
|
@ -78,7 +79,7 @@ module Muse::Dl
|
||||||
FileUtils.rm source if parser.cleanup
|
FileUtils.rm source if parser.cleanup
|
||||||
elsif thing.is_a? Muse::Dl::Issue
|
elsif thing.is_a? Muse::Dl::Issue
|
||||||
# Will have no effect if parser has a custom title
|
# Will have no effect if parser has a custom title
|
||||||
parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
|
parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
|
||||||
|
|
||||||
# If file exists and we can't clobber
|
# If file exists and we can't clobber
|
||||||
if File.exists?(parser.output) && parser.clobber == false
|
if File.exists?(parser.output) && parser.clobber == false
|
||||||
|
@ -88,7 +89,6 @@ module Muse::Dl
|
||||||
temp_stitched_file = nil
|
temp_stitched_file = nil
|
||||||
pdf_builder = Pdftk.new(parser.tmp)
|
pdf_builder = Pdftk.new(parser.tmp)
|
||||||
|
|
||||||
# ## TODO till 111
|
|
||||||
thing.articles.each do |article|
|
thing.articles.each do |article|
|
||||||
begin
|
begin
|
||||||
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
|
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
|
||||||
|
@ -101,21 +101,25 @@ module Muse::Dl
|
||||||
|
|
||||||
# Stitch the PDFs together
|
# Stitch the PDFs together
|
||||||
temp_stitched_file = pdf_builder.stitch_articles article_ids
|
temp_stitched_file = pdf_builder.stitch_articles article_ids
|
||||||
# TODO: Add metadata for each Issue
|
|
||||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||||
|
|
||||||
# temp_stitched_file.delete if temp_stitched_file
|
# temp_stitched_file.delete if temp_stitched_file
|
||||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||||
|
|
||||||
# Cleanup the chapter files
|
# Cleanup the issue files
|
||||||
# TODO
|
if parser.cleanup
|
||||||
# if parser.cleanup
|
thing.articles.each do |a|
|
||||||
# thing.articles.each do |c|
|
Fetch.cleanup_articles(parser.tmp, a.id)
|
||||||
# Fetch.cleanup(parser.tmp, c[0])
|
end
|
||||||
# end
|
end
|
||||||
# end
|
elsif thing.is_a? Muse::Dl::Journal
|
||||||
####
|
thing.issues.each do |issue|
|
||||||
|
# Update the issue
|
||||||
|
issue.parse
|
||||||
|
parser.url = issue.url
|
||||||
|
Main.dl parser
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,10 @@ module Muse::Dl
|
||||||
@output = output_file unless @output != DEFAULT_FILE_NAME
|
@output = output_file unless @output != DEFAULT_FILE_NAME
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Sets the output file name unconditionally, replacing whatever value
# `@output` currently holds.
def force_set_output(output_file : String)
|
||||||
|
@output = output_file
|
||||||
|
end
|
||||||
|
|
||||||
# Restores `@output` to `DEFAULT_FILE_NAME`.
def reset_output_file
|
def reset_output_file
|
||||||
@output = DEFAULT_FILE_NAME
|
@output = DEFAULT_FILE_NAME
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue