From 03fccde754606a5d520e9bba6d215c7450898cd4 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 18:36:01 +0530 Subject: [PATCH] Adds support for final journal downloads --- src/fetch.cr | 5 +++++ src/infoparser.cr | 4 ++++ src/issue.cr | 30 +++++++++++++++++++++--------- src/journal.cr | 8 ++++++-- src/muse-dl.cr | 26 +++++++++++++++----------- src/parser.cr | 4 ++++ 6 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index 963c894..213bdc5 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -27,6 +27,11 @@ module Muse::Dl File.delete(fns) if File.exists?(fns) end + def self.cleanup_articles(tmp_path : String, id : String) + fns = article_file_name(id, tmp_path) + File.delete(fns) if File.exists?(fns) + end + def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) tmp_pdf_file = "#{file_name}.tmp" if File.exists? file_name diff --git a/src/infoparser.cr b/src/infoparser.cr index 8f10d28..7e331db 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -42,6 +42,10 @@ module Muse::Dl end end + def self.journal_title(myhtml : Myhtml::Parser) + myhtml.css("#journal_about_info .title").map(&.inner_text).to_a[0].strip + end + def self.author(myhtml : Myhtml::Parser) myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("
", ", ").gsub("\n", " ") end diff --git a/src/issue.cr b/src/issue.cr index 8f759d2..73a83fb 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -16,6 +16,8 @@ module Muse::Dl date : String | Nil, journal_title : String | Nil + setter :journal_title + def initialize(id : String, response : String | Nil = nil) @id = id @url = "https://muse.jhu.edu/issue/#{id}" @@ -31,6 +33,11 @@ module Muse::Dl false end + def parse + html = Crest.get(@url).to_s + parse(html) + end + def parse(html : String) h = Myhtml::Parser.new html @info = InfoParser.infobox(h) @@ -47,25 +54,30 @@ module Muse::Dl @volume = /Volume (\d+)/.match(t).try &.[1] @number = /Number (\d+)/.match(t).try &.[1] @number = /Issue (\d+)/.match(t).try &.[1] unless @number - @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] + @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Spring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1] @date = /(\d{4})/.match(t).try &.[1] unless @date end end def parse_contents(myhtml : Myhtml::Parser) - journal_title_a = myhtml.css("#journal_banner_title a").first - if journal_title_a - @journal_title = journal_title_a.inner_text + unless @journal_title + journal_title_a = myhtml.css("#journal_banner_title a").first + if journal_title_a + @journal_title = journal_title_a.inner_text + end end myhtml.css(".articles_list_text ol").each do |ol| link = ol.css("li.title a").first title = link.inner_text - pages = ol.css("li.pg").first.try &.inner_text - matches = /(\d+)-(\d+)/.match pages - if matches - start_page = matches[1].to_i - end_page = matches[2].to_i + pages = ol.css("li.pg") + if pages.size > 0 + p = pages.first.try &.inner_text + matches = /(\d+)-(\d+)/.match p + if matches + start_page = matches[1].to_i + end_page = matches[2].to_i + end end ol.css("a").each do |l| diff --git a/src/journal.cr b/src/journal.cr index 45b6214..d431824 100644 --- a/src/journal.cr +++ 
b/src/journal.cr @@ -3,11 +3,12 @@ require "./issue.cr" module Muse::Dl class Journal - getter :info, :summary, :publisher, :issues + getter :info, :summary, :publisher, :issues, :title @info = Hash(String, String).new @summary : String @publisher : String @issues = [] of Muse::Dl::Issue + @title : String private getter :h @@ -16,6 +17,7 @@ module Muse::Dl @info = InfoParser.infobox(h) @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) + @title = InfoParser.journal_title(h) parse_volumes(h) end @@ -32,7 +34,9 @@ module Muse::Dl matches = /\/issue\/(\d+)/.match link if matches - @issues.push Muse::Dl::Issue.new matches[1] + issue = Muse::Dl::Issue.new matches[1] + issue.journal_title = @title + @issues.push issue end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 47095ab..19693a8 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -12,6 +12,7 @@ module Muse::Dl class Main def self.dl(parser : Parser) url = parser.url + puts "Downloading #{url}" thing = Fetch.get_info(url) if url return unless thing @@ -78,7 +79,7 @@ module Muse::Dl FileUtils.rm source if parser.cleanup elsif thing.is_a? 
Muse::Dl::Issue # Will have no effect if parser has a custom title - parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" + parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" # If file exists and we can't clobber if File.exists?(parser.output) && parser.clobber == false @@ -88,7 +89,6 @@ module Muse::Dl temp_stitched_file = nil pdf_builder = Pdftk.new(parser.tmp) - # ## TODO till 111 thing.articles.each do |article| begin Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) @@ -101,21 +101,25 @@ module Muse::Dl # Stitch the PDFs together temp_stitched_file = pdf_builder.stitch_articles article_ids - # TODO: Add metadata for each Issue pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) # temp_stitched_file.delete if temp_stitched_file puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" - # Cleanup the chapter files - # TODO - # if parser.cleanup - # thing.articles.each do |c| - # Fetch.cleanup(parser.tmp, c[0]) - # end - # end - #### + # Cleanup the issue files + if parser.cleanup + thing.articles.each do |a| + Fetch.cleanup_articles(parser.tmp, a.id) + end + end + elsif thing.is_a? Muse::Dl::Journal + thing.issues.each do |issue| + # Update the issue + issue.parse + parser.url = issue.url + Main.dl parser + end end end diff --git a/src/parser.cr b/src/parser.cr index a98d27e..e462445 100644 --- a/src/parser.cr +++ b/src/parser.cr @@ -26,6 +26,10 @@ module Muse::Dl @output = output_file unless @output != DEFAULT_FILE_NAME end + def force_set_output(output_file : String) + @output = output_file + end + def reset_output_file @output = DEFAULT_FILE_NAME end