From fcc4f0c48b2c06611dd446e204fd3ac0c4dddadc Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 00:38:16 +0530 Subject: [PATCH 01/19] Clear out the Producer/Creator on the PDF --- src/pdftk.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pdftk.cr b/src/pdftk.cr index 961f439..4b5a01c 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -83,10 +83,10 @@ module Muse::Dl text = <<-EOT InfoBegin InfoKey: Creator - InfoValue: Project MUSE (https://muse.jhu.edu/) + InfoValue: InfoBegin InfoKey: Producer - InfoValue: Muse-DL/#{Muse::Dl::VERSION} + InfoValue: InfoBegin InfoKey: Title InfoValue: #{book.title} From d8702b2fcb3020fe65c92af16ac3d192f92ca8fb Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 00:38:31 +0530 Subject: [PATCH 02/19] Initial work on parsing the journal page --- spec/fixtures/journal-159.html | 1522 ++++++++++++++++++++++++++++++++ spec/journal_spec.cr | 23 + src/fetch.cr | 2 +- src/infoparser.cr | 6 +- src/issue.cr | 6 + src/journal.cr | 18 +- 6 files changed, 1573 insertions(+), 4 deletions(-) create mode 100644 spec/fixtures/journal-159.html create mode 100644 spec/journal_spec.cr create mode 100644 src/issue.cr diff --git a/spec/fixtures/journal-159.html b/spec/fixtures/journal-159.html new file mode 100644 index 0000000..e46aded --- /dev/null +++ b/spec/fixtures/journal-159.html @@ -0,0 +1,1522 @@ + + + + + + + + + + + + + + + Project MUSE - portal: Libraries and the Academy + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + +
+
+ + +
+ + +
+ + + MUSE Alert +
+ + + + +
+

About this Journal

+
+ +
+ + + +
+ +
+
+
+

Table of Contents

+
+
+ + + + +
+
+ + + + +

Volume 20, 2020

+
    + + +
  1. + Volume 20, Number 1, January 2020 + Free Access +
  2. + + + +

    Volume 19, 2019

    +
      + + +
    1. + Volume 19, Number 4, October 2019 + Free Access +
    2. + + + +
    3. + Volume 19, Number 3, July 2019 + Free Access +
    4. + + + +
    5. + Volume 19, Number 2, April 2019 + Free Access +
    6. + + + +
    7. + Volume 19, Number 1, January 2019 + Free Access +
    8. + + + +

      Volume 18, 2018

      +
        + + +
      1. + Volume 18, Number 4, October 2018 + Free Access +
      2. + + + +
      3. + Volume 18, Number 3, July 2018 + Free Access +
      4. + + + +
      5. + Volume 18, Number 2, April 2018 + Free Access +
      6. + + + +
      7. + Volume 18, Number 1, January 2018 + Free Access +
      8. + + + +

        Volume 17, 2017

        +
          + + +
        1. + Volume 17, Number 4, October 2017 + Free Access +
        2. + + + +
        3. + Volume 17, Number 3, July 2017 + Free Access +
        4. + + + +
        5. + Volume 17, Number 2, April 2017 + Free Access +
        6. + + + +
        7. + Volume 17, Number 1, January 2017 + Free Access +
        8. + + + +

          Volume 16, 2016

          +
            + + +
          1. + Volume 16, Number 4, October 2016 + Free Access +
          2. + + + +
          3. + Volume 16, Number 3, July 2016 + Free Access +
          4. + + + +
          5. + Volume 16, Number 2, April 2016 + Free Access +
          6. + + + +
          7. + Volume 16, Number 1, January 2016 + Free Access +
          8. + + + +

            Volume 15, 2015

            +
              + + +
            1. + Volume 15, Number 4, October 2015 + Free Access +
            2. + + + +
            3. + Volume 15, Number 3, July 2015 + Free Access +
            4. + + + +
            5. + Volume 15, Number 2, April 2015 + Free Access +
            6. + + + +
            7. + Volume 15, Number 1, January 2015 + Free Access +
            8. + + + +

              Volume 14, 2014

              +
                + + +
              1. + Volume 14, Number 4, October 2014 + Free Access +
              2. + + + +
              3. + Volume 14, Number 3, July 2014 + Free Access +
              4. + + + +
              5. + Volume 14, Number 2, April 2014 + Free Access +
              6. + + + +
              7. + Volume 14, Number 1, January 2014 + Free Access +
              8. + + + +

                Volume 13, 2013

                +
                  + + +
                1. + Volume 13, Number 4, October 2013 + Free Access +
                2. + + + +
                3. + Volume 13, Number 3, July 2013 + Free Access +
                4. + + + +
                5. + Volume 13, Number 2, April 2013 + Free Access +
                6. + + + +
                7. + Volume 13, Number 1, January 2013 + Free Access +
                8. + + + +

                  Volume 12, 2012

                  +
                    + + +
                  1. + Volume 12, Number 4, October 2012 + Free Access +
                  2. + + + +
                  3. + Volume 12, Number 3, July 2012 + Free Access +
                  4. + + + +
                  5. + Volume 12, Number 2, April 2012 + Free Access +
                  6. + + + +
                  7. + Volume 12, Number 1, January 2012 + Free Access +
                  8. + + + +

                    Volume 11, 2011

                    +
                      + + +
                    1. + Volume 11, Number 4, October 2011 + Free Access +
                    2. + + + +
                    3. + Volume 11, Number 3, July 2011 + Free Access +
                    4. + + + +
                    5. + Volume 11, Number 2, April 2011 + Free Access +
                    6. + + + +
                    7. + Volume 11, Number 1, January 2011 + Free Access +
                    8. + + + +

                      Volume 10, 2010

                      +
                        + + +
                      1. + Volume 10, Number 4, October 2010 + Free Access +
                      2. + + + +
                      3. + Volume 10, Number 3, July 2010 + Free Access +
                      4. + + + +
                      5. + Volume 10, Number 2, April 2010 + Free Access +
                      6. + + + +
                      7. + Volume 10, Number 1, January 2010 + Free Access +
                      8. + + + +

                        Volume 9, 2009

                        +
                          + + +
                        1. + Volume 9, Number 4, October 2009 + Free Access +
                        2. + + + +
                        3. + Volume 9, Number 3, July 2009 + Free Access +
                        4. + + + +
                        5. + Volume 9, Number 2, April 2009 + Free Access +
                        6. + + + +
                        7. + Volume 9, Number 1, January 2009 + Free Access +
                        8. + + + +

                          Volume 8, 2008

                          +
                            + + +
                          1. + Volume 8, Number 4, October 2008 + Free Access +
                          2. + + + +
                          3. + Volume 8, Number 3, July 2008 + Free Access +
                          4. + + + +
                          5. + Volume 8, Number 2, April 2008 + Free Access +
                          6. + + + +
                          7. + Volume 8, Number 1, January 2008 + Free Access +
                          8. + + + +

                            Volume 7, 2007

                            +
                              + + +
                            1. + Volume 7, Number 4, October 2007 + Free Access +
                            2. + + + +
                            3. + Volume 7, Number 3, July 2007 + Free Access +
                            4. + + + +
                            5. + Volume 7, Number 2, April 2007 + Free Access +
                            6. + + + +
                            7. + Volume 7, Number 1, January 2007 + Free Access +
                            8. + + + +

                              Volume 6, 2006

                              +
                                + + +
                              1. + Volume 6, Number 4, October 2006 + Free Access +
                              2. + + + +
                              3. + Volume 6, Number 3, July 2006 + Free Access +
                              4. + + + +
                              5. + Volume 6, Number 2, April 2006 + Free Access +
                              6. + + + +
                              7. + Volume 6, Number 1, January 2006 + Free Access +
                              8. + + + +

                                Volume 5, 2005

                                +
                                  + + +
                                1. + Volume 5, Number 4, October 2005 + Free Access +
                                2. + + + +
                                3. + Volume 5, Number 3, July 2005 + Free Access +
                                4. + + + +
                                5. + Volume 5, Number 2, April 2005 + Free Access +
                                6. + + + +
                                7. + Volume 5, Number 1, January 2005 + Free Access +
                                8. + + + +

                                  Volume 4, 2004

                                  +
                                    + + +
                                  1. + Volume 4, Number 4, October 2004 + Free Access +
                                  2. + + + +
                                  3. + Volume 4, Number 3, July 2004 + Free Access +
                                  4. + + + +
                                  5. + Volume 4, Number 2, April 2004 + Free Access +
                                  6. + + + +
                                  7. + Volume 4, Number 1, January 2004 + Free Access +
                                  8. + + + +

                                    Volume 3, 2003

                                    +
                                      + + +
                                    1. + Volume 3, Number 4, October 2003 + Free Access +
                                    2. + + + +
                                    3. + Volume 3, Number 3, July 2003 + Free Access +
                                    4. + + + +
                                    5. + Volume 3, Number 2, April 2003 + Free Access +
                                    6. + + + +
                                    7. + Volume 3, Number 1, January 2003 + Free Access +
                                    8. + + + +

                                      Volume 2, 2002

                                      +
                                        + + +
                                      1. + Volume 2, Number 4, October 2002 + Free Access +
                                      2. + + + +
                                      3. + Volume 2, Number 3, July 2002 + Free Access +
                                      4. + + + +
                                      5. + Volume 2, Number 2, April 2002 + Free Access +
                                      6. + + + +
                                      7. + Volume 2, Number 1, January 2002 + Free Access +
                                      8. + + + +

                                        Volume 1, 2001

                                        +
                                          + + +
                                        1. + Volume 1, Number 4, October 2001 + Free Access +
                                        2. + + + +
                                        3. + Volume 1, Number 3, July 2001 + Free Access +
                                        4. + + + +
                                        5. + Volume 1, Number 2, April 2001 + Free Access +
                                        6. + + + +
                                        7. + Volume 1, Number 1, January 2001 + Free Access +
                                        8. + +
                                        + + +
+ +
+ + +
+ + + + + +
+
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 1530-7131 +
+
+ + +
+
+ Print ISSN +
+
+ 1531-2542 +
+
+ + + + + + + + + +
+
+ Coverage Statement +
+
+ Vol. 1 (2001) through current issue +
+
+ + + +
+
+ Open Access +
+
+ + No + +
+
+ + + + + + +
+
+
+ + + + + +
+

Additional Materials

+
+ +
+ + + + + +
+

Additional Issue Materials

+
+
+ +
+ +
+ + +
+
+ + +
+ + + + + + + + + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + diff --git a/spec/journal_spec.cr b/spec/journal_spec.cr new file mode 100644 index 0000000..bee1b04 --- /dev/null +++ b/spec/journal_spec.cr @@ -0,0 +1,23 @@ +require "./spec_helper" + +describe Muse::Dl::Journal do + html = File.new("spec/fixtures/journal-159.html").gets_to_end + j = Muse::Dl::Journal.new html + + it "it should parse the infobox for 159" do + j.info["ISSN"].should eq "1530-7131" + j.info["Print ISSN"].should eq "1531-2542" + j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue" + j.info["Open Access"].should eq "No" + end + + it "should parser summary" do + j.summary.should eq <<-EOT + Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. + EOT + end + + it "should parse publisher" do + j.publisher.should eq "Johns Hopkins University Press" + end +end diff --git a/src/fetch.cr b/src/fetch.cr index 004f9e0..6bc987a 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -98,7 +98,7 @@ module Muse::Dl puts "Downloaded #{chapter_id}" end - def self.get_info(url : String) : Muse::Dl::Thing | Nil + def self.get_info(url : String) match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url if match begin diff --git a/src/infoparser.cr b/src/infoparser.cr index c28252a..514fb91 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -50,9 +50,13 @@ module Muse::Dl myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip end + def self.journal_publisher(myhtml : Myhtml::Parser) + myhtml.css(".card_publisher a").map(&.inner_text).to_a[0].strip + end + def self.summary(myhtml : Myhtml::Parser) begin - return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip + return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip rescue e : Exception STDERR.puts "Could not fetch summary" return "NA" diff --git a/src/issue.cr b/src/issue.cr new file mode 100644 index 0000000..99e65a3 --- /dev/null +++ b/src/issue.cr @@ -0,0 +1,6 @@ +require "./thing.cr" + +module Muse::Dl + class Issue < Muse::Dl::Thing + end +end diff --git a/src/journal.cr b/src/journal.cr index 1f3323a..55c5eca 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -1,6 +1,20 @@ -require "./thing.cr" +require "./infoparser.cr" +require "myhtml" module Muse::Dl - class Journal < Muse::Dl::Thing + class Journal + getter :info, :summary, :publisher + @info = Hash(String, String).new + @summary : String + @publisher : String + + private getter :h + + def initialize(html) + @h = Myhtml::Parser.new html + @info = InfoParser.infobox(h) + @summary = InfoParser.summary(h) + @publisher = InfoParser.journal_publisher(h) + end end end From 4a358d0cb0d522822683772b736b259070496ab1 Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 00:48:36 +0530 Subject: [PATCH 03/19] Journal parser now parses all issues --- spec/journal_spec.cr | 5 +++++ src/issue.cr | 9 ++++++++- src/journal.cr | 17 +++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/spec/journal_spec.cr b/spec/journal_spec.cr index bee1b04..7e0e5ec 100644 --- a/spec/journal_spec.cr +++ b/spec/journal_spec.cr @@ -20,4 +20,9 @@ describe Muse::Dl::Journal do it "should parse publisher" do j.publisher.should eq "Johns Hopkins University Press" end + + it "should return issues" do + j.issues[0].id.should eq "41793" + j.issues[-1].id.should eq "1578" + end end diff --git a/src/issue.cr b/src/issue.cr index 99e65a3..8d51421 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -1,6 +1,13 @@ require "./thing.cr" module Muse::Dl - class Issue < Muse::Dl::Thing + class Issue + @id : String + + getter :id + + def initialize(id : String) + @id = id + end end end diff --git a/src/journal.cr b/src/journal.cr index 55c5eca..9c828f6 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -1,12 +1,13 @@ require "./infoparser.cr" -require "myhtml" +require "./issue.cr" module Muse::Dl class Journal - getter :info, :summary, :publisher + getter :info, :summary, :publisher, :issues @info = Hash(String, String).new @summary : String @publisher : String + @issues = [] of Muse::Dl::Issue private getter :h @@ -15,6 +16,18 @@ module Muse::Dl @info = InfoParser.infobox(h) @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) + parse_volumes(h) + end + + def parse_volumes(myhtml : Myhtml::Parser) + myhtml.css("#available_issues_list_text a").each do |a| + link = a.attribute_by("href").to_s + + matches = /\/issue\/(\d+)/.match link + if matches + @issues.push Muse::Dl::Issue.new matches[1] + end + end end end end From ff225b12c63632797fe716f3f85dc82c28d5f2e1 Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 00:55:59 +0530 Subject: [PATCH 04/19] Fix filenames with double-quotes --- spec/util_spec.cr | 9 +++++++++ src/util.cr | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 spec/util_spec.cr diff --git a/spec/util_spec.cr b/spec/util_spec.cr new file mode 100644 index 0000000..c808941 --- /dev/null +++ b/spec/util_spec.cr @@ -0,0 +1,9 @@ +require "../src/util" +require "./spec_helper" + +describe Muse::Dl::Util do + it "should sanitize filenames properly" do + fn = Muse::Dl::Util.slug_filename("Hello world - \" :A$3, a story; a poem|chapter") + fn.should eq "Hello world - - -A-3, a story- a poem-chapter" + end +end diff --git a/src/util.cr b/src/util.cr index 5ed414a..0ddd1ae 100644 --- a/src/util.cr +++ b/src/util.cr @@ -2,7 +2,7 @@ module Muse::Dl class Util # Generates a safe filename def self.slug_filename(input : String) - input.strip.tr("\u{202E}%$|:;/\t\r\n\\", "-") + input.strip.tr("\u{202E}%$|:;/\"\t\r\n\\", "-") end end end From f11f64b9d5e3506612c0741d91f96dd24ceceebc Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 01:48:48 +0530 Subject: [PATCH 05/19] Adds webmock --- shard.lock | 4 + shard.yml | 7 +- spec/fetch_spec.cr | 4 + spec/fixtures/chapter-2379787.html | 359 +++++++ spec/fixtures/issue-41793.html | 1603 ++++++++++++++++++++++++++++ spec/issue_spec.cr | 35 + src/article.cr | 7 + src/issue.cr | 21 +- 8 files changed, 2038 insertions(+), 2 deletions(-) create mode 100644 spec/fixtures/chapter-2379787.html create mode 100644 spec/fixtures/issue-41793.html create mode 100644 spec/issue_spec.cr create mode 100644 src/article.cr diff --git a/shard.lock b/shard.lock index 9984f5f..7a23ffd 100644 --- a/shard.lock +++ b/shard.lock @@ -12,3 +12,7 @@ shards: github: kostya/myhtml version: 1.5.1 + webmock: + github: manastech/webmock.cr + commit: 78bb0e3b5850c700da0e7fbdd2d6c180cc4a061b + diff --git a/shard.yml b/shard.yml index 37050c5..4e42cbc 100644 --- a/shard.yml +++ b/shard.yml @@ -15,4 +15,9 @@ dependencies: myhtml: github: kostya/myhtml crest: - github: mamantoha/crest \ No newline at end of file + github: mamantoha/crest + +development_dependencies: + webmock: + github: manastech/webmock.cr + branch: master \ No newline at end of file diff --git a/spec/fetch_spec.cr b/spec/fetch_spec.cr index 6e97dfd..c01ed8f 100644 --- a/spec/fetch_spec.cr +++ b/spec/fetch_spec.cr @@ -2,6 +2,10 @@ require "./spec_helper" # require "errors/muse_corrupt_pdf.cr" describe Muse::Dl::Book do + headers = {"Content-Type" => "text/html"} + WebMock.stub(:get, "https://muse.jhu.edu/chapter/2379787/pdf") + .to_return(body_io: File.new("spec/fixtures/chapter-2379787.html"), headers: headers) + it "should notice the unable to construct chapter PDF error" do f = "/tmp/chapter-2379787.pdf" File.delete(f) if File.exists? f diff --git a/spec/fixtures/chapter-2379787.html b/spec/fixtures/chapter-2379787.html new file mode 100644 index 0000000..2e5e3c2 --- /dev/null +++ b/spec/fixtures/chapter-2379787.html @@ -0,0 +1,359 @@ + + + + + + + + + + + + + + + + + Project MUSE + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + +
+ +
+ ErrorUnable to construct chapter PDF + +
+
+ + + + + + + + + + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + + diff --git a/spec/fixtures/issue-41793.html b/spec/fixtures/issue-41793.html new file mode 100644 index 0000000..753844a --- /dev/null +++ b/spec/fixtures/issue-41793.html @@ -0,0 +1,1603 @@ + + + + + + + + + + + + + + + Project MUSE - portal: Libraries and the Academy-Volume 20, Number 1, January 2020 + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + + + +
+
+ +
+ + + +
+ + + MUSE Alert +
+ + + + + +
+

In this Issue

+
+ +
+ + + +
+ + +
+
+
+

Table of Contents

+
+ + + +
+ +
+
+ + + +
+ + + + + +
+
+
    + +
  1. + Coming of Age: portal at 20 + +
  2. + + +
  3. + + Marianne Ryan + +
  4. + + + +
  5. pp. 1-5
  6. +
  7. DOI: 10.1353/pla.2020.0000
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Techniques to Imagine, Fund, and Build the Academic Library of Your Dreams + +
  2. + + +
  3. + + Janette S. Blackburn + +
  4. + + + +
  5. pp. 7-14
  6. +
  7. DOI: 10.1353/pla.2020.0001
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + The International Open Access Movement and Its Status in Pakistan + +
  2. + + +
  3. + + Arslan Sheikh + +
  4. + + + +
  5. pp. 15-31
  6. +
  7. DOI: 10.1353/pla.2020.0002
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Big Data and Academic Libraries: The Quest for Informed Decision-Making + +
  2. + + +
  3. + + Tiffini A. Travis, + + Christian Ramirez + +
  4. + + + +
  5. pp. 33-47
  6. +
  7. DOI: 10.1353/pla.2020.0003
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + From Service Role to Partnership: Faculty Voices on Collaboration with Librarians + +
  2. + + +
  3. + + Maria A. Perez-Stable, + + Judith M. Arnold, + + LuMarie F. Guth, + + Patricia Fravel Vander Meer + +
  4. + + + +
  5. pp. 49-72
  6. +
  7. DOI: 10.1353/pla.2020.0004
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Attitudes of North American Academics toward Open Access Scholarly Journals + +
  2. + + +
  3. + + Elizabeth D. Dalton, + + Carol Tenopir, + + Bo-Christer Björk + +
  4. + + + +
  5. pp. 73-100
  6. +
  7. DOI: 10.1353/pla.2020.0005
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + A Multi-Method Information Literacy Assessment Program: Foundation and Early Results + +
  2. + + +
  3. + + William H. Walters, + + Sarah E. Sheehan, + + Amy E. Handfield, + + Bernadette M. López-Fitzsimmons, + + Susanne Markgren, + + Laurin Paradise + +
  4. + + + +
  5. pp. 101-135
  6. +
  7. DOI: 10.1353/pla.2020.0006
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + The Problem with Grit: Dismantling Deficit Thinking in Library Instruction + +
  2. + + +
  3. + + Eamon Tewell + +
  4. + + + +
  5. pp. 137-159
  6. +
  7. DOI: 10.1353/pla.2020.0007
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Apprenticing Researchers: Exploring Upper-Division Students' Information Literacy Competencies + +
  2. + + +
  3. + + Sara L. Davidson Squibb, + + Anne Zanzucchi + +
  4. + + + +
  5. pp. 161-185
  6. +
  7. DOI: 10.1353/pla.2020.0008
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + E-Book Information Behaviors and Formats among Graduate Students in Information Sciences + +
  2. + + +
  3. + + Daniel G. Tracy + +
  4. + + + +
  5. pp. 187-220
  6. +
  7. DOI: 10.1353/pla.2020.0009
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + + + +
+
+ +
+
+ + + + + + + +
+ + + + + + + + +
+
+
+
+ +
+

Previous Issue

+
+

Volume 19, Number 4, October 2019

+ +
+
+ +
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 1530-7131 +
+
+ + +
+
+ Print ISSN +
+
+ 1531-2542 +
+
+ + + + + + + + +
+
+ Launched on MUSE +
+
+ 2020-02-05 +
+
+ + + + +
+
+ Open Access +
+
+ + No + +
+
+ + + + + + +
+
+
+ + +
+

Copyright

+
+ + + + + +
+ + + +
+

Additional Issue Materials

+
+ + +
+ + +
+
+ + + + +
+
+ + + + + + + + + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr new file mode 100644 index 0000000..c24ae41 --- /dev/null +++ b/spec/issue_spec.cr @@ -0,0 +1,35 @@ +require "../src/issue" +require "./spec_helper" +require "webmock" + +describe Muse::Dl::Issue do + WebMock.stub(:get, "https://muse.jhu.edu/issue/41793") + .to_return(body: File.new("spec/fixtures/issue-41793.html").gets_to_end) + + issue = Muse::Dl::Issue.new "41793" + issue.parse + + it "should initialize correctly" do + issue.id.should eq "41793" + issue.url.should eq "https://muse.jhu.edu/issue/41793" + end + + # it "should parse info correctly" do + # issue.info["ISSN"].should eq "1530-7131" + # issue.info["Print ISSN"].should eq "1531-2542" + # issue.info["Launched on MUSE"].should eq "2020-02-05" + # issue.info["Open Access"].should eq "No" + + # issue.title.should eq "Volume 20, Number 1, January 2020" + # end + + # it "should parser summary" do + # issue.summary.should eq <<-EOT + # Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. + # EOT + # end + + # it "should parse publisher" do + # issue.publisher.should eq "Johns Hopkins University Press" + # end +end diff --git a/src/article.cr b/src/article.cr new file mode 100644 index 0000000..1b94b30 --- /dev/null +++ b/src/article.cr @@ -0,0 +1,7 @@ +require "./infoparser.cr" +require "./issue.cr" + +module Muse::Dl + class Article + end +end diff --git a/src/issue.cr b/src/issue.cr index 8d51421..f04357e 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -1,13 +1,32 @@ require "./thing.cr" +require "./fetch.cr" +require "./article.cr" module Muse::Dl class Issue @id : String + @title : String | Nil + @articles : Array(Muse::Dl::Article) + @url : String + @info : Hash(String, String) | Nil + @summary : String | Nil + @publisher : String | Nil - getter :id + getter :id, :title, :articles, :url, :summary, :publisher, :info def initialize(id : String) @id = id + @url = "https://muse.jhu.edu/issue/#{id}" + @title = "NA" + @articles = [] of Muse::Dl::Article + end + + def parse + html = Crest.get(url).to_s + h = Myhtml::Parser.new html + @info = InfoParser.infobox(h) + @summary = InfoParser.summary(h) + @publisher = InfoParser.journal_publisher(h) end end end From 6b278531fd45f877c78eb0852c7f052982e1a546 Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 01:50:40 +0530 Subject: [PATCH 06/19] Infobox is parsing for an issue now --- spec/issue_spec.cr | 14 +++++++------- src/issue.cr | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index c24ae41..fe71e54 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -14,14 +14,14 @@ describe Muse::Dl::Issue do issue.url.should eq "https://muse.jhu.edu/issue/41793" end - # it "should parse info correctly" do - # issue.info["ISSN"].should eq "1530-7131" - # issue.info["Print ISSN"].should eq "1531-2542" - # issue.info["Launched on MUSE"].should eq "2020-02-05" - # issue.info["Open Access"].should eq "No" + it "should parse info correctly" do + issue.info["ISSN"].should eq "1530-7131" + issue.info["Print ISSN"].should eq "1531-2542" + issue.info["Launched on MUSE"].should eq "2020-02-05" + issue.info["Open Access"].should eq "No" - # issue.title.should eq "Volume 20, Number 1, January 2020" - # end + # issue.title.should eq "Volume 20, Number 1, January 2020" + end # it "should parser summary" do # issue.summary.should eq <<-EOT diff --git a/src/issue.cr b/src/issue.cr index f04357e..4c31ad6 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -8,7 +8,7 @@ module Muse::Dl @title : String | Nil @articles : Array(Muse::Dl::Article) @url : String - @info : Hash(String, String) | Nil + @info : Hash(String, String) @summary : String | Nil @publisher : String | Nil @@ -18,6 +18,7 @@ module Muse::Dl @id = id @url = "https://muse.jhu.edu/issue/#{id}" @title = "NA" + @info = Hash(String, String).new @articles = [] of Muse::Dl::Article end From 7b48731afe56044adb8c4f94fb743300913a31f5 Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 01:52:07 +0530 Subject: [PATCH 07/19] Parse title and publisher for issues --- spec/issue_spec.cr | 9 ++++----- src/infoparser.cr | 4 ++++ src/issue.cr | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index fe71e54..bd7d77e 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -19,8 +19,7 @@ describe Muse::Dl::Issue do issue.info["Print ISSN"].should eq "1531-2542" issue.info["Launched on MUSE"].should eq "2020-02-05" issue.info["Open Access"].should eq "No" - - # issue.title.should eq "Volume 20, Number 1, January 2020" + issue.title.should eq "Volume 20, Number 1, January 2020" end # it "should parser summary" do @@ -29,7 +28,7 @@ describe Muse::Dl::Issue do # EOT # end - # it "should parse publisher" do - # issue.publisher.should eq "Johns Hopkins University Press" - # end + it "should parse publisher" do + issue.publisher.should eq "Johns Hopkins University Press" + end end diff --git a/src/infoparser.cr b/src/infoparser.cr index 514fb91..836c9e3 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -34,6 +34,10 @@ module Muse::Dl myhtml.css("#book_about_info .title").map(&.inner_text).to_a[0].strip end + def self.issue_title(myhtml : Myhtml::Parser) + myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + end + def self.author(myhtml : Myhtml::Parser) myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("
", ", ").gsub("\n", " ") end diff --git a/src/issue.cr b/src/issue.cr index 4c31ad6..c8e004a 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -17,7 +17,6 @@ module Muse::Dl def initialize(id : String) @id = id @url = "https://muse.jhu.edu/issue/#{id}" - @title = "NA" @info = Hash(String, String).new @articles = [] of Muse::Dl::Article end @@ -26,6 +25,7 @@ module Muse::Dl html = Crest.get(url).to_s h = Myhtml::Parser.new html @info = InfoParser.infobox(h) + @title = InfoParser.issue_title(h) @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) end From 3e56efed52f5cfd2cbd0efd32ded9438c82358f2 Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 01:52:28 +0530 Subject: [PATCH 08/19] Parses summary for issueS --- spec/issue_spec.cr | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index bd7d77e..dd5ae47 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -22,11 +22,11 @@ describe Muse::Dl::Issue do issue.title.should eq "Volume 20, Number 1, January 2020" end - # it "should parser summary" do - # issue.summary.should eq <<-EOT - # Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. - # EOT - # end + it "should parser summary" do + issue.summary.should eq <<-EOT + Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. + EOT + end it "should parse publisher" do issue.publisher.should eq "Johns Hopkins University Press" From c01e0713289822dfed0cdb8546acf7b21cbc16c7 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 16 Jun 2020 19:13:52 +0530 Subject: [PATCH 09/19] [make] Adds tests to Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 98a984e..5dcf3dd 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ release: # Then extract the image | extract the layer.tar file (we only have one layer) | extract the muse-dl-static file docker image save muse-dl-static | tar xf - --wildcards "*/layer.tar" -O | tar xf - "muse-dl-static" # And move it to the bin/ directory - mv -f muse-dl-static bin/ \ No newline at end of file + mv -f muse-dl-static bin/ + +test: + crystal spec From aa392eaa640fc259c15d58298e8f36009dc006e4 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 16 Jun 2020 19:27:11 +0530 Subject: [PATCH 10/19] Adds support for parsing title to volume/number/date of a journal issue --- shard.lock | 6 +++--- spec/issue_spec.cr | 6 ++++++ src/issue.cr | 15 ++++++++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/shard.lock b/shard.lock index 7a23ffd..135a1dc 100644 --- a/shard.lock +++ b/shard.lock @@ -2,11 +2,11 @@ version: 1.0 shards: crest: github: mamantoha/crest - version: 0.24.1 + version: 0.25.1 http-client-digest_auth: github: mamantoha/http-client-digest_auth - version: 0.3.0 + version: 0.4.0 myhtml: github: kostya/myhtml @@ -14,5 +14,5 @@ shards: webmock: github: manastech/webmock.cr - commit: 78bb0e3b5850c700da0e7fbdd2d6c180cc4a061b + commit: bb3eab30f6c7d1fdc0a7ff14cd136d68e860d1a7 diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index dd5ae47..aa0d11d 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -22,6 +22,12 @@ describe Muse::Dl::Issue do issue.title.should eq "Volume 20, Number 1, January 2020" end + it "should parse title correctly" do + issue.volume.should eq "20" + issue.number.should eq "1" + issue.date.should eq "January 2020" + end + it "should parser summary" do issue.summary.should eq <<-EOT Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. diff --git a/src/issue.cr b/src/issue.cr index c8e004a..aad8b20 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -11,8 +11,11 @@ module Muse::Dl @info : Hash(String, String) @summary : String | Nil @publisher : String | Nil + @volume : String | Nil + @number : String | Nil + @date : String | Nil - getter :id, :title, :articles, :url, :summary, :publisher, :info + getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date def initialize(id : String) @id = id @@ -28,6 +31,16 @@ module Muse::Dl @title = InfoParser.issue_title(h) @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) + parse_title + end + + def parse_title + t = @title + unless t.nil? + @volume = /Volume (\d+)/.match(t).try &.[1] + @number = /Number (\d+)/.match(t).try &.[1] + @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] + end end end end From 04a2fe52ecb0054d991e402d0f3c82e3cd1646e8 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 14:08:28 +0530 Subject: [PATCH 11/19] Minor fixes, parse contents for issues --- spec/fetch_spec.cr | 1 + src/fetch.cr | 1 - src/issue.cr | 14 ++++++++++++++ src/muse-dl.cr | 2 +- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spec/fetch_spec.cr b/spec/fetch_spec.cr index c01ed8f..abe1a72 100644 --- a/spec/fetch_spec.cr +++ b/spec/fetch_spec.cr @@ -1,4 +1,5 @@ require "./spec_helper" +require "webmock" # require "errors/muse_corrupt_pdf.cr" describe Muse::Dl::Book do diff --git a/src/fetch.cr b/src/fetch.cr index 6bc987a..3f581d8 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -63,7 +63,6 @@ module Muse::Dl content_type = response.headers["Content-Type"] if content_type.is_a? String if /html/.match content_type - puts response response.body.each_line do |line| # https://muse.jhu.edu/chapter/2383438/pdf # https://muse.jhu.edu/book/67393 diff --git a/src/issue.cr b/src/issue.cr index aad8b20..2e826f7 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -14,6 +14,7 @@ module Muse::Dl @volume : String | Nil @number : String | Nil @date : String | Nil + @issues : Array(Muse::Dl::Issue) getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date @@ -22,6 +23,7 @@ module Muse::Dl @url = "https://muse.jhu.edu/issue/#{id}" @info = Hash(String, String).new @articles = [] of Muse::Dl::Article + @issues = [] of Muse::Dl::Issue end def parse @@ -32,6 +34,7 @@ module Muse::Dl @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) parse_title + parse_contents(h) end def parse_title @@ -42,5 +45,16 @@ module Muse::Dl @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] end end + + def parse_contents(myhtml : Myhtml::Parser) + myhtml.css("#available_issues_list_text a").each do |a| + link = a.attribute_by("href").to_s + + matches = /\/issue\/(\d+)/.match link + if matches + @issues.push Muse::Dl::Issue.new matches[1] + end + end + end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 9b71f65..28368fc 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -51,7 +51,7 @@ module Muse::Dl end temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files From f04e9b799ed0e7b85866830397cde0f87ef3d125 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 14:18:16 +0530 Subject: [PATCH 12/19] Removes input_pdf and initial work on article download --- shard.lock | 12 ++++++------ src/article.cr | 6 ++++++ src/fetch.cr | 6 +++++- src/muse-dl.cr | 31 ++++++++++++++----------------- src/parser.cr | 6 +----- 5 files changed, 32 insertions(+), 29 deletions(-) diff --git a/shard.lock b/shard.lock index 135a1dc..80ce5b8 100644 --- a/shard.lock +++ b/shard.lock @@ -1,18 +1,18 @@ -version: 1.0 +version: 2.0 shards: crest: - github: mamantoha/crest + git: https://github.com/mamantoha/crest.git version: 0.25.1 http-client-digest_auth: - github: mamantoha/http-client-digest_auth + git: https://github.com/mamantoha/http-client-digest_auth.git version: 0.4.0 myhtml: - github: kostya/myhtml + git: https://github.com/kostya/myhtml.git version: 1.5.1 webmock: - github: manastech/webmock.cr - commit: bb3eab30f6c7d1fdc0a7ff14cd136d68e860d1a7 + git: https://github.com/manastech/webmock.cr.git + version: 0.13.0+git.commit.bb3eab30f6c7d1fdc0a7ff14cd136d68e860d1a7 diff --git a/src/article.cr b/src/article.cr index 1b94b30..8da80b6 100644 --- a/src/article.cr +++ b/src/article.cr @@ -3,5 +3,11 @@ require "./issue.cr" module Muse::Dl class Article + @id : String + + def initialize(id : String) + @id = id + @url = "https://muse.jhu.edu/article/#{id}" + end end end diff --git a/src/fetch.cr b/src/fetch.cr index 3f581d8..acb96ab 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -98,7 +98,7 @@ module Muse::Dl end def self.get_info(url : String) - match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url + match = /https:\/\/muse.jhu.edu\/(book|journal|issue|article)\/(\d+)/.match url if match begin response = Crest.get(url).to_s @@ -107,6 +107,10 @@ module Muse::Dl return Muse::Dl::Book.new response when "journal" return Muse::Dl::Journal.new response + when "issue" + return Muse::Dl::Issue.new response + when "article" + return Muse::Dl::Article.new match[2] end rescue ex : Crest::NotFound raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}") diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 28368fc..a0534d8 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -30,25 +30,20 @@ module Muse::Dl temp_stitched_file = nil pdf_builder = Pdftk.new(parser.tmp) - unless parser.input_pdf - # Save each chapter - thing.chapters.each do |chapter| - begin - Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) - rescue e : Muse::Dl::Errors::MuseCorruptPDF - STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" - return - end + # Save each chapter + thing.chapters.each do |chapter| + begin + Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return end - chapter_ids = thing.chapters.map { |c| c[0] } - - # Stitch the PDFs together - temp_stitched_file = pdf_builder.stitch chapter_ids - pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) - else - x = parser.input_pdf - pdf_builder.add_metadata(File.open(x), parser.output, thing) if x end + chapter_ids = thing.chapters.map { |c| c[0] } + + # Stitch the PDFs together + temp_stitched_file = pdf_builder.stitch chapter_ids + pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) temp_stitched_file.delete if temp_stitched_file puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first @@ -60,6 +55,8 @@ module Muse::Dl Fetch.cleanup(parser.tmp, c[0]) end end + elsif thing.is_a? Muse::Dl::Article + puts(thing) end end diff --git a/src/parser.cr b/src/parser.cr index 658b23b..ebfd2d7 100644 --- a/src/parser.cr +++ b/src/parser.cr @@ -10,7 +10,6 @@ module Muse::Dl @strip_first = true @output = DEFAULT_FILE_NAME @url : String | Nil - @input_pdf : String | Nil @clobber = false @input_list : String | Nil @cookie : String | Nil @@ -18,7 +17,7 @@ module Muse::Dl DEFAULT_FILE_NAME = "tempfilename.pdf" - getter :bookmarks, :tmp, :cleanup, :output, :url, :input_pdf, :clobber, :input_list, :cookie, :strip_first + getter :bookmarks, :tmp, :cleanup, :output, :url, :clobber, :input_list, :cookie, :strip_first setter :url # Update the output filename unless we have a custom one passed @@ -41,7 +40,6 @@ module Muse::Dl def initialize(arg : Array(String) = [] of String) @tmp = Dir.tempdir - @input_pdf = nil parser = OptionParser.new parser.banner = <<-EOT @@ -56,7 +54,6 @@ module Muse::Dl parser.on(long_flag = "--tmp-dir PATH", description = "Temporary Directory to use") { |path| @tmp = path } parser.on(long_flag = "--output FILE", description = "Output Filename") { |file| @output = file } parser.on(long_flag = "--no-bookmarks", description = "Don't add bookmarks in the PDF") { @bookmarks = false } - parser.on(long_flag = "--input-pdf INPUT", description = "Input Stitched PDF. Will not download anything") { |input| @input_pdf = input } parser.on(long_flag = "--clobber", description = "Overwrite the output file, if it already exists. Not compatible with input-pdf") { @clobber = true } parser.on(long_flag = "--dont-strip-first-page", description = "Disables first page from being stripped. Use carefully") { @strip_first = false } parser.on(long_flag = "--cookie COOKIE", description = "Cookie-header") { |cookie| @cookie = cookie } @@ -70,7 +67,6 @@ module Muse::Dl end if File.exists? args[0] @input_list = args[0] - @input_pdf = nil else @url = args[0] end From 870ed3080d7208fe251faac5d7a0985b07f1f3d5 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 14:47:51 +0530 Subject: [PATCH 13/19] Modular code in fetch to support both chapters and articles --- src/article.cr | 1 + src/fetch.cr | 50 +++++++++++++++++++++++++++++++++++++------------- src/muse-dl.cr | 16 +++++++++++++++- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/article.cr b/src/article.cr index 8da80b6..2d6e6a4 100644 --- a/src/article.cr +++ b/src/article.cr @@ -4,6 +4,7 @@ require "./issue.cr" module Muse::Dl class Article @id : String + getter :id def initialize(id : String) @id = id diff --git a/src/fetch.cr b/src/fetch.cr index acb96ab..ef15d29 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -14,6 +14,10 @@ module Muse::Dl "Connection" => "keep-alive", } + def self.article_file_name(id : String, tmp_path : String) + "#{tmp_path}/article-#{id}.pdf" + end + def self.chapter_file_name(id : String, tmp_path : String) "#{tmp_path}/chapter-#{id}.pdf" end @@ -23,24 +27,20 @@ module Muse::Dl File.delete(fns) if File.exists?(fns) end - def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) - final_pdf_file = chapter_file_name chapter_id, tmp_path - tmp_pdf_file = "#{final_pdf_file}.tmp" - - if File.exists? final_pdf_file - puts "#{chapter_id} already downloaded" + def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) + tmp_pdf_file = "#{file_name}.tmp" + if File.exists? file_name + puts "#{file_name} already downloaded" return end - # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class - url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" uri = URI.parse(url) http_client = HTTP::Client.new(uri) # Raise a IO::TimeoutError after 60 seconds. http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS headers = HEADERS.merge({ - "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", + "Referer" => referer, }) if cookie @@ -52,7 +52,7 @@ module Muse::Dl begin response = request.execute rescue ex : IO::TimeoutError - raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") + raise Muse::Dl::Errors::DownloadError.new("Error downloading #{url}. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") end # TODO: Add validation for the downloaded file (should be PDF) @@ -76,6 +76,7 @@ module Muse::Dl end end end + File.open(tmp_pdf_file, "w") do |file| file << response.body if file.size == 0 @@ -87,16 +88,39 @@ module Muse::Dl pdftk.strip_first_page tmp_pdf_file if strip_first_page - if add_bookmark + if bookmark_title # Run pdftk and add the bookmark to the file - pdftk.add_bookmark tmp_pdf_file, chapter_title.strip + pdftk.add_bookmark tmp_pdf_file, bookmark_title end # Now we can move the file to the proper PDF filename - File.rename tmp_pdf_file, final_pdf_file + File.rename tmp_pdf_file, file_name + end + + def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) + final_pdf_file = chapter_file_name chapter_id, tmp_path + + if File.exists? final_pdf_file + puts "#{chapter_id} already downloaded" + return + end + + # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class + url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" + referer = "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf" + + save_url(url, referer, final_pdf_file, tmp_path, cookie, chapter_title, strip_first_page) + puts "Downloaded #{chapter_id}" end + def self.save_article(tmp_path : String, article_id : String, cookie : String | Nil = nil, article_title = nil, strip_first_page = true) + file_name = article_file_name article_id, tmp_path + url = "https://muse.jhu.edu/article/#{article_id}/pdf" + referer = "https://muse.jhu.edu/article/#{article_id}" + save_url(url, referer, file_name, tmp_path, cookie, article_title, strip_first_page) + end + def self.get_info(url : String) match = /https:\/\/muse.jhu.edu\/(book|journal|issue|article)\/(\d+)/.match url if match diff --git a/src/muse-dl.cr b/src/muse-dl.cr index a0534d8..26fd572 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -4,6 +4,7 @@ require "./fetch.cr" require "./book.cr" require "./journal.cr" require "./util.cr" +require "file_utils" module Muse::Dl VERSION = "1.1.2" @@ -56,7 +57,20 @@ module Muse::Dl end end elsif thing.is_a? Muse::Dl::Article - puts(thing) + # No bookmarks are needed since this is just a single article PDF + begin + Fetch.save_article(parser.tmp, thing.id, parser.cookie, nil, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return + end + + # TODO: Move this code elsewhere + source = Fetch.article_file_name(thing.id, parser.tmp) + destination = "article-#{thing.id}.pdf" + # Needed because of https://github.com/crystal-lang/crystal/issues/7777 + FileUtils.cp source, destination + FileUtils.rm source if parser.cleanup end end From 919c8ac43f3f41ccf0b54c8e5c566c7675739641 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 15:19:12 +0530 Subject: [PATCH 14/19] Fixes parser for issue HTML This also adds .journal_title as an attribute to the Issue object --- spec/fixtures/issue-35852.html | 1263 ++++++++++++++++++++++++++++++++ spec/issue_spec.cr | 21 + src/article.cr | 4 +- src/issue.cr | 56 +- src/muse-dl.cr | 38 + 5 files changed, 1361 insertions(+), 21 deletions(-) create mode 100644 spec/fixtures/issue-35852.html diff --git a/spec/fixtures/issue-35852.html b/spec/fixtures/issue-35852.html new file mode 100644 index 0000000..7e50f60 --- /dev/null +++ b/spec/fixtures/issue-35852.html @@ -0,0 +1,1263 @@ + + + + + + + + + + + + + + + Project MUSE - Constitutional Studies-Volume 1, Issue 2, 2016 + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + + + +
+
+ +
+ + + +
+ + + MUSE Alert +
+ + + + + +
+

In this Issue

+
+ +
+ + + +
+ + +
+
+
+

Table of Contents

+
+ + + +
+ +
+
+ + + +
+ + + + + +
+
+
    + +
  1. + The Limits of Veneration: Public Support for a New Constitutional Convention + +
  2. + + +
  3. + + William D. Blake, + + Sanford V. Levinson + +
  4. + + + +
  5. pp. 1-22
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Secession and Nullification as a Global Trend + +
  2. + + +
  3. + + Ran Hirschl + +
  4. + + + +
  5. pp. 23-40
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Challenging Constitutionalism in Post-Apartheid South Africa + +
  2. + + +
  3. + + Heinz Klug + +
  4. + + + +
  5. pp. 41-58
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Democracy by Lawsuit: Or, Can Litigation Alleviate the European Union’s “Democratic Deficit?” + +
  2. + + +
  3. + + Tommaso Pavone + +
  4. + + + +
  5. pp. 59-80
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Private Enforcement of Constitutional Guarantees in the Ku Klux Act of 1871 + +
  2. + + +
  3. + + Paul J. Gardner + +
  4. + + + +
  5. pp. 81-95
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Sober Second Thoughts: Evaluating the History of Horizontal Judicial Review by the U.S. Supreme Court + +
  2. + + +
  3. + + Keith E. Whittington + +
  4. + + + +
  5. pp. 97-116
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + + + +
+
+ +
+
+ + + + + + + +
+ + + + + + + + +
+
+
+
+ +
+

Previous Issue

+
+

Volume 1, Issue 1, 2016

+ +
+
+ +
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 2474-9419 +
+
+ + +
+
+ Print ISSN +
+
+ 2474-9427 +
+
+ + + + + + + + +
+
+ Launched on MUSE +
+
+ 2017-02-21 +
+
+ + + + +
+
+ Open Access +
+
+ + Yes + +
+
+ + + + + + +
+
+
+ + +
+

Copyright

+
+ + + + + +
+ + + +
+

Additional Issue Materials

+
+ + +
+ + +
+
+ + + + +
+
+ + + + + + + + + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + + diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index aa0d11d..2809dd1 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -37,4 +37,25 @@ describe Muse::Dl::Issue do it "should parse publisher" do issue.publisher.should eq "Johns Hopkins University Press" end + it "should parse the journal title" do + issue.journal_title.should eq "portal: Libraries and the Academy" + end + + it "should parse non-numbered issues" do + WebMock.stub(:get, "https://muse.jhu.edu/issue/35852") + .to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end) + issue = Muse::Dl::Issue.new "35852" + issue.parse + + issue.volume.should eq "1" + issue.number.should eq "2" + issue.date.should eq "2016" + + issue.info["ISSN"].should eq "2474-9419" + issue.info["Print ISSN"].should eq "2474-9427" + issue.info["Launched on MUSE"].should eq "2017-02-21" + issue.info["Open Access"].should eq "Yes" + issue.title.should eq "Volume 1, Issue 2, 2016" + issue.journal_title.should eq "Constitutional Studies" + end end diff --git a/src/article.cr b/src/article.cr index 2d6e6a4..ef9377f 100644 --- a/src/article.cr +++ b/src/article.cr @@ -3,8 +3,8 @@ require "./issue.cr" module Muse::Dl class Article - @id : String - getter :id + getter id : String + setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil def initialize(id : String) @id = id diff --git a/src/issue.cr b/src/issue.cr index 2e826f7..4a0c97a 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -1,29 +1,26 @@ -require "./thing.cr" +"./thing.cr" require "./fetch.cr" require "./article.cr" module Muse::Dl class Issue - @id : String - @title : String | Nil - @articles : Array(Muse::Dl::Article) - @url : String - @info : Hash(String, String) - @summary : String | Nil - @publisher : String | Nil - @volume : String | Nil - @number : String | Nil - @date : String | Nil - @issues : Array(Muse::Dl::Issue) - - getter :id, :title, :articles, :url, :summary, :publisher, :info, :volume, :number, :date + getter id : String, + title : String | Nil, + articles : Array(Muse::Dl::Article), + url : String, + summary : String | Nil, + publisher : String | Nil, + info : Hash(String, String), + volume : String | Nil, + number : String | Nil, + date : String | Nil, + journal_title : String | Nil def initialize(id : String) @id = id @url = "https://muse.jhu.edu/issue/#{id}" @info = Hash(String, String).new @articles = [] of Muse::Dl::Article - @issues = [] of Muse::Dl::Issue end def parse @@ -42,17 +39,38 @@ module Muse::Dl unless t.nil? @volume = /Volume (\d+)/.match(t).try &.[1] @number = /Number (\d+)/.match(t).try &.[1] + @number = /Issue (\d+)/.match(t).try &.[1] unless @number @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] + @date = /(\d{4})/.match(t).try &.[1] unless @date end end def parse_contents(myhtml : Myhtml::Parser) - myhtml.css("#available_issues_list_text a").each do |a| - link = a.attribute_by("href").to_s + journal_title_a = myhtml.css("#journal_banner_title a").first + if journal_title_a + @journal_title = journal_title_a.inner_text + end + myhtml.css(".articles_list_text ol").each do |ol| + link = ol.css("li.title a").first + title = link.inner_text - matches = /\/issue\/(\d+)/.match link + pages = ol.css("li.pg").first.try &.inner_text + matches = /(\d+)-(\d+)/.match pages if matches - @issues.push Muse::Dl::Issue.new matches[1] + start_page = matches[1].to_i + end_page = matches[2].to_i + end + + ol.css("a").each do |l| + url = l.attribute_by("href").to_s + matches = /\/article\/(\d+)\/pdf/.match url + if matches + a = Muse::Dl::Article.new matches[1] + a.title = title + a.start_page = start_page if start_page + a.end_page = end_page if end_page + @articles.push a + end end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 26fd572..656dab1 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -71,6 +71,44 @@ module Muse::Dl # Needed because of https://github.com/crystal-lang/crystal/issues/7777 FileUtils.cp source, destination FileUtils.rm source if parser.cleanup + elsif thing.is_a? Muse::Dl::Issue + # Will have no effect if parser has a custom title + parser.output = Util.slug_filename "#{thing.title}.pdf" + + # If file exists and we can't clobber + if File.exists?(parser.output) && parser.clobber == false + STDERR.puts "Skipping #{url}, File already exists: #{parser.output}" + return + end + temp_stitched_file = nil + pdf_builder = Pdftk.new(parser.tmp) + + # ## TODO till 111 + thing.issues.each do |issue| + begin + Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return + end + end + chapter_ids = thing.chapters.map { |c| c[0] } + + # Stitch the PDFs together + temp_stitched_file = pdf_builder.stitch chapter_ids + pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) + + temp_stitched_file.delete if temp_stitched_file + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first + puts "DL: #{url}. Saved final output to #{parser.output}" + + # Cleanup the chapter files + if parser.cleanup + thing.chapters.each do |c| + Fetch.cleanup(parser.tmp, c[0]) + end + end + #### end end From 38db0dd000fd62b77be6fdc0b66014bf3e238ebd Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 16:50:49 +0530 Subject: [PATCH 15/19] Adds tests for page detection --- spec/issue_spec.cr | 24 ++++++++++++++++++++++++ src/article.cr | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr index 2809dd1..5a42f69 100644 --- a/spec/issue_spec.cr +++ b/spec/issue_spec.cr @@ -57,5 +57,29 @@ describe Muse::Dl::Issue do issue.info["Open Access"].should eq "Yes" issue.title.should eq "Volume 1, Issue 2, 2016" issue.journal_title.should eq "Constitutional Studies" + + expected_pages = [ + [1, 22], + [23, 40], + [41, 58], + [59, 80], + [81, 95], + [97, 116], + ] + + expected_titles = [ + "The Limits of Veneration: Public Support for a New Constitutional Convention", + "Secession and Nullification as a Global Trend", + "Challenging Constitutionalism in Post-Apartheid South Africa", + "Democracy by Lawsuit: Or, Can Litigation Alleviate the European Union’s “Democratic Deficit?”", + "Private Enforcement of Constitutional Guarantees in the Ku Klux Act of 1871", + "Sober Second Thoughts: Evaluating the History of Horizontal Judicial Review by the U.S. Supreme Court", + ] + + issue.articles.each_with_index do |a, i| + a.start_page.should eq expected_pages[i][0] + a.end_page.should eq expected_pages[i][1] + a.title.should eq expected_titles[i] + end end end diff --git a/src/article.cr b/src/article.cr index ef9377f..64a7c39 100644 --- a/src/article.cr +++ b/src/article.cr @@ -3,7 +3,7 @@ require "./issue.cr" module Muse::Dl class Article - getter id : String + getter id : String, :start_page, :end_page, :title setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil def initialize(id : String) From 62e6a21c84695786e64f7aa8ab51866e2e5c99a7 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 17:36:44 +0530 Subject: [PATCH 16/19] Finishes support for downloading complete issues --- src/errors/missing_chapter.cr | 4 -- src/errors/missing_file.cr | 4 ++ src/fetch.cr | 2 +- src/infoparser.cr | 6 ++- src/issue.cr | 8 ++-- src/muse-dl.cr | 28 ++++++------ src/pdftk.cr | 86 +++++++++++++++++++++++++++++++---- 7 files changed, 107 insertions(+), 31 deletions(-) delete mode 100644 src/errors/missing_chapter.cr create mode 100644 src/errors/missing_file.cr diff --git a/src/errors/missing_chapter.cr b/src/errors/missing_chapter.cr deleted file mode 100644 index b33487f..0000000 --- a/src/errors/missing_chapter.cr +++ /dev/null @@ -1,4 +0,0 @@ -module Muse::Dl::Errors - class MissingChapter < Exception - end -end diff --git a/src/errors/missing_file.cr b/src/errors/missing_file.cr new file mode 100644 index 0000000..f11f6fd --- /dev/null +++ b/src/errors/missing_file.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class MissingFile < Exception + end +end diff --git a/src/fetch.cr b/src/fetch.cr index ef15d29..963c894 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -132,7 +132,7 @@ module Muse::Dl when "journal" return Muse::Dl::Journal.new response when "issue" - return Muse::Dl::Issue.new response + return Muse::Dl::Issue.new match[2], response when "article" return Muse::Dl::Article.new match[2] end diff --git a/src/infoparser.cr b/src/infoparser.cr index 836c9e3..8f10d28 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -35,7 +35,11 @@ module Muse::Dl end def self.issue_title(myhtml : Myhtml::Parser) - myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + begin + myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + rescue + nil + end end def self.author(myhtml : Myhtml::Parser) diff --git a/src/issue.cr b/src/issue.cr index 4a0c97a..22c8dae 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -16,15 +16,15 @@ module Muse::Dl date : String | Nil, journal_title : String | Nil - def initialize(id : String) + def initialize(id : String, response : String | Nil = nil) @id = id @url = "https://muse.jhu.edu/issue/#{id}" - @info = Hash(String, String).new @articles = [] of Muse::Dl::Article + parse(response) if response + @info = Hash(String, String).new end - def parse - html = Crest.get(url).to_s + def parse(html : String) h = Myhtml::Parser.new html @info = InfoParser.infobox(h) @title = InfoParser.issue_title(h) diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 656dab1..07e6f25 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -47,7 +47,7 @@ module Muse::Dl pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files @@ -73,7 +73,7 @@ module Muse::Dl FileUtils.rm source if parser.cleanup elsif thing.is_a? Muse::Dl::Issue # Will have no effect if parser has a custom title - parser.output = Util.slug_filename "#{thing.title}.pdf" + parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" # If file exists and we can't clobber if File.exists?(parser.output) && parser.clobber == false @@ -84,30 +84,32 @@ module Muse::Dl pdf_builder = Pdftk.new(parser.tmp) # ## TODO till 111 - thing.issues.each do |issue| + thing.articles.each do |article| begin - Fetch.save_issue(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) + Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) rescue e : Muse::Dl::Errors::MuseCorruptPDF STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" return end end - chapter_ids = thing.chapters.map { |c| c[0] } + article_ids = thing.articles.map { |a| a.id } # Stitch the PDFs together - temp_stitched_file = pdf_builder.stitch chapter_ids + temp_stitched_file = pdf_builder.stitch_articles article_ids + # TODO: Add metadata for each Issue pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) - temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." if parser.strip_first + # temp_stitched_file.delete if temp_stitched_file + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files - if parser.cleanup - thing.chapters.each do |c| - Fetch.cleanup(parser.tmp, c[0]) - end - end + # TODO + # if parser.cleanup + # thing.articles.each do |c| + # Fetch.cleanup(parser.tmp, c[0]) + # end + # end #### end end diff --git a/src/pdftk.cr b/src/pdftk.cr index 4b5a01c..41ebcf0 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -70,7 +70,6 @@ module Muse::Dl def add_metadata(input_file : File, output_file : String, book : Book) # First we have to dump the current metadata - metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") keywords = "Publisher:#{book.publisher}, Published:#{book.date}" # Known Info keys, if they are present @@ -80,7 +79,12 @@ module Muse::Dl end end - text = <<-EOT + metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author) + write_metadata(input_file, output_file, metadata_text) + end + + def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil) + metadata = <<-EOT InfoBegin InfoKey: Creator InfoValue: @@ -89,25 +93,37 @@ module Muse::Dl InfoValue: InfoBegin InfoKey: Title - InfoValue: #{book.title} + InfoValue: #{title} InfoBegin InfoKey: Keywords InfoValue: #{keywords} InfoBegin - InfoKey: Author - InfoValue: #{book.author} - InfoBegin InfoKey: Subject - InfoValue: #{book.summary.gsub(/\n\s+/, " ")} + InfoValue: #{subject} InfoBegin InfoKey: ModDate InfoValue: InfoBegin InfoKey: CreationDate InfoValue: + EOT + unless author.nil? + metadata += <<-EOT + InfoBegin + InfoKey: Author + InfoValue: #{author} + EOT + end + + return metadata + end + + def write_metadata(input_file : File, output_file : String, text) + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") File.write(metadata_text_file.path, text) + is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] if !is_success raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.") @@ -115,11 +131,42 @@ module Muse::Dl metadata_text_file.delete end + def add_metadata(input_file : File, output_file : String, issue : Issue) + # First we have to dump the current metadata + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") + keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}" + ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label| + if issue.info.has_key? label + keywords += ", #{label}:#{issue.info[label]}" + end + end + + # TODO: Move this to Issue class + + s = issue.summary + unless s.nil? + summary = s.gsub(/\n\s+/, " ") + else + summary = "NA" + end + + t = issue.title + + unless t.nil? + title = t + else + title = "NA" + end + # TODO: Add support for all authors in the PDF + metadata = gen_metadata(title, keywords, summary) + write_metadata(input_file, output_file, metadata) + end + def stitch(chapter_ids : Array(String)) output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") # Do some sanity checks on each Chapter PDF chapter_ids.each do |id| - raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0 end @@ -136,5 +183,28 @@ module Muse::Dl return output_file end + + # TODO: Merge with stitch + def stitch_articles(article_ids : Array(String)) + output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") + # Do some sanity checks on each Chapter PDF + article_ids.each do |id| + raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.article_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.article_file_name(id, @tmp_file_path)) > 0 + end + + # Now let's stitch them together + article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) } + args = article_files + ["cat", "output", output_file.path] + is_success = execute args + + # TODO: Validate final file here + if !is_success + puts args + raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.") + end + + return output_file + end end end From 3a2d45fb6ee2f727e5f839f1323bbff53a81691f Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 17:59:56 +0530 Subject: [PATCH 17/19] Adds a skip-open-access flag --- src/article.cr | 5 +++++ src/issue.cr | 7 +++++++ src/journal.cr | 7 +++++++ src/muse-dl.cr | 5 +++++ src/parser.cr | 4 +++- src/thing.cr | 7 +++++++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/article.cr b/src/article.cr index 64a7c39..bf6ac1c 100644 --- a/src/article.cr +++ b/src/article.cr @@ -10,5 +10,10 @@ module Muse::Dl @id = id @url = "https://muse.jhu.edu/article/#{id}" end + + # TODO: Fix this + def open_access + return false + end end end diff --git a/src/issue.cr b/src/issue.cr index 22c8dae..8f759d2 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -24,6 +24,13 @@ module Muse::Dl @info = Hash(String, String).new end + def open_access + if @info.has_key? "Open Access" + return @info["Open Access"] == "Yes" + end + false + end + def parse(html : String) h = Myhtml::Parser.new html @info = InfoParser.infobox(h) diff --git a/src/journal.cr b/src/journal.cr index 9c828f6..45b6214 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -19,6 +19,13 @@ module Muse::Dl parse_volumes(h) end + def open_access + if @info.has_key? "Open Access" + return @info["Open Access"] == "Yes" + end + false + end + def parse_volumes(myhtml : Myhtml::Parser) myhtml.css("#available_issues_list_text a").each do |a| link = a.attribute_by("href").to_s diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 07e6f25..47095ab 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -15,6 +15,11 @@ module Muse::Dl thing = Fetch.get_info(url) if url return unless thing + if (thing.open_access) && (parser.skip_oa) + STDERR.puts "Skipping #{url}, available under Open Access" + return + end + if thing.is_a? Muse::Dl::Book unless thing.formats.includes? :pdf STDERR.puts "Book not available in PDF format, skipping: #{url}" diff --git a/src/parser.cr b/src/parser.cr index ebfd2d7..a98d27e 100644 --- a/src/parser.cr +++ b/src/parser.cr @@ -14,10 +14,11 @@ module Muse::Dl @input_list : String | Nil @cookie : String | Nil @h : Bool | Nil + @skip_oa = false DEFAULT_FILE_NAME = "tempfilename.pdf" - getter :bookmarks, :tmp, :cleanup, :output, :url, :clobber, :input_list, :cookie, :strip_first + getter :bookmarks, :tmp, :cleanup, :output, :url, :clobber, :input_list, :cookie, :strip_first, :skip_oa setter :url # Update the output filename unless we have a custom one passed @@ -57,6 +58,7 @@ module Muse::Dl parser.on(long_flag = "--clobber", description = "Overwrite the output file, if it already exists. Not compatible with input-pdf") { @clobber = true } parser.on(long_flag = "--dont-strip-first-page", description = "Disables first page from being stripped. Use carefully") { @strip_first = false } parser.on(long_flag = "--cookie COOKIE", description = "Cookie-header") { |cookie| @cookie = cookie } + parser.on(long_flag = "--skip-open-access", description = "Don't download open access content") { @skip_oa = true } parser.on("-h", "--help", "Show this help") { @h = true; puts parser } parser.unknown_args do |args| diff --git a/src/thing.cr b/src/thing.cr index 96a105a..2b0e3fd 100644 --- a/src/thing.cr +++ b/src/thing.cr @@ -19,6 +19,13 @@ module Muse::Dl private getter :h + def open_access + if @info.has_key? "Open Access" + return @info["Open Access"] == "Yes" + end + false + end + def initialize(html : String) @h = Myhtml::Parser.new html @info = InfoParser.infobox(h) From 03fccde754606a5d520e9bba6d215c7450898cd4 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 30 Jun 2020 18:36:01 +0530 Subject: [PATCH 18/19] Adds support for final journal downloads --- src/fetch.cr | 5 +++++ src/infoparser.cr | 4 ++++ src/issue.cr | 30 +++++++++++++++++++++--------- src/journal.cr | 8 ++++++-- src/muse-dl.cr | 26 +++++++++++++++----------- src/parser.cr | 4 ++++ 6 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/fetch.cr b/src/fetch.cr index 963c894..213bdc5 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -27,6 +27,11 @@ module Muse::Dl File.delete(fns) if File.exists?(fns) end + def self.cleanup_articles(tmp_path : String, id : String) + fns = article_file_name(id, tmp_path) + File.delete(fns) if File.exists?(fns) + end + def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) tmp_pdf_file = "#{file_name}.tmp" if File.exists? file_name diff --git a/src/infoparser.cr b/src/infoparser.cr index 8f10d28..7e331db 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -42,6 +42,10 @@ module Muse::Dl end end + def self.journal_title(myhtml : Myhtml::Parser) + myhtml.css("#journal_about_info .title").map(&.inner_text).to_a[0].strip + end + def self.author(myhtml : Myhtml::Parser) myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("
", ", ").gsub("\n", " ") end diff --git a/src/issue.cr b/src/issue.cr index 8f759d2..73a83fb 100644 --- a/src/issue.cr +++ b/src/issue.cr @@ -16,6 +16,8 @@ module Muse::Dl date : String | Nil, journal_title : String | Nil + setter :journal_title + def initialize(id : String, response : String | Nil = nil) @id = id @url = "https://muse.jhu.edu/issue/#{id}" @@ -31,6 +33,11 @@ module Muse::Dl false end + def parse + html = Crest.get(@url).to_s + parse(html) + end + def parse(html : String) h = Myhtml::Parser.new html @info = InfoParser.infobox(h) @@ -47,25 +54,30 @@ module Muse::Dl @volume = /Volume (\d+)/.match(t).try &.[1] @number = /Number (\d+)/.match(t).try &.[1] @number = /Issue (\d+)/.match(t).try &.[1] unless @number - @date = /((January|February|March|April|May|June|July|August|September|October|November|December) (\d+))/.match(t).try &.[1] + @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Sring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1] @date = /(\d{4})/.match(t).try &.[1] unless @date end end def parse_contents(myhtml : Myhtml::Parser) - journal_title_a = myhtml.css("#journal_banner_title a").first - if journal_title_a - @journal_title = journal_title_a.inner_text + unless @journal_title + journal_title_a = myhtml.css("#journal_banner_title a").first + if journal_title_a + @journal_title = journal_title_a.inner_text + end end myhtml.css(".articles_list_text ol").each do |ol| link = ol.css("li.title a").first title = link.inner_text - pages = ol.css("li.pg").first.try &.inner_text - matches = /(\d+)-(\d+)/.match pages - if matches - start_page = matches[1].to_i - end_page = matches[2].to_i + pages = ol.css("li.pg") + if pages.size > 0 + p = pages.first.try &.inner_text + matches = /(\d+)-(\d+)/.match p + if matches + start_page = matches[1].to_i + end_page = matches[2].to_i + end end ol.css("a").each do |l| diff --git a/src/journal.cr b/src/journal.cr index 45b6214..d431824 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -3,11 +3,12 @@ require "./issue.cr" module Muse::Dl class Journal - getter :info, :summary, :publisher, :issues + getter :info, :summary, :publisher, :issues, :title @info = Hash(String, String).new @summary : String @publisher : String @issues = [] of Muse::Dl::Issue + @title : String private getter :h @@ -16,6 +17,7 @@ module Muse::Dl @info = InfoParser.infobox(h) @summary = InfoParser.summary(h) @publisher = InfoParser.journal_publisher(h) + @title = InfoParser.journal_title(h) parse_volumes(h) end @@ -32,7 +34,9 @@ module Muse::Dl matches = /\/issue\/(\d+)/.match link if matches - @issues.push Muse::Dl::Issue.new matches[1] + issue = Muse::Dl::Issue.new matches[1] + issue.journal_title = @title + @issues.push issue end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 47095ab..19693a8 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -12,6 +12,7 @@ module Muse::Dl class Main def self.dl(parser : Parser) url = parser.url + puts "Downloading #{url}" thing = Fetch.get_info(url) if url return unless thing @@ -78,7 +79,7 @@ module Muse::Dl FileUtils.rm source if parser.cleanup elsif thing.is_a? Muse::Dl::Issue # Will have no effect if parser has a custom title - parser.output = Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" + parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" # If file exists and we can't clobber if File.exists?(parser.output) && parser.clobber == false @@ -88,7 +89,6 @@ module Muse::Dl temp_stitched_file = nil pdf_builder = Pdftk.new(parser.tmp) - # ## TODO till 111 thing.articles.each do |article| begin Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) @@ -101,21 +101,25 @@ module Muse::Dl # Stitch the PDFs together temp_stitched_file = pdf_builder.stitch_articles article_ids - # TODO: Add metadata for each Issue pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) # temp_stitched_file.delete if temp_stitched_file puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" - # Cleanup the chapter files - # TODO - # if parser.cleanup - # thing.articles.each do |c| - # Fetch.cleanup(parser.tmp, c[0]) - # end - # end - #### + # Cleanup the issue files + if parser.cleanup + thing.articles.each do |a| + Fetch.cleanup_articles(parser.tmp, a.id) + end + end + elsif thing.is_a? Muse::Dl::Journal + thing.issues.each do |issue| + # Update the issue + issue.parse + parser.url = issue.url + Main.dl parser + end end end diff --git a/src/parser.cr b/src/parser.cr index a98d27e..e462445 100644 --- a/src/parser.cr +++ b/src/parser.cr @@ -26,6 +26,10 @@ module Muse::Dl @output = output_file unless @output != DEFAULT_FILE_NAME end + def force_set_output(output_file : String) + @output = output_file + end + def reset_output_file @output = DEFAULT_FILE_NAME end From a05a1253db7cae57146a34ff451700f6275189cf Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 1 Jul 2020 18:26:48 +0530 Subject: [PATCH 19/19] Keep going with next issue --- src/muse-dl.cr | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 19693a8..93ccced 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -115,10 +115,15 @@ module Muse::Dl end elsif thing.is_a? Muse::Dl::Journal thing.issues.each do |issue| - # Update the issue - issue.parse - parser.url = issue.url - Main.dl parser + begin + # Update the issue + issue.parse + parser.url = issue.url + Main.dl parser + rescue e + puts e.message + puts "Faced an exception with previous issue, continuing" + end end end end