diff --git a/Makefile b/Makefile index 98a984e..5dcf3dd 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ release: # Then extract the image | extract the layer.tar file (we only have one layer) | extract the muse-dl-static file docker image save muse-dl-static | tar xf - --wildcards "*/layer.tar" -O | tar xf - "muse-dl-static" # And move it to the bin/ directory - mv -f muse-dl-static bin/ \ No newline at end of file + mv -f muse-dl-static bin/ + +test: + crystal spec diff --git a/shard.lock b/shard.lock index 9984f5f..80ce5b8 100644 --- a/shard.lock +++ b/shard.lock @@ -1,14 +1,18 @@ -version: 1.0 +version: 2.0 shards: crest: - github: mamantoha/crest - version: 0.24.1 + git: https://github.com/mamantoha/crest.git + version: 0.25.1 http-client-digest_auth: - github: mamantoha/http-client-digest_auth - version: 0.3.0 + git: https://github.com/mamantoha/http-client-digest_auth.git + version: 0.4.0 myhtml: - github: kostya/myhtml + git: https://github.com/kostya/myhtml.git version: 1.5.1 + webmock: + git: https://github.com/manastech/webmock.cr.git + version: 0.13.0+git.commit.bb3eab30f6c7d1fdc0a7ff14cd136d68e860d1a7 + diff --git a/shard.yml b/shard.yml index 37050c5..4e42cbc 100644 --- a/shard.yml +++ b/shard.yml @@ -15,4 +15,9 @@ dependencies: myhtml: github: kostya/myhtml crest: - github: mamantoha/crest \ No newline at end of file + github: mamantoha/crest + +development_dependencies: + webmock: + github: manastech/webmock.cr + branch: master \ No newline at end of file diff --git a/spec/fetch_spec.cr b/spec/fetch_spec.cr index 6e97dfd..abe1a72 100644 --- a/spec/fetch_spec.cr +++ b/spec/fetch_spec.cr @@ -1,7 +1,12 @@ require "./spec_helper" +require "webmock" # require "errors/muse_corrupt_pdf.cr" describe Muse::Dl::Book do + headers = {"Content-Type" => "text/html"} + WebMock.stub(:get, "https://muse.jhu.edu/chapter/2379787/pdf") + .to_return(body_io: File.new("spec/fixtures/chapter-2379787.html"), headers: headers) + it "should notice the unable to construct 
chapter PDF error" do f = "/tmp/chapter-2379787.pdf" File.delete(f) if File.exists? f diff --git a/spec/fixtures/chapter-2379787.html b/spec/fixtures/chapter-2379787.html new file mode 100644 index 0000000..2e5e3c2 --- /dev/null +++ b/spec/fixtures/chapter-2379787.html @@ -0,0 +1,359 @@ + + + + + + + + + + + + + + + + + Project MUSE + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + +
+ +
+ ErrorUnable to construct chapter PDF + +
+
+ + + + + + + +
+ Back To Top +
+ + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + + diff --git a/spec/fixtures/issue-35852.html b/spec/fixtures/issue-35852.html new file mode 100644 index 0000000..7e50f60 --- /dev/null +++ b/spec/fixtures/issue-35852.html @@ -0,0 +1,1263 @@ + + + + + + + + + + + + + + + Project MUSE - Constitutional Studies-Volume 1, Issue 2, 2016 + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + + + +
+
+ +
+ + + +
+ + + MUSE Alert +
+ + + + + +
+

In this Issue

+
+ +
+ + + +
+ + +
+
+
+

Table of Contents

+
+ + + +
+ +
+
+ + + +
+ + + + + +
+
+
    + +
  1. + The Limits of Veneration: Public Support for a New Constitutional Convention + +
  2. + + +
  3. + + William D. Blake, + + Sanford V. Levinson + +
  4. + + + +
  5. pp. 1-22
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Secession and Nullification as a Global Trend + +
  2. + + +
  3. + + Ran Hirschl + +
  4. + + + +
  5. pp. 23-40
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Challenging Constitutionalism in Post-Apartheid South Africa + +
  2. + + +
  3. + + Heinz Klug + +
  4. + + + +
  5. pp. 41-58
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Democracy by Lawsuit: Or, Can Litigation Alleviate the European Union’s “Democratic Deficit?” + +
  2. + + +
  3. + + Tommaso Pavone + +
  4. + + + +
  5. pp. 59-80
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Private Enforcement of Constitutional Guarantees in the Ku Klux Act of 1871 + +
  2. + + +
  3. + + Paul J. Gardner + +
  4. + + + +
  5. pp. 81-95
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + +
+
+
    + +
  1. + Sober Second Thoughts: Evaluating the History of Horizontal Judicial Review by the U.S. Supreme Court + +
  2. + + +
  3. + + Keith E. Whittington + +
  4. + + + +
  5. pp. 97-116
  6. + +
  7. +
    + + +open access + + + + contents +
    + + + +
  8. + + +
+
+
+ + + + + +
+
+ +
+
+ + + + + + + +
+ + + + + + + + +
+
+
+
+ +
+

Previous Issue

+
+

Volume 1, Issue 1, 2016

+ +
+
+ +
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 2474-9419 +
+
+ + +
+
+ Print ISSN +
+
+ 2474-9427 +
+
+ + + + + + + + +
+
+ Launched on MUSE +
+
+ 2017-02-21 +
+
+ + + + +
+
+ Open Access +
+
+ + Yes + +
+
+ + + + + + +
+
+
+ + +
+

Copyright

+
+ + + + + +
+ + + +
+

Additional Issue Materials

+
+ + +
+ + +
+
+ + + + +
+
+ + + + + + +
+ Back To Top +
+ + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + + diff --git a/spec/fixtures/issue-41793.html b/spec/fixtures/issue-41793.html new file mode 100644 index 0000000..753844a --- /dev/null +++ b/spec/fixtures/issue-41793.html @@ -0,0 +1,1603 @@ + + + + + + + + + + + + + + + Project MUSE - portal: Libraries and the Academy-Volume 20, Number 1, January 2020 + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + + + +
+
+ +
+ + + +
+ + + MUSE Alert +
+ + + + + +
+

In this Issue

+
+ +
+ + + +
+ + +
+
+
+

Table of Contents

+
+ + + +
+ +
+
+ + + +
+ + + + + +
+
+
    + +
  1. + Coming of Age: portal at 20 + +
  2. + + +
  3. + + Marianne Ryan + +
  4. + + + +
  5. pp. 1-5
  6. +
  7. DOI: 10.1353/pla.2020.0000
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Techniques to Imagine, Fund, and Build the Academic Library of Your Dreams + +
  2. + + +
  3. + + Janette S. Blackburn + +
  4. + + + +
  5. pp. 7-14
  6. +
  7. DOI: 10.1353/pla.2020.0001
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + The International Open Access Movement and Its Status in Pakistan + +
  2. + + +
  3. + + Arslan Sheikh + +
  4. + + + +
  5. pp. 15-31
  6. +
  7. DOI: 10.1353/pla.2020.0002
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Big Data and Academic Libraries: The Quest for Informed Decision-Making + +
  2. + + +
  3. + + Tiffini A. Travis, + + Christian Ramirez + +
  4. + + + +
  5. pp. 33-47
  6. +
  7. DOI: 10.1353/pla.2020.0003
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + From Service Role to Partnership: Faculty Voices on Collaboration with Librarians + +
  2. + + +
  3. + + Maria A. Perez-Stable, + + Judith M. Arnold, + + LuMarie F. Guth, + + Patricia Fravel Vander Meer + +
  4. + + + +
  5. pp. 49-72
  6. +
  7. DOI: 10.1353/pla.2020.0004
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Attitudes of North American Academics toward Open Access Scholarly Journals + +
  2. + + +
  3. + + Elizabeth D. Dalton, + + Carol Tenopir, + + Bo-Christer Björk + +
  4. + + + +
  5. pp. 73-100
  6. +
  7. DOI: 10.1353/pla.2020.0005
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + A Multi-Method Information Literacy Assessment Program: Foundation and Early Results + +
  2. + + +
  3. + + William H. Walters, + + Sarah E. Sheehan, + + Amy E. Handfield, + + Bernadette M. López-Fitzsimmons, + + Susanne Markgren, + + Laurin Paradise + +
  4. + + + +
  5. pp. 101-135
  6. +
  7. DOI: 10.1353/pla.2020.0006
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + The Problem with Grit: Dismantling Deficit Thinking in Library Instruction + +
  2. + + +
  3. + + Eamon Tewell + +
  4. + + + +
  5. pp. 137-159
  6. +
  7. DOI: 10.1353/pla.2020.0007
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + Apprenticing Researchers: Exploring Upper-Division Students' Information Literacy Competencies + +
  2. + + +
  3. + + Sara L. Davidson Squibb, + + Anne Zanzucchi + +
  4. + + + +
  5. pp. 161-185
  6. +
  7. DOI: 10.1353/pla.2020.0008
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + +
+
+
    + +
  1. + E-Book Information Behaviors and Formats among Graduate Students in Information Sciences + +
  2. + + +
  3. + + Daniel G. Tracy + +
  4. + + + +
  5. pp. 187-220
  6. +
  7. DOI: 10.1353/pla.2020.0009
  8. +
  9. +
    + + +free access + + + + +
    + + +
  10. + + +
+
+
+ + + + + +
+
+ +
+
+ + + + + + + +
+ + + + + + + + +
+
+
+
+ +
+

Previous Issue

+
+

Volume 19, Number 4, October 2019

+ +
+
+ +
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 1530-7131 +
+
+ + +
+
+ Print ISSN +
+
+ 1531-2542 +
+
+ + + + + + + + +
+
+ Launched on MUSE +
+
+ 2020-02-05 +
+
+ + + + +
+
+ Open Access +
+
+ + No + +
+
+ + + + + + +
+
+
+ + +
+

Copyright

+
+ + + + + +
+ + + +
+

Additional Issue Materials

+
+ + +
+ + +
+
+ + + + +
+
+ + + + + + +
+ Back To Top +
+ + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + diff --git a/spec/fixtures/journal-159.html b/spec/fixtures/journal-159.html new file mode 100644 index 0000000..e46aded --- /dev/null +++ b/spec/fixtures/journal-159.html @@ -0,0 +1,1522 @@ + + + + + + + + + + + + + + + Project MUSE - portal: Libraries and the Academy + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + +
+
+ + +
+ + +
+ + + MUSE Alert +
+ + + + +
+

About this Journal

+
+ +
+ + + +
+ +
+
+
+

Table of Contents

+
+
+ + + + +
+
+ + + + +

Volume 20, 2020

+
    + + +
  1. + Volume 20, Number 1, January 2020 + Free Access +
  2. + + + +

    Volume 19, 2019

    +
      + + +
    1. + Volume 19, Number 4, October 2019 + Free Access +
    2. + + + +
    3. + Volume 19, Number 3, July 2019 + Free Access +
    4. + + + +
    5. + Volume 19, Number 2, April 2019 + Free Access +
    6. + + + +
    7. + Volume 19, Number 1, January 2019 + Free Access +
    8. + + + +

      Volume 18, 2018

      +
        + + +
      1. + Volume 18, Number 4, October 2018 + Free Access +
      2. + + + +
      3. + Volume 18, Number 3, July 2018 + Free Access +
      4. + + + +
      5. + Volume 18, Number 2, April 2018 + Free Access +
      6. + + + +
      7. + Volume 18, Number 1, January 2018 + Free Access +
      8. + + + +

        Volume 17, 2017

        +
          + + +
        1. + Volume 17, Number 4, October 2017 + Free Access +
        2. + + + +
        3. + Volume 17, Number 3, July 2017 + Free Access +
        4. + + + +
        5. + Volume 17, Number 2, April 2017 + Free Access +
        6. + + + +
        7. + Volume 17, Number 1, January 2017 + Free Access +
        8. + + + +

          Volume 16, 2016

          +
            + + +
          1. + Volume 16, Number 4, October 2016 + Free Access +
          2. + + + +
          3. + Volume 16, Number 3, July 2016 + Free Access +
          4. + + + +
          5. + Volume 16, Number 2, April 2016 + Free Access +
          6. + + + +
          7. + Volume 16, Number 1, January 2016 + Free Access +
          8. + + + +

            Volume 15, 2015

            +
              + + +
            1. + Volume 15, Number 4, October 2015 + Free Access +
            2. + + + +
            3. + Volume 15, Number 3, July 2015 + Free Access +
            4. + + + +
            5. + Volume 15, Number 2, April 2015 + Free Access +
            6. + + + +
            7. + Volume 15, Number 1, January 2015 + Free Access +
            8. + + + +

              Volume 14, 2014

              +
                + + +
              1. + Volume 14, Number 4, October 2014 + Free Access +
              2. + + + +
              3. + Volume 14, Number 3, July 2014 + Free Access +
              4. + + + +
              5. + Volume 14, Number 2, April 2014 + Free Access +
              6. + + + +
              7. + Volume 14, Number 1, January 2014 + Free Access +
              8. + + + +

                Volume 13, 2013

                +
                  + + +
                1. + Volume 13, Number 4, October 2013 + Free Access +
                2. + + + +
                3. + Volume 13, Number 3, July 2013 + Free Access +
                4. + + + +
                5. + Volume 13, Number 2, April 2013 + Free Access +
                6. + + + +
                7. + Volume 13, Number 1, January 2013 + Free Access +
                8. + + + +

                  Volume 12, 2012

                  +
                    + + +
                  1. + Volume 12, Number 4, October 2012 + Free Access +
                  2. + + + +
                  3. + Volume 12, Number 3, July 2012 + Free Access +
                  4. + + + +
                  5. + Volume 12, Number 2, April 2012 + Free Access +
                  6. + + + +
                  7. + Volume 12, Number 1, January 2012 + Free Access +
                  8. + + + +

                    Volume 11, 2011

                    +
                      + + +
                    1. + Volume 11, Number 4, October 2011 + Free Access +
                    2. + + + +
                    3. + Volume 11, Number 3, July 2011 + Free Access +
                    4. + + + +
                    5. + Volume 11, Number 2, April 2011 + Free Access +
                    6. + + + +
                    7. + Volume 11, Number 1, January 2011 + Free Access +
                    8. + + + +

                      Volume 10, 2010

                      +
                        + + +
                      1. + Volume 10, Number 4, October 2010 + Free Access +
                      2. + + + +
                      3. + Volume 10, Number 3, July 2010 + Free Access +
                      4. + + + +
                      5. + Volume 10, Number 2, April 2010 + Free Access +
                      6. + + + +
                      7. + Volume 10, Number 1, January 2010 + Free Access +
                      8. + + + +

                        Volume 9, 2009

                        +
                          + + +
                        1. + Volume 9, Number 4, October 2009 + Free Access +
                        2. + + + +
                        3. + Volume 9, Number 3, July 2009 + Free Access +
                        4. + + + +
                        5. + Volume 9, Number 2, April 2009 + Free Access +
                        6. + + + +
                        7. + Volume 9, Number 1, January 2009 + Free Access +
                        8. + + + +

                          Volume 8, 2008

                          +
                            + + +
                          1. + Volume 8, Number 4, October 2008 + Free Access +
                          2. + + + +
                          3. + Volume 8, Number 3, July 2008 + Free Access +
                          4. + + + +
                          5. + Volume 8, Number 2, April 2008 + Free Access +
                          6. + + + +
                          7. + Volume 8, Number 1, January 2008 + Free Access +
                          8. + + + +

                            Volume 7, 2007

                            +
                              + + +
                            1. + Volume 7, Number 4, October 2007 + Free Access +
                            2. + + + +
                            3. + Volume 7, Number 3, July 2007 + Free Access +
                            4. + + + +
                            5. + Volume 7, Number 2, April 2007 + Free Access +
                            6. + + + +
                            7. + Volume 7, Number 1, January 2007 + Free Access +
                            8. + + + +

                              Volume 6, 2006

                              +
                                + + +
                              1. + Volume 6, Number 4, October 2006 + Free Access +
                              2. + + + +
                              3. + Volume 6, Number 3, July 2006 + Free Access +
                              4. + + + +
                              5. + Volume 6, Number 2, April 2006 + Free Access +
                              6. + + + +
                              7. + Volume 6, Number 1, January 2006 + Free Access +
                              8. + + + +

                                Volume 5, 2005

                                +
                                  + + +
                                1. + Volume 5, Number 4, October 2005 + Free Access +
                                2. + + + +
                                3. + Volume 5, Number 3, July 2005 + Free Access +
                                4. + + + +
                                5. + Volume 5, Number 2, April 2005 + Free Access +
                                6. + + + +
                                7. + Volume 5, Number 1, January 2005 + Free Access +
                                8. + + + +

                                  Volume 4, 2004

                                  +
                                    + + +
                                  1. + Volume 4, Number 4, October 2004 + Free Access +
                                  2. + + + +
                                  3. + Volume 4, Number 3, July 2004 + Free Access +
                                  4. + + + +
                                  5. + Volume 4, Number 2, April 2004 + Free Access +
                                  6. + + + +
                                  7. + Volume 4, Number 1, January 2004 + Free Access +
                                  8. + + + +

                                    Volume 3, 2003

                                    +
                                      + + +
                                    1. + Volume 3, Number 4, October 2003 + Free Access +
                                    2. + + + +
                                    3. + Volume 3, Number 3, July 2003 + Free Access +
                                    4. + + + +
                                    5. + Volume 3, Number 2, April 2003 + Free Access +
                                    6. + + + +
                                    7. + Volume 3, Number 1, January 2003 + Free Access +
                                    8. + + + +

                                      Volume 2, 2002

                                      +
                                        + + +
                                      1. + Volume 2, Number 4, October 2002 + Free Access +
                                      2. + + + +
                                      3. + Volume 2, Number 3, July 2002 + Free Access +
                                      4. + + + +
                                      5. + Volume 2, Number 2, April 2002 + Free Access +
                                      6. + + + +
                                      7. + Volume 2, Number 1, January 2002 + Free Access +
                                      8. + + + +

                                        Volume 1, 2001

                                        +
                                          + + +
                                        1. + Volume 1, Number 4, October 2001 + Free Access +
                                        2. + + + +
                                        3. + Volume 1, Number 3, July 2001 + Free Access +
                                        4. + + + +
                                        5. + Volume 1, Number 2, April 2001 + Free Access +
                                        6. + + + +
                                        7. + Volume 1, Number 1, January 2001 + Free Access +
                                        8. + +
                                        + + +
+ +
+ + +
+ + + + + +
+
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 1530-7131 +
+
+ + +
+
+ Print ISSN +
+
+ 1531-2542 +
+
+ + + + + + + + + +
+
+ Coverage Statement +
+
+ Vol. 1 (2001) through current issue +
+
+ + + +
+
+ Open Access +
+
+ + No + +
+
+ + + + + + +
+
+
+ + + + + +
+

Additional Materials

+
+ +
+ + + + + +
+

Additional Issue Materials

+
+
+ +
+ +
+ + +
+
+ + +
+ + + + + + +
+ Back To Top +
+ + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + diff --git a/spec/issue_spec.cr b/spec/issue_spec.cr new file mode 100644 index 0000000..5a42f69 --- /dev/null +++ b/spec/issue_spec.cr @@ -0,0 +1,85 @@ +require "../src/issue" +require "./spec_helper" +require "webmock" + +describe Muse::Dl::Issue do + WebMock.stub(:get, "https://muse.jhu.edu/issue/41793") + .to_return(body: File.new("spec/fixtures/issue-41793.html").gets_to_end) + + issue = Muse::Dl::Issue.new "41793" + issue.parse + + it "should initialize correctly" do + issue.id.should eq "41793" + issue.url.should eq "https://muse.jhu.edu/issue/41793" + end + + it "should parse info correctly" do + issue.info["ISSN"].should eq "1530-7131" + issue.info["Print ISSN"].should eq "1531-2542" + issue.info["Launched on MUSE"].should eq "2020-02-05" + issue.info["Open Access"].should eq "No" + issue.title.should eq "Volume 20, Number 1, January 2020" + end + + it "should parse title correctly" do + issue.volume.should eq "20" + issue.number.should eq "1" + issue.date.should eq "January 2020" + end + + it "should parser summary" do + issue.summary.should eq <<-EOT + Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. 
+ EOT + end + + it "should parse publisher" do + issue.publisher.should eq "Johns Hopkins University Press" + end + it "should parse the journal title" do + issue.journal_title.should eq "portal: Libraries and the Academy" + end + + it "should parse non-numbered issues" do + WebMock.stub(:get, "https://muse.jhu.edu/issue/35852") + .to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end) + issue = Muse::Dl::Issue.new "35852" + issue.parse + + issue.volume.should eq "1" + issue.number.should eq "2" + issue.date.should eq "2016" + + issue.info["ISSN"].should eq "2474-9419" + issue.info["Print ISSN"].should eq "2474-9427" + issue.info["Launched on MUSE"].should eq "2017-02-21" + issue.info["Open Access"].should eq "Yes" + issue.title.should eq "Volume 1, Issue 2, 2016" + issue.journal_title.should eq "Constitutional Studies" + + expected_pages = [ + [1, 22], + [23, 40], + [41, 58], + [59, 80], + [81, 95], + [97, 116], + ] + + expected_titles = [ + "The Limits of Veneration: Public Support for a New Constitutional Convention", + "Secession and Nullification as a Global Trend", + "Challenging Constitutionalism in Post-Apartheid South Africa", + "Democracy by Lawsuit: Or, Can Litigation Alleviate the European Union’s “Democratic Deficit?”", + "Private Enforcement of Constitutional Guarantees in the Ku Klux Act of 1871", + "Sober Second Thoughts: Evaluating the History of Horizontal Judicial Review by the U.S. 
Supreme Court", + ] + + issue.articles.each_with_index do |a, i| + a.start_page.should eq expected_pages[i][0] + a.end_page.should eq expected_pages[i][1] + a.title.should eq expected_titles[i] + end + end +end diff --git a/spec/journal_spec.cr b/spec/journal_spec.cr new file mode 100644 index 0000000..7e0e5ec --- /dev/null +++ b/spec/journal_spec.cr @@ -0,0 +1,28 @@ +require "./spec_helper" + +describe Muse::Dl::Journal do + html = File.new("spec/fixtures/journal-159.html").gets_to_end + j = Muse::Dl::Journal.new html + + it "it should parse the infobox for 159" do + j.info["ISSN"].should eq "1530-7131" + j.info["Print ISSN"].should eq "1531-2542" + j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue" + j.info["Open Access"].should eq "No" + end + + it "should parser summary" do + j.summary.should eq <<-EOT + Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. 
+ EOT + end + + it "should parse publisher" do + j.publisher.should eq "Johns Hopkins University Press" + end + + it "should return issues" do + j.issues[0].id.should eq "41793" + j.issues[-1].id.should eq "1578" + end +end diff --git a/spec/util_spec.cr b/spec/util_spec.cr new file mode 100644 index 0000000..c808941 --- /dev/null +++ b/spec/util_spec.cr @@ -0,0 +1,9 @@ +require "../src/util" +require "./spec_helper" + +describe Muse::Dl::Util do + it "should sanitize filenames properly" do + fn = Muse::Dl::Util.slug_filename("Hello world - \" :A$3, a story; a poem|chapter") + fn.should eq "Hello world - - -A-3, a story- a poem-chapter" + end +end diff --git a/src/article.cr b/src/article.cr new file mode 100644 index 0000000..bf6ac1c --- /dev/null +++ b/src/article.cr @@ -0,0 +1,19 @@ +require "./infoparser.cr" +require "./issue.cr" + +module Muse::Dl + class Article + getter id : String, :start_page, :end_page, :title + setter title : String | Nil, start_page : Int32 | Nil, end_page : Int32 | Nil + + def initialize(id : String) + @id = id + @url = "https://muse.jhu.edu/article/#{id}" + end + + # TODO: Fix this + def open_access + return false + end + end +end diff --git a/src/errors/missing_chapter.cr b/src/errors/missing_chapter.cr deleted file mode 100644 index b33487f..0000000 --- a/src/errors/missing_chapter.cr +++ /dev/null @@ -1,4 +0,0 @@ -module Muse::Dl::Errors - class MissingChapter < Exception - end -end diff --git a/src/errors/missing_file.cr b/src/errors/missing_file.cr new file mode 100644 index 0000000..f11f6fd --- /dev/null +++ b/src/errors/missing_file.cr @@ -0,0 +1,4 @@ +module Muse::Dl::Errors + class MissingFile < Exception + end +end diff --git a/src/fetch.cr b/src/fetch.cr index 004f9e0..213bdc5 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -14,6 +14,10 @@ module Muse::Dl "Connection" => "keep-alive", } + def self.article_file_name(id : String, tmp_path : String) + "#{tmp_path}/article-#{id}.pdf" + end + def self.chapter_file_name(id : 
String, tmp_path : String) "#{tmp_path}/chapter-#{id}.pdf" end @@ -23,24 +27,25 @@ module Muse::Dl File.delete(fns) if File.exists?(fns) end - def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) - final_pdf_file = chapter_file_name chapter_id, tmp_path - tmp_pdf_file = "#{final_pdf_file}.tmp" + def self.cleanup_articles(tmp_path : String, id : String) + fns = article_file_name(id, tmp_path) + File.delete(fns) if File.exists?(fns) + end - if File.exists? final_pdf_file - puts "#{chapter_id} already downloaded" + def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true) + tmp_pdf_file = "#{file_name}.tmp" + if File.exists? file_name + puts "#{file_name} already downloaded" return end - # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class - url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" uri = URI.parse(url) http_client = HTTP::Client.new(uri) # Raise a IO::TimeoutError after 60 seconds. http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS headers = HEADERS.merge({ - "Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf", + "Referer" => referer, }) if cookie @@ -52,7 +57,7 @@ module Muse::Dl begin response = request.execute rescue ex : IO::TimeoutError - raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") + raise Muse::Dl::Errors::DownloadError.new("Error downloading #{url}. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.") end # TODO: Add validation for the downloaded file (should be PDF) @@ -63,7 +68,6 @@ module Muse::Dl content_type = response.headers["Content-Type"] if content_type.is_a? 
String if /html/.match content_type - puts response response.body.each_line do |line| # https://muse.jhu.edu/chapter/2383438/pdf # https://muse.jhu.edu/book/67393 @@ -77,6 +81,7 @@ module Muse::Dl end end end + File.open(tmp_pdf_file, "w") do |file| file << response.body if file.size == 0 @@ -88,18 +93,41 @@ module Muse::Dl pdftk.strip_first_page tmp_pdf_file if strip_first_page - if add_bookmark + if bookmark_title # Run pdftk and add the bookmark to the file - pdftk.add_bookmark tmp_pdf_file, chapter_title.strip + pdftk.add_bookmark tmp_pdf_file, bookmark_title end # Now we can move the file to the proper PDF filename - File.rename tmp_pdf_file, final_pdf_file + File.rename tmp_pdf_file, file_name + end + + def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true) + final_pdf_file = chapter_file_name chapter_id, tmp_path + + if File.exists? final_pdf_file + puts "#{chapter_id} already downloaded" + return + end + + # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class + url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf" + referer = "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf" + + save_url(url, referer, final_pdf_file, tmp_path, cookie, chapter_title, strip_first_page) + puts "Downloaded #{chapter_id}" end - def self.get_info(url : String) : Muse::Dl::Thing | Nil - match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url + def self.save_article(tmp_path : String, article_id : String, cookie : String | Nil = nil, article_title = nil, strip_first_page = true) + file_name = article_file_name article_id, tmp_path + url = "https://muse.jhu.edu/article/#{article_id}/pdf" + referer = "https://muse.jhu.edu/article/#{article_id}" + save_url(url, referer, file_name, tmp_path, cookie, article_title, strip_first_page) + end + + def self.get_info(url : String) + match = 
/https:\/\/muse.jhu.edu\/(book|journal|issue|article)\/(\d+)/.match url if match begin response = Crest.get(url).to_s @@ -108,6 +136,10 @@ module Muse::Dl return Muse::Dl::Book.new response when "journal" return Muse::Dl::Journal.new response + when "issue" + return Muse::Dl::Issue.new match[2], response + when "article" + return Muse::Dl::Article.new match[2] end rescue ex : Crest::NotFound raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}") diff --git a/src/infoparser.cr b/src/infoparser.cr index c28252a..7e331db 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -34,6 +34,18 @@ module Muse::Dl myhtml.css("#book_about_info .title").map(&.inner_text).to_a[0].strip end + def self.issue_title(myhtml : Myhtml::Parser) + begin + myhtml.css(".card_text .title").map(&.inner_text).to_a[0].strip + rescue + nil + end + end + + def self.journal_title(myhtml : Myhtml::Parser) + myhtml.css("#journal_about_info .title").map(&.inner_text).to_a[0].strip + end + def self.author(myhtml : Myhtml::Parser) myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("
", ", ").gsub("\n", " ") end @@ -50,9 +62,13 @@ module Muse::Dl myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip end + def self.journal_publisher(myhtml : Myhtml::Parser) + myhtml.css(".card_publisher a").map(&.inner_text).to_a[0].strip + end + def self.summary(myhtml : Myhtml::Parser) begin - return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip + return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip rescue e : Exception STDERR.puts "Could not fetch summary" return "NA" diff --git a/src/issue.cr b/src/issue.cr new file mode 100644 index 0000000..73a83fb --- /dev/null +++ b/src/issue.cr @@ -0,0 +1,97 @@ +"./thing.cr" +require "./fetch.cr" +require "./article.cr" + +module Muse::Dl + class Issue + getter id : String, + title : String | Nil, + articles : Array(Muse::Dl::Article), + url : String, + summary : String | Nil, + publisher : String | Nil, + info : Hash(String, String), + volume : String | Nil, + number : String | Nil, + date : String | Nil, + journal_title : String | Nil + + setter :journal_title + + def initialize(id : String, response : String | Nil = nil) + @id = id + @url = "https://muse.jhu.edu/issue/#{id}" + @articles = [] of Muse::Dl::Article + @info = Hash(String, String).new + parse(response) if response + end + + def open_access + if @info.has_key? "Open Access" + return @info["Open Access"] == "Yes" + end + false + end + + def parse + html = Crest.get(@url).to_s + parse(html) + end + + def parse(html : String) + h = Myhtml::Parser.new html + @info = InfoParser.infobox(h) + @title = InfoParser.issue_title(h) + @summary = InfoParser.summary(h) + @publisher = InfoParser.journal_publisher(h) + parse_title + parse_contents(h) + end + + def parse_title + t = @title + unless t.nil? 
+ @volume = /Volume (\d+)/.match(t).try &.[1] + @number = /Number (\d+)/.match(t).try &.[1] + @number = /Issue (\d+)/.match(t).try &.[1] unless @number + @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Sring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1] + @date = /(\d{4})/.match(t).try &.[1] unless @date + end + end + + def parse_contents(myhtml : Myhtml::Parser) + unless @journal_title + journal_title_a = myhtml.css("#journal_banner_title a").first + if journal_title_a + @journal_title = journal_title_a.inner_text + end + end + myhtml.css(".articles_list_text ol").each do |ol| + link = ol.css("li.title a").first + title = link.inner_text + + pages = ol.css("li.pg") + if pages.size > 0 + p = pages.first.try &.inner_text + matches = /(\d+)-(\d+)/.match p + if matches + start_page = matches[1].to_i + end_page = matches[2].to_i + end + end + + ol.css("a").each do |l| + url = l.attribute_by("href").to_s + matches = /\/article\/(\d+)\/pdf/.match url + if matches + a = Muse::Dl::Article.new matches[1] + a.title = title + a.start_page = start_page if start_page + a.end_page = end_page if end_page + @articles.push a + end + end + end + end + end +end diff --git a/src/journal.cr b/src/journal.cr index 1f3323a..d431824 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -1,6 +1,44 @@ -require "./thing.cr" +require "./infoparser.cr" +require "./issue.cr" module Muse::Dl - class Journal < Muse::Dl::Thing + class Journal + getter :info, :summary, :publisher, :issues, :title + @info = Hash(String, String).new + @summary : String + @publisher : String + @issues = [] of Muse::Dl::Issue + @title : String + + private getter :h + + def initialize(html) + @h = Myhtml::Parser.new html + @info = InfoParser.infobox(h) + @summary = InfoParser.summary(h) + @publisher = InfoParser.journal_publisher(h) + @title = InfoParser.journal_title(h) + parse_volumes(h) + end + + def open_access + if @info.has_key? 
"Open Access" + return @info["Open Access"] == "Yes" + end + false + end + + def parse_volumes(myhtml : Myhtml::Parser) + myhtml.css("#available_issues_list_text a").each do |a| + link = a.attribute_by("href").to_s + + matches = /\/issue\/(\d+)/.match link + if matches + issue = Muse::Dl::Issue.new matches[1] + issue.journal_title = @title + @issues.push issue + end + end + end end end diff --git a/src/muse-dl.cr b/src/muse-dl.cr index 9b71f65..93ccced 100644 --- a/src/muse-dl.cr +++ b/src/muse-dl.cr @@ -4,6 +4,7 @@ require "./fetch.cr" require "./book.cr" require "./journal.cr" require "./util.cr" +require "file_utils" module Muse::Dl VERSION = "1.1.2" @@ -11,9 +12,15 @@ module Muse::Dl class Main def self.dl(parser : Parser) url = parser.url + puts "Downloading #{url}" thing = Fetch.get_info(url) if url return unless thing + if (thing.open_access) && (parser.skip_oa) + STDERR.puts "Skipping #{url}, available under Open Access" + return + end + if thing.is_a? Muse::Dl::Book unless thing.formats.includes? 
:pdf STDERR.puts "Book not available in PDF format, skipping: #{url}" @@ -30,28 +37,23 @@ module Muse::Dl temp_stitched_file = nil pdf_builder = Pdftk.new(parser.tmp) - unless parser.input_pdf - # Save each chapter - thing.chapters.each do |chapter| - begin - Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) - rescue e : Muse::Dl::Errors::MuseCorruptPDF - STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" - return - end + # Save each chapter + thing.chapters.each do |chapter| + begin + Fetch.save_chapter(parser.tmp, chapter[0], chapter[1], parser.cookie, parser.bookmarks, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return end - chapter_ids = thing.chapters.map { |c| c[0] } - - # Stitch the PDFs together - temp_stitched_file = pdf_builder.stitch chapter_ids - pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) - else - x = parser.input_pdf - pdf_builder.add_metadata(File.open(x), parser.output, thing) if x end + chapter_ids = thing.chapters.map { |c| c[0] } + + # Stitch the PDFs together + temp_stitched_file = pdf_builder.stitch chapter_ids + pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) temp_stitched_file.delete if temp_stitched_file - puts "--dont-strip-first-page was on. Please validate PDF file for any errors." + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first puts "DL: #{url}. Saved final output to #{parser.output}" # Cleanup the chapter files @@ -60,6 +62,69 @@ module Muse::Dl Fetch.cleanup(parser.tmp, c[0]) end end + elsif thing.is_a? 
Muse::Dl::Article + # No bookmarks are needed since this is just a single article PDF + begin + Fetch.save_article(parser.tmp, thing.id, parser.cookie, nil, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return + end + + # TODO: Move this code elsewhere + source = Fetch.article_file_name(thing.id, parser.tmp) + destination = "article-#{thing.id}.pdf" + # Needed because of https://github.com/crystal-lang/crystal/issues/7777 + FileUtils.cp source, destination + FileUtils.rm source if parser.cleanup + elsif thing.is_a? Muse::Dl::Issue + # Will have no effect if parser has a custom title + parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf" + + # If file exists and we can't clobber + if File.exists?(parser.output) && parser.clobber == false + STDERR.puts "Skipping #{url}, File already exists: #{parser.output}" + return + end + temp_stitched_file = nil + pdf_builder = Pdftk.new(parser.tmp) + + thing.articles.each do |article| + begin + Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first) + rescue e : Muse::Dl::Errors::MuseCorruptPDF + STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}" + return + end + end + article_ids = thing.articles.map { |a| a.id } + + # Stitch the PDFs together + temp_stitched_file = pdf_builder.stitch_articles article_ids + pdf_builder.add_metadata(temp_stitched_file, parser.output, thing) + + # temp_stitched_file.delete if temp_stitched_file + puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first + puts "DL: #{url}. Saved final output to #{parser.output}" + + # Cleanup the issue files + if parser.cleanup + thing.articles.each do |a| + Fetch.cleanup_articles(parser.tmp, a.id) + end + end + elsif thing.is_a? 
Muse::Dl::Journal + thing.issues.each do |issue| + begin + # Update the issue + issue.parse + parser.url = issue.url + Main.dl parser + rescue e + puts e.message + puts "Faced an exception with previous issue, continuing" + end + end end end diff --git a/src/parser.cr b/src/parser.cr index 658b23b..e462445 100644 --- a/src/parser.cr +++ b/src/parser.cr @@ -10,15 +10,15 @@ module Muse::Dl @strip_first = true @output = DEFAULT_FILE_NAME @url : String | Nil - @input_pdf : String | Nil @clobber = false @input_list : String | Nil @cookie : String | Nil @h : Bool | Nil + @skip_oa = false DEFAULT_FILE_NAME = "tempfilename.pdf" - getter :bookmarks, :tmp, :cleanup, :output, :url, :input_pdf, :clobber, :input_list, :cookie, :strip_first + getter :bookmarks, :tmp, :cleanup, :output, :url, :clobber, :input_list, :cookie, :strip_first, :skip_oa setter :url # Update the output filename unless we have a custom one passed @@ -26,6 +26,10 @@ module Muse::Dl @output = output_file unless @output != DEFAULT_FILE_NAME end + def force_set_output(output_file : String) + @output = output_file + end + def reset_output_file @output = DEFAULT_FILE_NAME end @@ -41,7 +45,6 @@ module Muse::Dl def initialize(arg : Array(String) = [] of String) @tmp = Dir.tempdir - @input_pdf = nil parser = OptionParser.new parser.banner = <<-EOT @@ -56,10 +59,10 @@ module Muse::Dl parser.on(long_flag = "--tmp-dir PATH", description = "Temporary Directory to use") { |path| @tmp = path } parser.on(long_flag = "--output FILE", description = "Output Filename") { |file| @output = file } parser.on(long_flag = "--no-bookmarks", description = "Don't add bookmarks in the PDF") { @bookmarks = false } - parser.on(long_flag = "--input-pdf INPUT", description = "Input Stitched PDF. Will not download anything") { |input| @input_pdf = input } parser.on(long_flag = "--clobber", description = "Overwrite the output file, if it already exists. 
Not compatible with input-pdf") { @clobber = true } parser.on(long_flag = "--dont-strip-first-page", description = "Disables first page from being stripped. Use carefully") { @strip_first = false } parser.on(long_flag = "--cookie COOKIE", description = "Cookie-header") { |cookie| @cookie = cookie } + parser.on(long_flag = "--skip-open-access", description = "Don't download open access content") { @skip_oa = true } parser.on("-h", "--help", "Show this help") { @h = true; puts parser } parser.unknown_args do |args| @@ -70,7 +73,6 @@ module Muse::Dl end if File.exists? args[0] @input_list = args[0] - @input_pdf = nil else @url = args[0] end diff --git a/src/pdftk.cr b/src/pdftk.cr index 961f439..41ebcf0 100644 --- a/src/pdftk.cr +++ b/src/pdftk.cr @@ -70,7 +70,6 @@ module Muse::Dl def add_metadata(input_file : File, output_file : String, book : Book) # First we have to dump the current metadata - metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") keywords = "Publisher:#{book.publisher}, Published:#{book.date}" # Known Info keys, if they are present @@ -80,34 +79,51 @@ module Muse::Dl end end - text = <<-EOT + metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author) + write_metadata(input_file, output_file, metadata_text) + end + + def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil) + metadata = <<-EOT InfoBegin InfoKey: Creator - InfoValue: Project MUSE (https://muse.jhu.edu/) + InfoValue: InfoBegin InfoKey: Producer - InfoValue: Muse-DL/#{Muse::Dl::VERSION} + InfoValue: InfoBegin InfoKey: Title - InfoValue: #{book.title} + InfoValue: #{title} InfoBegin InfoKey: Keywords InfoValue: #{keywords} InfoBegin - InfoKey: Author - InfoValue: #{book.author} - InfoBegin InfoKey: Subject - InfoValue: #{book.summary.gsub(/\n\s+/, " ")} + InfoValue: #{subject} InfoBegin InfoKey: ModDate InfoValue: InfoBegin InfoKey: CreationDate InfoValue: + EOT + unless author.nil? 
+ metadata += <<-EOT + InfoBegin + InfoKey: Author + InfoValue: #{author} + EOT + end + + return metadata + end + + def write_metadata(input_file : File, output_file : String, text) + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") File.write(metadata_text_file.path, text) + is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file] if !is_success raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.") @@ -115,11 +131,42 @@ module Muse::Dl metadata_text_file.delete end + def add_metadata(input_file : File, output_file : String, issue : Issue) + # First we have to dump the current metadata + metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt") + keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}" + ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label| + if issue.info.has_key? label + keywords += ", #{label}:#{issue.info[label]}" + end + end + + # TODO: Move this to Issue class + + s = issue.summary + unless s.nil? + summary = s.gsub(/\n\s+/, " ") + else + summary = "NA" + end + + t = issue.title + + unless t.nil? + title = t + else + title = "NA" + end + # TODO: Add support for all authors in the PDF + metadata = gen_metadata(title, keywords, summary) + write_metadata(input_file, output_file, metadata) + end + def stitch(chapter_ids : Array(String)) output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") # Do some sanity checks on each Chapter PDF chapter_ids.each do |id| - raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::MissingFile.new unless File.exists? 
Fetch.chapter_file_name(id, @tmp_file_path) raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0 end @@ -136,5 +183,28 @@ module Muse::Dl return output_file end + + # TODO: Merge with stitch + def stitch_articles(article_ids : Array(String)) + output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf") + # Do some sanity checks on each Chapter PDF + article_ids.each do |id| + raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.article_file_name(id, @tmp_file_path) + raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.article_file_name(id, @tmp_file_path)) > 0 + end + + # Now let's stitch them together + article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) } + args = article_files + ["cat", "output", output_file.path] + is_success = execute args + + # TODO: Validate final file here + if !is_success + puts args + raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.") + end + + return output_file + end end end diff --git a/src/thing.cr b/src/thing.cr index 96a105a..2b0e3fd 100644 --- a/src/thing.cr +++ b/src/thing.cr @@ -19,6 +19,13 @@ module Muse::Dl private getter :h + def open_access + if @info.has_key? "Open Access" + return @info["Open Access"] == "Yes" + end + false + end + def initialize(html : String) @h = Myhtml::Parser.new html @info = InfoParser.infobox(h) diff --git a/src/util.cr b/src/util.cr index 5ed414a..0ddd1ae 100644 --- a/src/util.cr +++ b/src/util.cr @@ -2,7 +2,7 @@ module Muse::Dl class Util # Generates a safe filename def self.slug_filename(input : String) - input.strip.tr("\u{202E}%$|:;/\t\r\n\\", "-") + input.strip.tr("\u{202E}%$|:;/\"\t\r\n\\", "-") end end end