From d8702b2fcb3020fe65c92af16ac3d192f92ca8fb Mon Sep 17 00:00:00 2001 From: Nemo Date: Wed, 8 Apr 2020 00:38:31 +0530 Subject: [PATCH] Initial work on parsing the journal page --- spec/fixtures/journal-159.html | 1522 ++++++++++++++++++++++++++++++++ spec/journal_spec.cr | 23 + src/fetch.cr | 2 +- src/infoparser.cr | 6 +- src/issue.cr | 6 + src/journal.cr | 18 +- 6 files changed, 1573 insertions(+), 4 deletions(-) create mode 100644 spec/fixtures/journal-159.html create mode 100644 spec/journal_spec.cr create mode 100644 src/issue.cr diff --git a/spec/fixtures/journal-159.html b/spec/fixtures/journal-159.html new file mode 100644 index 0000000..e46aded --- /dev/null +++ b/spec/fixtures/journal-159.html @@ -0,0 +1,1522 @@ + + + + + + + + + + + + + + + Project MUSE - portal: Libraries and the Academy + + + + + + + + + + + + + + + + + + + + + + + + + Article + + + + + + + + + + + +
+
+ + +
+ + +
+ + + MUSE Alert +
+ + + + +
+

About this Journal

+
+ +
+ + + +
+ +
+
+
+

Table of Contents

+
+
+ + + + +
+
+ + + + +

Volume 20, 2020

+
    + + +
  1. + Volume 20, Number 1, January 2020 + Free Access +
  2. + + + +

    Volume 19, 2019

    +
      + + +
    1. + Volume 19, Number 4, October 2019 + Free Access +
    2. + + + +
    3. + Volume 19, Number 3, July 2019 + Free Access +
    4. + + + +
    5. + Volume 19, Number 2, April 2019 + Free Access +
    6. + + + +
    7. + Volume 19, Number 1, January 2019 + Free Access +
    8. + + + +

      Volume 18, 2018

      +
        + + +
      1. + Volume 18, Number 4, October 2018 + Free Access +
      2. + + + +
      3. + Volume 18, Number 3, July 2018 + Free Access +
      4. + + + +
      5. + Volume 18, Number 2, April 2018 + Free Access +
      6. + + + +
      7. + Volume 18, Number 1, January 2018 + Free Access +
      8. + + + +

        Volume 17, 2017

        +
          + + +
        1. + Volume 17, Number 4, October 2017 + Free Access +
        2. + + + +
        3. + Volume 17, Number 3, July 2017 + Free Access +
        4. + + + +
        5. + Volume 17, Number 2, April 2017 + Free Access +
        6. + + + +
        7. + Volume 17, Number 1, January 2017 + Free Access +
        8. + + + +

          Volume 16, 2016

          +
            + + +
          1. + Volume 16, Number 4, October 2016 + Free Access +
          2. + + + +
          3. + Volume 16, Number 3, July 2016 + Free Access +
          4. + + + +
          5. + Volume 16, Number 2, April 2016 + Free Access +
          6. + + + +
          7. + Volume 16, Number 1, January 2016 + Free Access +
          8. + + + +

            Volume 15, 2015

            +
              + + +
            1. + Volume 15, Number 4, October 2015 + Free Access +
            2. + + + +
            3. + Volume 15, Number 3, July 2015 + Free Access +
            4. + + + +
            5. + Volume 15, Number 2, April 2015 + Free Access +
            6. + + + +
            7. + Volume 15, Number 1, January 2015 + Free Access +
            8. + + + +

              Volume 14, 2014

              +
                + + +
              1. + Volume 14, Number 4, October 2014 + Free Access +
              2. + + + +
              3. + Volume 14, Number 3, July 2014 + Free Access +
              4. + + + +
              5. + Volume 14, Number 2, April 2014 + Free Access +
              6. + + + +
              7. + Volume 14, Number 1, January 2014 + Free Access +
              8. + + + +

                Volume 13, 2013

                +
                  + + +
                1. + Volume 13, Number 4, October 2013 + Free Access +
                2. + + + +
                3. + Volume 13, Number 3, July 2013 + Free Access +
                4. + + + +
                5. + Volume 13, Number 2, April 2013 + Free Access +
                6. + + + +
                7. + Volume 13, Number 1, January 2013 + Free Access +
                8. + + + +

                  Volume 12, 2012

                  +
                    + + +
                  1. + Volume 12, Number 4, October 2012 + Free Access +
                  2. + + + +
                  3. + Volume 12, Number 3, July 2012 + Free Access +
                  4. + + + +
                  5. + Volume 12, Number 2, April 2012 + Free Access +
                  6. + + + +
                  7. + Volume 12, Number 1, January 2012 + Free Access +
                  8. + + + +

                    Volume 11, 2011

                    +
                      + + +
                    1. + Volume 11, Number 4, October 2011 + Free Access +
                    2. + + + +
                    3. + Volume 11, Number 3, July 2011 + Free Access +
                    4. + + + +
                    5. + Volume 11, Number 2, April 2011 + Free Access +
                    6. + + + +
                    7. + Volume 11, Number 1, January 2011 + Free Access +
                    8. + + + +

                      Volume 10, 2010

                      +
                        + + +
                      1. + Volume 10, Number 4, October 2010 + Free Access +
                      2. + + + +
                      3. + Volume 10, Number 3, July 2010 + Free Access +
                      4. + + + +
                      5. + Volume 10, Number 2, April 2010 + Free Access +
                      6. + + + +
                      7. + Volume 10, Number 1, January 2010 + Free Access +
                      8. + + + +

                        Volume 9, 2009

                        +
                          + + +
                        1. + Volume 9, Number 4, October 2009 + Free Access +
                        2. + + + +
                        3. + Volume 9, Number 3, July 2009 + Free Access +
                        4. + + + +
                        5. + Volume 9, Number 2, April 2009 + Free Access +
                        6. + + + +
                        7. + Volume 9, Number 1, January 2009 + Free Access +
                        8. + + + +

                          Volume 8, 2008

                          +
                            + + +
                          1. + Volume 8, Number 4, October 2008 + Free Access +
                          2. + + + +
                          3. + Volume 8, Number 3, July 2008 + Free Access +
                          4. + + + +
                          5. + Volume 8, Number 2, April 2008 + Free Access +
                          6. + + + +
                          7. + Volume 8, Number 1, January 2008 + Free Access +
                          8. + + + +

                            Volume 7, 2007

                            +
                              + + +
                            1. + Volume 7, Number 4, October 2007 + Free Access +
                            2. + + + +
                            3. + Volume 7, Number 3, July 2007 + Free Access +
                            4. + + + +
                            5. + Volume 7, Number 2, April 2007 + Free Access +
                            6. + + + +
                            7. + Volume 7, Number 1, January 2007 + Free Access +
                            8. + + + +

                              Volume 6, 2006

                              +
                                + + +
                              1. + Volume 6, Number 4, October 2006 + Free Access +
                              2. + + + +
                              3. + Volume 6, Number 3, July 2006 + Free Access +
                              4. + + + +
                              5. + Volume 6, Number 2, April 2006 + Free Access +
                              6. + + + +
                              7. + Volume 6, Number 1, January 2006 + Free Access +
                              8. + + + +

                                Volume 5, 2005

                                +
                                  + + +
                                1. + Volume 5, Number 4, October 2005 + Free Access +
                                2. + + + +
                                3. + Volume 5, Number 3, July 2005 + Free Access +
                                4. + + + +
                                5. + Volume 5, Number 2, April 2005 + Free Access +
                                6. + + + +
                                7. + Volume 5, Number 1, January 2005 + Free Access +
                                8. + + + +

                                  Volume 4, 2004

                                  +
                                    + + +
                                  1. + Volume 4, Number 4, October 2004 + Free Access +
                                  2. + + + +
                                  3. + Volume 4, Number 3, July 2004 + Free Access +
                                  4. + + + +
                                  5. + Volume 4, Number 2, April 2004 + Free Access +
                                  6. + + + +
                                  7. + Volume 4, Number 1, January 2004 + Free Access +
                                  8. + + + +

                                    Volume 3, 2003

                                    +
                                      + + +
                                    1. + Volume 3, Number 4, October 2003 + Free Access +
                                    2. + + + +
                                    3. + Volume 3, Number 3, July 2003 + Free Access +
                                    4. + + + +
                                    5. + Volume 3, Number 2, April 2003 + Free Access +
                                    6. + + + +
                                    7. + Volume 3, Number 1, January 2003 + Free Access +
                                    8. + + + +

                                      Volume 2, 2002

                                      +
                                        + + +
                                      1. + Volume 2, Number 4, October 2002 + Free Access +
                                      2. + + + +
                                      3. + Volume 2, Number 3, July 2002 + Free Access +
                                      4. + + + +
                                      5. + Volume 2, Number 2, April 2002 + Free Access +
                                      6. + + + +
                                      7. + Volume 2, Number 1, January 2002 + Free Access +
                                      8. + + + +

                                        Volume 1, 2001

                                        +
                                          + + +
                                        1. + Volume 1, Number 4, October 2001 + Free Access +
                                        2. + + + +
                                        3. + Volume 1, Number 3, July 2001 + Free Access +
                                        4. + + + +
                                        5. + Volume 1, Number 2, April 2001 + Free Access +
                                        6. + + + +
                                        7. + Volume 1, Number 1, January 2001 + Free Access +
                                        8. + +
                                        + + +
+ +
+ + +
+ + + + + +
+
+
+ +
+
+
+

Additional Information

+
+
+ +
+
+ ISSN +
+
+ 1530-7131 +
+
+ + +
+
+ Print ISSN +
+
+ 1531-2542 +
+
+ + + + + + + + + +
+
+ Coverage Statement +
+
+ Vol. 1 (2001) through current issue +
+
+ + + +
+
+ Open Access +
+
+ + No + +
+
+ + + + + + +
+
+
+ + + + + +
+

Additional Materials

+
+ +
+ + + + + +
+

Additional Issue Materials

+
+
+ +
+ +
+ + +
+
+ + +
+ + + + + + + + + + + + + + +
+

This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.

+ + +
+ + + + + + + + + + + + + diff --git a/spec/journal_spec.cr b/spec/journal_spec.cr new file mode 100644 index 0000000..bee1b04 --- /dev/null +++ b/spec/journal_spec.cr @@ -0,0 +1,23 @@ +require "./spec_helper" + +describe Muse::Dl::Journal do + html = File.new("spec/fixtures/journal-159.html").gets_to_end + j = Muse::Dl::Journal.new html + + it "it should parse the infobox for 159" do + j.info["ISSN"].should eq "1530-7131" + j.info["Print ISSN"].should eq "1531-2542" + j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue" + j.info["Open Access"].should eq "No" + end + + it "should parser summary" do + j.summary.should eq <<-EOT + Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association. + EOT + end + + it "should parse publisher" do + j.publisher.should eq "Johns Hopkins University Press" + end +end diff --git a/src/fetch.cr b/src/fetch.cr index 004f9e0..6bc987a 100644 --- a/src/fetch.cr +++ b/src/fetch.cr @@ -98,7 +98,7 @@ module Muse::Dl puts "Downloaded #{chapter_id}" end - def self.get_info(url : String) : Muse::Dl::Thing | Nil + def self.get_info(url : String) match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url if match begin diff --git a/src/infoparser.cr b/src/infoparser.cr index c28252a..514fb91 100644 --- a/src/infoparser.cr +++ b/src/infoparser.cr @@ -50,9 +50,13 @@ module Muse::Dl myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip end + def self.journal_publisher(myhtml : Myhtml::Parser) + myhtml.css(".card_publisher a").map(&.inner_text).to_a[0].strip + end + def self.summary(myhtml : Myhtml::Parser) begin - return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip + return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip rescue e : Exception STDERR.puts "Could not fetch summary" return "NA" diff --git a/src/issue.cr b/src/issue.cr new file mode 100644 index 0000000..99e65a3 --- /dev/null +++ b/src/issue.cr @@ -0,0 +1,6 @@ +require "./thing.cr" + +module Muse::Dl + class Issue < Muse::Dl::Thing + end +end diff --git a/src/journal.cr b/src/journal.cr index 1f3323a..55c5eca 100644 --- a/src/journal.cr +++ b/src/journal.cr @@ -1,6 +1,20 @@ -require "./thing.cr" +require "./infoparser.cr" +require "myhtml" module Muse::Dl - class Journal < Muse::Dl::Thing + class Journal + getter :info, :summary, :publisher + @info = Hash(String, String).new + @summary : String + @publisher : String + + private getter :h + + def initialize(html) + @h = Myhtml::Parser.new html + @info = InfoParser.infobox(h) + @summary = InfoParser.summary(h) + @publisher = InfoParser.journal_publisher(h) + end end end