Initial work on parsing the journal page

pull/8/head
Nemo 3 years ago
parent fcc4f0c48b
commit d8702b2fcb
  1. 1522
      spec/fixtures/journal-159.html
  2. 23
      spec/journal_spec.cr
  3. 2
      src/fetch.cr
  4. 6
      src/infoparser.cr
  5. 6
      src/issue.cr
  6. 18
      src/journal.cr

File diff suppressed because it is too large Load Diff

@ -0,0 +1,23 @@
require "./spec_helper"
# Specs for Muse::Dl::Journal, driven by the saved HTML fixture for journal 159.
describe Muse::Dl::Journal do
  # File.read returns the whole file as a String and closes the handle,
  # unlike File.new(...).gets_to_end, which leaves the File open.
  html = File.read("spec/fixtures/journal-159.html")
  j = Muse::Dl::Journal.new html

  it "should parse the infobox for 159" do
    j.info["ISSN"].should eq "1530-7131"
    j.info["Print ISSN"].should eq "1531-2542"
    j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue"
    j.info["Open Access"].should eq "No"
  end

  it "should parse summary" do
    # The heredoc's leading whitespace is stripped up to the indentation of the
    # closing EOT, so the expected string starts at "Focusing".
    j.summary.should eq <<-EOT
      Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association.
      EOT
  end

  it "should parse publisher" do
    j.publisher.should eq "Johns Hopkins University Press"
  end
end

@ -98,7 +98,7 @@ module Muse::Dl
puts "Downloaded #{chapter_id}"
end
def self.get_info(url : String) : Muse::Dl::Thing | Nil
def self.get_info(url : String)
match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url
if match
begin

@ -50,9 +50,13 @@ module Muse::Dl
myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip
end
# Returns the stripped text of the first `.card_publisher a` element in the
# parsed document. Indexing the empty result raises when nothing matches.
def self.journal_publisher(myhtml : Myhtml::Parser)
  publisher_links = myhtml.css(".card_publisher a")
  link_texts = publisher_links.map(&.inner_text).to_a
  link_texts[0].strip
end
def self.summary(myhtml : Myhtml::Parser)
begin
return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip
return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip
rescue e : Exception
STDERR.puts "Could not fetch summary"
return "NA"

@ -0,0 +1,6 @@
require "./thing.cr"
module Muse::Dl
  # Empty subclass of Muse::Dl::Thing; it declares no members of its own here.
  class Issue < Muse::Dl::Thing
  end
end

@ -1,6 +1,20 @@
require "./thing.cr"
require "./infoparser.cr"
require "myhtml"
module Muse::Dl
# Holds the fields parsed out of a journal page's HTML: the infobox hash,
# the summary text and the publisher name.
#
# NOTE(review): the two consecutive `class Journal` headers below are matched
# by only three closing `end`s. This looks like the removed and the added line
# of a diff whose +/- markers were lost — confirm which header is current.
class Journal < Muse::Dl::Thing
class Journal
# Public readers for the parsed fields.
getter :info, :summary, :publisher
# Starts as an empty String => String hash; overwritten in #initialize.
@info = Hash(String, String).new
@summary : String
@publisher : String
# Reader for the parser instance, private to this class.
private getter :h
# Builds a Myhtml::Parser from `html` and fills the three fields by calling
# the InfoParser helpers on that parser.
def initialize(html)
@h = Myhtml::Parser.new html
@info = InfoParser.infobox(h)
@summary = InfoParser.summary(h)
@publisher = InfoParser.journal_publisher(h)
end
end
end

Loading…
Cancel
Save