Initial work on parsing the journal page

This commit is contained in:
Nemo 2020-04-08 00:38:31 +05:30
parent fcc4f0c48b
commit d8702b2fcb
6 changed files with 1573 additions and 4 deletions

1522
spec/fixtures/journal-159.html vendored Normal file

File diff suppressed because it is too large Load Diff

23
spec/journal_spec.cr Normal file
View File

@ -0,0 +1,23 @@
require "./spec_helper"
# Specs for Muse::Dl::Journal, driven by the saved HTML fixture for journal 159.
describe Muse::Dl::Journal do
  # File.read returns the whole fixture and closes the file handle itself,
  # so nothing is left open for the rest of the spec run.
  html = File.read("spec/fixtures/journal-159.html")
  j = Muse::Dl::Journal.new html

  # The infobox is exposed as a String => String hash keyed by the label text.
  it "should parse the infobox for 159" do
    j.info["ISSN"].should eq "1530-7131"
    j.info["Print ISSN"].should eq "1531-2542"
    j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue"
    j.info["Open Access"].should eq "No"
  end

  # The heredoc's closing delimiter is aligned with its content, so the
  # expected string carries no leading whitespace.
  it "should parse summary" do
    j.summary.should eq <<-EOT
      Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association.
      EOT
  end

  it "should parse publisher" do
    j.publisher.should eq "Johns Hopkins University Press"
  end
end

View File

@ -98,7 +98,7 @@ module Muse::Dl
puts "Downloaded #{chapter_id}" puts "Downloaded #{chapter_id}"
end end
def self.get_info(url : String) : Muse::Dl::Thing | Nil def self.get_info(url : String)
match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url
if match if match
begin begin

View File

@ -50,9 +50,13 @@ module Muse::Dl
myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip
end end
# Returns the publisher name shown on a journal page: the whitespace-stripped
# inner text of the first `a` element matched by ".card_publisher a".
# Indexing with [0] raises when the selector matches nothing.
def self.journal_publisher(myhtml : Myhtml::Parser)
  publisher_links = myhtml.css(".card_publisher a")
  link_texts = publisher_links.map(&.inner_text).to_a
  link_texts[0].strip
end
def self.summary(myhtml : Myhtml::Parser) def self.summary(myhtml : Myhtml::Parser)
begin begin
return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip
rescue e : Exception rescue e : Exception
STDERR.puts "Could not fetch summary" STDERR.puts "Could not fetch summary"
return "NA" return "NA"

6
src/issue.cr Normal file
View File

@ -0,0 +1,6 @@
require "./thing.cr"
module Muse::Dl
# Issue inherits everything from Muse::Dl::Thing and adds no members of its own yet.
# NOTE(review): presumably represents a single issue of a journal — confirm once implemented.
class Issue < Muse::Dl::Thing
end
end

View File

@ -1,6 +1,20 @@
require "./thing.cr" require "./infoparser.cr"
require "myhtml"
module Muse::Dl module Muse::Dl
class Journal < Muse::Dl::Thing class Journal
# Public readers for the three values filled in by the constructor.
getter :info, :summary, :publisher
# Key/value pairs taken from the page's infobox; defaults to an empty hash.
@info = Hash(String, String).new
# Summary text as returned by InfoParser.summary.
@summary : String
# Publisher name as returned by InfoParser.journal_publisher.
@publisher : String
# Private reader for @h, the Myhtml::Parser built from the page HTML.
private getter :h
# Parses the given journal page HTML once and extracts the infobox,
# summary and publisher from the parsed document through InfoParser.
def initialize(html)
  # Assigned directly from `.new` so the instance variable's type is inferred.
  @h = Myhtml::Parser.new(html)
  @info = InfoParser.infobox @h
  @summary = InfoParser.summary @h
  @publisher = InfoParser.journal_publisher @h
end
end end
end end