From db6d1c2c822d1e864e113df270d8e247b3ea53ea Mon Sep 17 00:00:00 2001 From: Nemo Date: Sun, 29 Mar 2020 02:32:18 +0530 Subject: [PATCH] Chapter parsing in place for books --- spec/book_spec.cr | 21 +++++++++++++++++++++ src/book.cr | 20 ++++++++++++++++++++ src/thing.cr | 6 ++++-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/spec/book_spec.cr b/spec/book_spec.cr index 9feb6e8..0a94bef 100644 --- a/spec/book_spec.cr +++ b/spec/book_spec.cr @@ -29,6 +29,27 @@ describe Muse::Dl::Book do book.thumbnail_url.should eq "https://muse.jhu.edu/book/875/image/front_cover.jpg?format=180" end + it "should parse the chapters" do + book.chapters.should eq [ + ["16872", "Cover"], + ["16873", "Title Page"], + ["16874", "Copyright Page"], + ["16875", "Table of Contents"], + ["16876", "Acknowledgments"], + ["16877", "Introduction"], + ["16878", "Chapter 1: A Troubled Gentry"], + ["16879", "Chapter 2: Beyond the Plantations"], + ["16880", "Chapter 3: The World(s) Northern Neck Slavery Made"], + ["16881", "Chapter 4: The Scottish Merchants\n"], + ["16882", "Chapter 5: Controlling the Revolution\n"], + ["16883", "Chapter 6: The Evangelical Challenge"], + ["16884", "Chapter 7: The Preservation of Hegemony"], + ["16885", "Notes"], + ["16886", "Bibliography"], + ["16887", "Index"], + ] + end + it "it should parse the DOI for 68534" do html = File.new("spec/fixtures/book-68534.html").gets_to_end book = Muse::Dl::Book.new html diff --git a/src/book.cr b/src/book.cr index 90cce6b..6da03e9 100644 --- a/src/book.cr +++ b/src/book.cr @@ -2,5 +2,25 @@ require "./thing.cr" module Muse::Dl class Book < Muse::Dl::Thing + @chapters : Array(Array(String)) + + getter :chapters + + def initialize(html : String) + super(html) + @chapters = parts(@h) + end + + def parts(myhtml : Myhtml::Parser) + chapters = [] of Array(String) + myhtml.css(".title a").each do |a| + link = a.attribute_by("href").to_s + matches = /\/chapter\/(\d+)/.match link + if matches + chapters.push [matches[1], a.inner_text] + end + end + chapters + end end end diff --git a/src/thing.cr b/src/thing.cr index 1787180..663b958 100644 --- a/src/thing.cr +++ b/src/thing.cr @@ -12,11 +12,14 @@ module Muse::Dl @summary_html : String @cover_url : String @thumbnail_url : String + @h : Myhtml::Parser getter :info, :title, :author, :date, :publisher, :summary, :summary_html, :cover_url, :thumbnail_url + private getter :h + def initialize(html : String) - h = Myhtml::Parser.new html + @h = Myhtml::Parser.new html @info = InfoParser.infobox(h) @title = InfoParser.title(h) @author = InfoParser.author(h) @@ -24,7 +27,6 @@ module Muse::Dl @publisher = InfoParser.publisher(h) @summary = InfoParser.summary(h) @summary_html = InfoParser.summary_html(h) - @cover_url = "TODO" @thumbnail_url = "TODO" end