mirror of https://github.com/captn3m0/muse-dl.git
Merge pull request #8 from captn3m0/journal-support
Adds Journal Support
This commit is contained in:
commit
a4f5c03912
3
Makefile
3
Makefile
|
@ -8,3 +8,6 @@ release:
|
|||
docker image save muse-dl-static | tar xf - --wildcards "*/layer.tar" -O | tar xf - "muse-dl-static"
|
||||
# And move it to the bin/ directory
|
||||
mv -f muse-dl-static bin/
|
||||
|
||||
test:
|
||||
crystal spec
|
||||
|
|
16
shard.lock
16
shard.lock
|
@ -1,14 +1,18 @@
|
|||
version: 1.0
|
||||
version: 2.0
|
||||
shards:
|
||||
crest:
|
||||
github: mamantoha/crest
|
||||
version: 0.24.1
|
||||
git: https://github.com/mamantoha/crest.git
|
||||
version: 0.25.1
|
||||
|
||||
http-client-digest_auth:
|
||||
github: mamantoha/http-client-digest_auth
|
||||
version: 0.3.0
|
||||
git: https://github.com/mamantoha/http-client-digest_auth.git
|
||||
version: 0.4.0
|
||||
|
||||
myhtml:
|
||||
github: kostya/myhtml
|
||||
git: https://github.com/kostya/myhtml.git
|
||||
version: 1.5.1
|
||||
|
||||
webmock:
|
||||
git: https://github.com/manastech/webmock.cr.git
|
||||
version: 0.13.0+git.commit.bb3eab30f6c7d1fdc0a7ff14cd136d68e860d1a7
|
||||
|
||||
|
|
|
@ -16,3 +16,8 @@ dependencies:
|
|||
github: kostya/myhtml
|
||||
crest:
|
||||
github: mamantoha/crest
|
||||
|
||||
development_dependencies:
|
||||
webmock:
|
||||
github: manastech/webmock.cr
|
||||
branch: master
|
|
@ -1,7 +1,12 @@
|
|||
require "./spec_helper"
|
||||
require "webmock"
|
||||
# require "errors/muse_corrupt_pdf.cr"
|
||||
|
||||
describe Muse::Dl::Book do
|
||||
headers = {"Content-Type" => "text/html"}
|
||||
WebMock.stub(:get, "https://muse.jhu.edu/chapter/2379787/pdf")
|
||||
.to_return(body_io: File.new("spec/fixtures/chapter-2379787.html"), headers: headers)
|
||||
|
||||
it "should notice the unable to construct chapter PDF error" do
|
||||
f = "/tmp/chapter-2379787.pdf"
|
||||
File.delete(f) if File.exists? f
|
||||
|
|
|
@ -0,0 +1,359 @@
|
|||
<style>
|
||||
.page404 {
|
||||
display: table;
|
||||
width: 100%;
|
||||
padding: 60px 4em;
|
||||
min-height: 350px;
|
||||
}
|
||||
.page404 .int {
|
||||
display: table-cell;
|
||||
vertical-align: middle;
|
||||
text-align: left;
|
||||
}
|
||||
.page404 h4 {
|
||||
margin-bottom: 10px;
|
||||
font-weight: 700;
|
||||
}
|
||||
.page404 .logo {
|
||||
display: table-cell;
|
||||
width: 23%;
|
||||
vertical-align: middle;
|
||||
padding-right: 30px;
|
||||
}
|
||||
.page404 blockquote {
|
||||
border: none;
|
||||
padding-left: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<!-- Global site tag (gtag.js) - Google Analytics -->
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-58347753-2"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
gtag('js', new Date());
|
||||
gtag('config', 'UA-58347753-2');
|
||||
</script>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<meta property="og:image" content="/images/muselogo_dark.jpg" />
|
||||
|
||||
|
||||
|
||||
<title>Project MUSE</title>
|
||||
<link rel="search" type="application/opensearchdescription+xml" title="Search Project MUSE from your browser's Searchbar" href="/plugins/muse-opensearch.xml" />
|
||||
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/css/normalize.css"/>
|
||||
<link href="/css/jquery.qtip2.css" rel="stylesheet" type="text/css" />
|
||||
<!-- foundation 6.4.1 custom float/typ/vis 250rem max width 30col float grid -->
|
||||
<link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:300,400,400i,600,600i,700,700i" rel="stylesheet">
|
||||
<link rel="stylesheet" type="text/css" href="/css/foundation.min.css"/>
|
||||
<link rel="stylesheet" type="text/css" href="/css/style_home2.css?031820"/>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="/js/jquery3.js"></script>
|
||||
<script type="text/javascript" src="/js/pre.js"></script>
|
||||
<script type="text/javascript" src="/js/core/head.js?new"></script>
|
||||
|
||||
<script type="text/javascript" src="https://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4ecb5479089cb81a"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<title>Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<a id="skip" href="#skip_target">[Skip to Content]</a>
|
||||
<span id="top"></span>
|
||||
<div id="header" role="banner" aria-label="header">
|
||||
<div class="row wrap" id="institution_banner">
|
||||
<div class="content">
|
||||
<div id="institution_wrap" class="columns small-15 medium-text-left">
|
||||
<div id="institution" class="img_text_col">
|
||||
<div class="img_contain_left"><img src="/images/institution.png" alt="institution icon" /></div>
|
||||
<div class="text_contain_left"><span class="small"><a href='/account' class='color_white login_status'>Institutional Login</a></span></div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="person_wrap" class="columns small-15">
|
||||
<div id="person" class="img_text_col">
|
||||
<div class="img_contain_right"><img src="/images/person.png" alt="account icon" /></div>
|
||||
<div class="text_contain_right"><span class="small"><a href="/account/" class="color_white login_status" onclick="gtag('event', 'click', {'event_category': 'Account link', 'event_label': 'account name link - header'});">LOG IN</a></span></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="row wrap" id="search_banner">
|
||||
<div class="content">
|
||||
<div class="medium-4 small-4 columns" id="header_logo_wrap">
|
||||
<div id="header_logo">
|
||||
<a href="/"><img src="/images/muselogo.png" alt="Project MUSE" class="show-for-large"/>
|
||||
<img src="/images/muselogo_notext.png" alt="Project MUSE" class="hide-for-large"/></a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="medium-21 small-22 columns" id="search_bar_wrap">
|
||||
<div class="row">
|
||||
<div id="browse_button_wrap">
|
||||
<a id="browse_button" href="/browse" onclick="gtag('event', 'click', {'event_category': 'Browse link', 'event_label': 'browse button - header'});"><span class="small">browse</span></a>
|
||||
</div>
|
||||
<div id="or_text_wrap" class="show-for-medium">
|
||||
<div id="or_text">
|
||||
<span class="small">or</span>
|
||||
</div>
|
||||
</div>
|
||||
<div id="search_input_wrap" class="small-30">
|
||||
<div id="search_input">
|
||||
|
||||
<noscript>
|
||||
<form method="post" action="/search/">
|
||||
<input name="no_js_header_query"/>
|
||||
<input type="hidden" name="action" value="search"/>
|
||||
<input type="hidden" name="t" value="header"/>
|
||||
<a id="search_button">
|
||||
|
||||
<input type="image" src="/images/search_white.png" alt="Search icon"/>
|
||||
|
||||
</a>
|
||||
</form>
|
||||
</noscript>
|
||||
|
||||
<script>document.write('<input name="search_input_header" id="search_input_header" aria-label="search input"/>');</script>
|
||||
|
||||
<script>document.write('<a id="search_button"><img src="/images/search_white.png" alt="Search icon"/></a>');</script>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="medium-5 small-4 columns" id="menu_wrap">
|
||||
<div id="menu" class="menu-btn">
|
||||
<div class="nav-toggle">
|
||||
<div class="nav-toggle-btn">
|
||||
<a href="#" class="menu-icon-wrap">
|
||||
<span class="icon"></span>
|
||||
<span class="small show-for-large">menu</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div class="nav-mobile">
|
||||
<a href="/search">Advanced Search</a>
|
||||
<a href="/browse">Browse</a>
|
||||
<script>
|
||||
document.write('<div class="accordion">');
|
||||
</script>
|
||||
<noscript>
|
||||
<div class="accordion noscript">
|
||||
</noscript>
|
||||
<a href="#" class="acc_trig open"><span>MyMUSE Account</span></a>
|
||||
<div class="acc_block">
|
||||
<a href="/account">Log In / Sign Up</a>
|
||||
<a href="/account/change">Change My Account</a>
|
||||
<a href="/account/user_settings">User Settings</a>
|
||||
<a href="/account/">Access via Institution</a>
|
||||
<a href="/account/saved_items">MyMUSE Library</a>
|
||||
<a href="/account/search_history">Search History</a>
|
||||
<a href="/account/view_history">View History</a>
|
||||
<a href="/account/purchase_history">Purchase History</a>
|
||||
<a href="/account/alerts">MyMUSE Alerts</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="nav-mobile-footer">
|
||||
<!--<a class="modal_trigger">Contact Support</a>-->
|
||||
<a href="/contact">Contact Support</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="page404" id="main">
|
||||
<div class="logo">
|
||||
<img src="/images/muselogo_notext.png" alt="MUSE logo">
|
||||
</div>
|
||||
<div class="int">
|
||||
<html><head><title>Error</title></head><body>Unable to construct chapter PDF</body></html>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="footer_block" role="banner" aria-label="footer">
|
||||
<div class="content">
|
||||
<div class="wrap row" id="about_wrap">
|
||||
<div id="about">
|
||||
<h3>Project MUSE Mission</h3>
|
||||
<p>Project MUSE promotes the creation and dissemination of essential humanities and social science resources through collaboration with libraries, publishers, and scholars worldwide. Forged from a partnership between a university press and a library, Project MUSE is a trusted part of the academic and scholarly community it serves.</p>
|
||||
</div>
|
||||
<div id="about_logo" class="columns medium-10 show-for-large">
|
||||
<img src="/images/muselogo_notext.png" alt="MUSE logo"/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="footer_main">
|
||||
<div class="footer_item_color wrap">
|
||||
<div class="footer_item_left">
|
||||
<div class="group">
|
||||
<div class="footer_item_about cont_sub">
|
||||
<h5 class="small">about</h5>
|
||||
<ul>
|
||||
<li><a href="https://about.muse.jhu.edu/publishers">Publishers</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/about/discovery-partners/">Discovery Partners</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/about/advisory-board/">Advisory Board</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/about/journal-subscribers/">Journal Subscribers</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/about/book-customers">Book Customers</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/about/at-conferences/">Conferences</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="footer_item_res cont_sub">
|
||||
<h5 class="small">resources</h5>
|
||||
<ul>
|
||||
<li><a href="https://about.muse.jhu.edu/resources/news/">News & Announcements</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/resources/promotional-materials">Promotional Material</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/resources/alerts">Get Alerts</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/resources/muse-presentations">Presentations</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
<div class="group">
|
||||
<div class="footer_item_what cont_sub">
|
||||
<h5 class="small">what's on muse</h5>
|
||||
<ul>
|
||||
<li><a href="https://about.muse.jhu.edu/muse">Open Access</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/pub/journals">Journals</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/pub/books">Books</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="footer_item_info cont_sub">
|
||||
<h5 class="small">information for</h5>
|
||||
<ul>
|
||||
<li><a href="https://about.muse.jhu.edu/publishers">Publishers</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/librarians">Librarians</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/individuals">Individuals</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="footer_item_right">
|
||||
<div class="group">
|
||||
<div class="footer_item_social cont_sub">
|
||||
<h5 class="small">Contact</h5>
|
||||
<ul>
|
||||
<li class="clear"><a href="/contact">Contact Us</a></li>
|
||||
<li><a href="https://about.muse.jhu.edu/resources/help-overview">Help</a></li>
|
||||
</ul>
|
||||
<ul>
|
||||
<li>
|
||||
<ol class="social_icons">
|
||||
<li class="list_h"><a href="https://www.facebook.com/ProjectMUSE"><img src="/images/footer_icon_fb.png" alt="Facebook" /></a></li>
|
||||
<li class="list_h"><a href="https://www.linkedin.com/company/projectmuse/"><img src="/images/footer_icon_linkedin.png" alt="Linkedin" /></a></li>
|
||||
<li class="list_h"><a href="https://twitter.com/ProjectMUSE"><img src="/images/footer_icon_twitter.png" alt="Twitter" /></a></li>
|
||||
</ol>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="footer_item_policy cont_sub">
|
||||
<h5 class="small">Policy & Terms</h5>
|
||||
<ul>
|
||||
<li><a href="https://about.muse.jhu.edu/about/accessibility/">Accessibility</a></li>
|
||||
<li><a href="/privacy_policy">Privacy Policy</a></li>
|
||||
<li><a href="/terms_use">Terms of Use</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
<div class="group">
|
||||
<div class="footer_item_addr cont_sub">
|
||||
<p class="address"><span>2715 North Charles Street<br/>Baltimore, Maryland, USA 21218</span></p>
|
||||
<p class="phone"><span><a href="tel:1-410-516-6989">+1 (410) 516-6989</a></span><br>
|
||||
<span><a href="mailto:muse@press.jhu.edu">muse@press.jhu.edu</a></span></p>
|
||||
<p class="footer_text_sm copy color_oxfordblue hide-for-small"><span>©2020 Project MUSE. Produced by Johns Hopkins University Press in collaboration with The Sheridan Libraries.</span></p>
|
||||
</div>
|
||||
<div class="footer_item_logo cont_sub">
|
||||
<p class="show-for-medium"><span class="semiboldit footer_text_sm">Now and always,<br/>The Trusted Content Your Research Requires.</span></p>
|
||||
<p><span><a href="https://muse.jhu.edu">
|
||||
|
||||
<img class="show-for-medium" src="/images/muselogoblack.png" alt="Project MUSE logo" />
|
||||
|
||||
<img class="hide-for-medium" src="/images/muselogo.png" alt="Project MUSE logo" /></a></span></p>
|
||||
<p class="hide-for-medium"><span class="semiboldit footer_text_sm">Now and always, The Trusted Content Your Research Requires.</span></p>
|
||||
<p class="hide-for-small"><span class="footer_text_sm">Built on the Johns Hopkins University Campus</span></p>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="footer_item_sub wrap hide-for-medium">
|
||||
<p><span class="footer_text_sm">Built on the Johns Hopkins University Campus</span></p>
|
||||
<p class="footer_text_sm copy color_oxfordblue"><span>©2020 Project MUSE. Produced by Johns Hopkins University Press in collaboration with The Sheridan Libraries.</span></p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
<div id="btn_top">
|
||||
<a href="#top"><span>Back To Top</span></a>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<input type="hidden" name="cookie_acknowledgement_type" id="cookie_acknowledgement_type" value="cookie_acknowledgement">
|
||||
|
||||
|
||||
|
||||
<div id="cookies_msg">
|
||||
<p>This website uses cookies to ensure you get the best experience on our website. Without cookies your experience may not be seamless.</p>
|
||||
<script>document.writeln('<a href="javascript://" class="btn_accept" id="accept_cookie_msg">Accept</a>');</script>
|
||||
<noscript>
|
||||
|
||||
<form method="post" action="/account/set_attribute_no_ajax/cookie_acknowledgement/1">
|
||||
|
||||
<input type="submit" class="btn_accept" value="accept">
|
||||
</form>
|
||||
</noscript>
|
||||
</div>
|
||||
|
||||
|
||||
<script type="text/javascript" src="/js/lightbox.js"></script>
|
||||
<script type="text/javascript" src="/js/jquery.qtip2.min.js"></script>
|
||||
<script type="text/javascript" src="/js/post.js"></script>
|
||||
|
||||
<script type="text/javascript" src="/js/footnotes.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" src="/js/references.js"></script>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,85 @@
|
|||
require "../src/issue"
|
||||
require "./spec_helper"
|
||||
require "webmock"
|
||||
|
||||
describe Muse::Dl::Issue do
|
||||
WebMock.stub(:get, "https://muse.jhu.edu/issue/41793")
|
||||
.to_return(body: File.new("spec/fixtures/issue-41793.html").gets_to_end)
|
||||
|
||||
issue = Muse::Dl::Issue.new "41793"
|
||||
issue.parse
|
||||
|
||||
it "should initialize correctly" do
|
||||
issue.id.should eq "41793"
|
||||
issue.url.should eq "https://muse.jhu.edu/issue/41793"
|
||||
end
|
||||
|
||||
it "should parse info correctly" do
|
||||
issue.info["ISSN"].should eq "1530-7131"
|
||||
issue.info["Print ISSN"].should eq "1531-2542"
|
||||
issue.info["Launched on MUSE"].should eq "2020-02-05"
|
||||
issue.info["Open Access"].should eq "No"
|
||||
issue.title.should eq "Volume 20, Number 1, January 2020"
|
||||
end
|
||||
|
||||
it "should parse title correctly" do
|
||||
issue.volume.should eq "20"
|
||||
issue.number.should eq "1"
|
||||
issue.date.should eq "January 2020"
|
||||
end
|
||||
|
||||
it "should parser summary" do
|
||||
issue.summary.should eq <<-EOT
|
||||
Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association.
|
||||
EOT
|
||||
end
|
||||
|
||||
it "should parse publisher" do
|
||||
issue.publisher.should eq "Johns Hopkins University Press"
|
||||
end
|
||||
it "should parse the journal title" do
|
||||
issue.journal_title.should eq "portal: Libraries and the Academy"
|
||||
end
|
||||
|
||||
# Issues titled "Volume X, Issue Y, YYYY" (no "Number", no month) must still
# yield volume/number/date, the infobox, and the full article list.
it "should parse non-numbered issues" do
  WebMock.stub(:get, "https://muse.jhu.edu/issue/35852")
    .to_return(body: File.new("spec/fixtures/issue-35852.html").gets_to_end)

  # Use a dedicated local: assigning to `issue` here would overwrite the
  # fixture captured from the enclosing `describe`, which other examples read.
  open_issue = Muse::Dl::Issue.new "35852"
  open_issue.parse

  open_issue.volume.should eq "1"
  open_issue.number.should eq "2"
  open_issue.date.should eq "2016"

  open_issue.info["ISSN"].should eq "2474-9419"
  open_issue.info["Print ISSN"].should eq "2474-9427"
  open_issue.info["Launched on MUSE"].should eq "2017-02-21"
  open_issue.info["Open Access"].should eq "Yes"
  open_issue.title.should eq "Volume 1, Issue 2, 2016"
  open_issue.journal_title.should eq "Constitutional Studies"

  expected_pages = [
    [1, 22],
    [23, 40],
    [41, 58],
    [59, 80],
    [81, 95],
    [97, 116],
  ]

  expected_titles = [
    "The Limits of Veneration: Public Support for a New Constitutional Convention",
    "Secession and Nullification as a Global Trend",
    "Challenging Constitutionalism in Post-Apartheid South Africa",
    "Democracy by Lawsuit: Or, Can Litigation Alleviate the European Union’s “Democratic Deficit?”",
    "Private Enforcement of Constitutional Guarantees in the Ku Klux Act of 1871",
    "Sober Second Thoughts: Evaluating the History of Horizontal Judicial Review by the U.S. Supreme Court",
  ]

  # Without this check the loop below passes vacuously when nothing is parsed.
  open_issue.articles.size.should eq expected_titles.size

  open_issue.articles.each_with_index do |a, i|
    a.start_page.should eq expected_pages[i][0]
    a.end_page.should eq expected_pages[i][1]
    a.title.should eq expected_titles[i]
  end
end
|
||||
end
|
|
@ -0,0 +1,28 @@
|
|||
require "./spec_helper"
|
||||
|
||||
describe Muse::Dl::Journal do
|
||||
html = File.new("spec/fixtures/journal-159.html").gets_to_end
|
||||
j = Muse::Dl::Journal.new html
|
||||
|
||||
# Checks the key/value pairs extracted from the journal page's info box.
# The description no longer starts with "it", which rendered as "it it should ...".
it "should parse the infobox for 159" do
  j.info["ISSN"].should eq "1530-7131"
  j.info["Print ISSN"].should eq "1531-2542"
  j.info["Coverage Statement"].should eq "Vol. 1 (2001) through current issue"
  j.info["Open Access"].should eq "No"
end
|
||||
|
||||
it "should parser summary" do
|
||||
j.summary.should eq <<-EOT
|
||||
Focusing on important research about the role of academic libraries and librarianship, portal also features commentary on issues in technology and publishing. Written for all those interested in the role of libraries within the academy, portal includes peer-reviewed articles addressing subjects such as library administration, information technology, and information policy. In its inaugural year, portal earned recognition as the runner-up for best new journal, awarded by the Council of Editors of Learned Journals (CELJ). An article in portal, "Master's and Doctoral Thesis Citations: Analysis and Trends of a Longitudinal Study," won the Jesse H. Shera Award for Distinguished Published Research from the Library Research Round Table of the American Library Association.
|
||||
EOT
|
||||
end
|
||||
|
||||
it "should parse publisher" do
|
||||
j.publisher.should eq "Johns Hopkins University Press"
|
||||
end
|
||||
|
||||
it "should return issues" do
|
||||
j.issues[0].id.should eq "41793"
|
||||
j.issues[-1].id.should eq "1578"
|
||||
end
|
||||
end
|
|
@ -0,0 +1,9 @@
|
|||
require "../src/util"
|
||||
require "./spec_helper"
|
||||
|
||||
describe Muse::Dl::Util do
|
||||
it "should sanitize filenames properly" do
|
||||
fn = Muse::Dl::Util.slug_filename("Hello world - \" :A$3, a story; a poem|chapter")
|
||||
fn.should eq "Hello world - - -A-3, a story- a poem-chapter"
|
||||
end
|
||||
end
|
|
@ -0,0 +1,19 @@
|
|||
require "./infoparser.cr"
|
||||
require "./issue.cr"
|
||||
|
||||
module Muse::Dl
|
||||
# A single journal article, identified by its numeric id.
#
# `title`, `start_page` and `end_page` start out as nil and are filled in by
# whoever parses the surrounding listing.
class Article
  getter id : String
  property title : String?
  property start_page : Int32?
  property end_page : Int32?

  # Stores *id* and derives the article page URL from it.
  def initialize(@id : String)
    @url = "https://muse.jhu.edu/article/#{@id}"
  end

  # TODO: Fix this
  # Currently always reports the article as not open access.
  def open_access
    false
  end
end
|
||||
end
|
|
@ -1,4 +0,0 @@
|
|||
module Muse::Dl::Errors
|
||||
class MissingChapter < Exception
|
||||
end
|
||||
end
|
|
@ -0,0 +1,4 @@
|
|||
module Muse::Dl::Errors
|
||||
class MissingFile < Exception
|
||||
end
|
||||
end
|
62
src/fetch.cr
62
src/fetch.cr
|
@ -14,6 +14,10 @@ module Muse::Dl
|
|||
"Connection" => "keep-alive",
|
||||
}
|
||||
|
||||
# Returns the path "<tmp_path>/article-<id>.pdf" used for a downloaded article.
def self.article_file_name(id : String, tmp_path : String)
  String.build do |path|
    path << tmp_path << "/article-" << id << ".pdf"
  end
end
|
||||
|
||||
# Returns the path "<tmp_path>/chapter-<id>.pdf" used for a downloaded chapter.
def self.chapter_file_name(id : String, tmp_path : String)
  String.build do |path|
    path << tmp_path << "/chapter-" << id << ".pdf"
  end
end
|
||||
|
@ -23,24 +27,25 @@ module Muse::Dl
|
|||
File.delete(fns) if File.exists?(fns)
|
||||
end
|
||||
|
||||
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true)
|
||||
final_pdf_file = chapter_file_name chapter_id, tmp_path
|
||||
tmp_pdf_file = "#{final_pdf_file}.tmp"
|
||||
# Removes the downloaded PDF for article *id* from *tmp_path*; a missing file
# is not an error.
def self.cleanup_articles(tmp_path : String, id : String)
  article_pdf = article_file_name(id, tmp_path)
  return unless File.exists?(article_pdf)

  File.delete(article_pdf)
end
|
||||
|
||||
if File.exists? final_pdf_file
|
||||
puts "#{chapter_id} already downloaded"
|
||||
def self.save_url(url : String, referer : String, file_name : String, tmp_path : String, cookie : String | Nil = nil, bookmark_title : String | Nil = nil, strip_first_page = true)
|
||||
tmp_pdf_file = "#{file_name}.tmp"
|
||||
if File.exists? file_name
|
||||
puts "#{file_name} already downloaded"
|
||||
return
|
||||
end
|
||||
|
||||
# TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
|
||||
url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
|
||||
uri = URI.parse(url)
|
||||
http_client = HTTP::Client.new(uri)
|
||||
# Raise a IO::TimeoutError after 60 seconds.
|
||||
http_client.read_timeout = DOWNLOAD_TIMEOUT_SECS
|
||||
|
||||
headers = HEADERS.merge({
|
||||
"Referer" => "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf",
|
||||
"Referer" => referer,
|
||||
})
|
||||
|
||||
if cookie
|
||||
|
@ -52,7 +57,7 @@ module Muse::Dl
|
|||
begin
|
||||
response = request.execute
|
||||
rescue ex : IO::TimeoutError
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error downloading chapter. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
|
||||
raise Muse::Dl::Errors::DownloadError.new("Error downloading #{url}. Download took longer than #{DOWNLOAD_TIMEOUT_SECS} seconds.")
|
||||
end
|
||||
|
||||
# TODO: Add validation for the downloaded file (should be PDF)
|
||||
|
@ -63,7 +68,6 @@ module Muse::Dl
|
|||
content_type = response.headers["Content-Type"]
|
||||
if content_type.is_a? String
|
||||
if /html/.match content_type
|
||||
puts response
|
||||
response.body.each_line do |line|
|
||||
# https://muse.jhu.edu/chapter/2383438/pdf
|
||||
# https://muse.jhu.edu/book/67393
|
||||
|
@ -77,6 +81,7 @@ module Muse::Dl
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
File.open(tmp_pdf_file, "w") do |file|
|
||||
file << response.body
|
||||
if file.size == 0
|
||||
|
@ -88,18 +93,41 @@ module Muse::Dl
|
|||
|
||||
pdftk.strip_first_page tmp_pdf_file if strip_first_page
|
||||
|
||||
if add_bookmark
|
||||
if bookmark_title
|
||||
# Run pdftk and add the bookmark to the file
|
||||
pdftk.add_bookmark tmp_pdf_file, chapter_title.strip
|
||||
pdftk.add_bookmark tmp_pdf_file, bookmark_title
|
||||
end
|
||||
|
||||
# Now we can move the file to the proper PDF filename
|
||||
File.rename tmp_pdf_file, final_pdf_file
|
||||
File.rename tmp_pdf_file, file_name
|
||||
end
|
||||
|
||||
# Downloads one book chapter as a PDF into *tmp_path*.
#
# If the chapter's PDF already exists, prints a notice and returns without
# downloading. Otherwise the chapter URL and its "verify" referer are built
# from *chapter_id* and the download is delegated to `save_url`.
#
# * *cookie* and *strip_first_page* are forwarded to `save_url` unchanged.
# * *chapter_title* becomes the PDF bookmark, but only when *add_bookmark*
#   is true.
def self.save_chapter(tmp_path : String, chapter_id : String, chapter_title : String, cookie : String | Nil = nil, add_bookmark = true, strip_first_page = true)
  final_pdf_file = chapter_file_name chapter_id, tmp_path

  if File.exists? final_pdf_file
    puts "#{chapter_id} already downloaded"
    return
  end

  # TODO: Remove this hardcoding, and make this more generic by generating it within the Book class
  url = "https://muse.jhu.edu/chapter/#{chapter_id}/pdf"
  referer = "https://muse.jhu.edu/verify?url=%2Fchapter%2F#{chapter_id}%2Fpdf"

  # save_url adds a bookmark whenever it is handed a title, so honour
  # add_bookmark by passing nil when the caller opted out. The title is
  # stripped here, as it was before the download logic moved into save_url.
  bookmark_title = add_bookmark ? chapter_title.strip : nil

  save_url(url, referer, final_pdf_file, tmp_path, cookie, bookmark_title, strip_first_page)

  puts "Downloaded #{chapter_id}"
end
|
||||
|
||||
def self.get_info(url : String) : Muse::Dl::Thing | Nil
|
||||
match = /https:\/\/muse.jhu.edu\/(book|journal)\/(\d+)/.match url
|
||||
# Downloads the PDF for article *article_id* into *tmp_path* by delegating to
# `save_url`, using the article page as referer. *article_title*, when given,
# is passed on as the bookmark title.
def self.save_article(tmp_path : String, article_id : String, cookie : String | Nil = nil, article_title = nil, strip_first_page = true)
  save_url(
    "https://muse.jhu.edu/article/#{article_id}/pdf",
    "https://muse.jhu.edu/article/#{article_id}",
    article_file_name(article_id, tmp_path),
    tmp_path,
    cookie,
    article_title,
    strip_first_page
  )
end
|
||||
|
||||
def self.get_info(url : String)
|
||||
match = /https:\/\/muse.jhu.edu\/(book|journal|issue|article)\/(\d+)/.match url
|
||||
if match
|
||||
begin
|
||||
response = Crest.get(url).to_s
|
||||
|
@ -108,6 +136,10 @@ module Muse::Dl
|
|||
return Muse::Dl::Book.new response
|
||||
when "journal"
|
||||
return Muse::Dl::Journal.new response
|
||||
when "issue"
|
||||
return Muse::Dl::Issue.new match[2], response
|
||||
when "article"
|
||||
return Muse::Dl::Article.new match[2]
|
||||
end
|
||||
rescue ex : Crest::NotFound
|
||||
raise Muse::Dl::Errors::InvalidLink.new("Error - could not download url: #{url}")
|
||||
|
|
|
@ -34,6 +34,18 @@ module Muse::Dl
|
|||
myhtml.css("#book_about_info .title").map(&.inner_text).to_a[0].strip
|
||||
end
|
||||
|
||||
# Returns the stripped text of the first `.card_text .title` element, or nil
# if anything goes wrong while looking it up (e.g. no such element).
def self.issue_title(myhtml : Myhtml::Parser)
  title_texts = myhtml.css(".card_text .title").map(&.inner_text).to_a
  title_texts[0].strip
rescue
  nil
end
|
||||
|
||||
# Returns the stripped text of the first `#journal_about_info .title` element.
# Raises if the page has no such element.
def self.journal_title(myhtml : Myhtml::Parser)
  title_texts = myhtml.css("#journal_about_info .title").map(&.inner_text).to_a
  title_texts[0].strip
end
|
||||
|
||||
def self.author(myhtml : Myhtml::Parser)
|
||||
myhtml.css("#book_about_info .author").map(&.inner_text).to_a[0].strip.gsub("<BR>", ", ").gsub("\n", " ")
|
||||
end
|
||||
|
@ -50,9 +62,13 @@ module Muse::Dl
|
|||
myhtml.css("#book_about_info .pub a").map(&.inner_text).to_a[0].strip
|
||||
end
|
||||
|
||||
# Returns the stripped text of the first link inside `.card_publisher`.
# Raises if the page has no such link.
def self.journal_publisher(myhtml : Myhtml::Parser)
  publisher_names = myhtml.css(".card_publisher a").map(&.inner_text).to_a
  publisher_names[0].strip
end
|
||||
|
||||
def self.summary(myhtml : Myhtml::Parser)
|
||||
begin
|
||||
return myhtml.css("#book_about_info .card_summary").map(&.inner_text).to_a[0].strip
|
||||
return myhtml.css(".card_summary").map(&.inner_text).to_a[0].strip
|
||||
rescue e : Exception
|
||||
STDERR.puts "Could not fetch summary"
|
||||
return "NA"
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
require "./thing.cr"
|
||||
require "./fetch.cr"
|
||||
require "./article.cr"
|
||||
|
||||
module Muse::Dl
|
||||
# One issue of a journal.
#
# An Issue can be built from just its id (and parsed later with `#parse`),
# or from an id plus the already-downloaded issue page HTML, in which case it
# is parsed immediately.
class Issue
  getter id : String,
    title : String | Nil,
    articles : Array(Muse::Dl::Article),
    url : String,
    summary : String | Nil,
    publisher : String | Nil,
    info : Hash(String, String),
    volume : String | Nil,
    number : String | Nil,
    date : String | Nil,
    journal_title : String | Nil

  setter :journal_title

  # *id* is the numeric issue id; *response*, when given, is the HTML of the
  # issue page and is parsed right away.
  def initialize(id : String, response : String | Nil = nil)
    @id = id
    @url = "https://muse.jhu.edu/issue/#{id}"
    @articles = [] of Muse::Dl::Article
    # Initialise the info hash BEFORE parsing: parse() fills @info, so
    # assigning the empty hash afterwards would throw the parsed data away.
    @info = Hash(String, String).new
    parse(response) if response
  end

  # True only when the parsed info box has "Open Access" set to "Yes".
  def open_access
    if @info.has_key? "Open Access"
      return @info["Open Access"] == "Yes"
    end
    false
  end

  # Downloads the issue page from `url` and parses it.
  def parse
    html = Crest.get(@url).to_s
    parse(html)
  end

  # Parses the given issue page HTML, filling in info, title, summary,
  # publisher, the title-derived fields and the article list.
  def parse(html : String)
    h = Myhtml::Parser.new html
    @info = InfoParser.infobox(h)
    @title = InfoParser.issue_title(h)
    @summary = InfoParser.summary(h)
    @publisher = InfoParser.journal_publisher(h)
    parse_title
    parse_contents(h)
  end

  # Derives volume, number and date from the issue title, e.g.
  # "Volume 20, Number 1, January 2020" or "Volume 1, Issue 2, 2016".
  def parse_title
    t = @title
    unless t.nil?
      @volume = /Volume (\d+)/.match(t).try &.[1]
      @number = /Number (\d+)/.match(t).try &.[1]
      # Some journals label the number as "Issue N" instead of "Number N".
      @number = /Issue (\d+)/.match(t).try &.[1] unless @number
      # Month or season followed by digits ("Spring" was misspelled "Sring").
      @date = /((January|February|March|April|May|June|July|August|September|October|November|December|Spring|Winter|Fall|Summer) (\d+))/.match(t).try &.[1]
      # Fall back to a bare four-digit year.
      @date = /(\d{4})/.match(t).try &.[1] unless @date
    end
  end

  # Reads the journal title (unless one was already set) and appends one
  # Article per "/article/<id>/pdf" link found in the issue's article list.
  def parse_contents(myhtml : Myhtml::Parser)
    unless @journal_title
      journal_title_a = myhtml.css("#journal_banner_title a").first
      if journal_title_a
        @journal_title = journal_title_a.inner_text
      end
    end
    myhtml.css(".articles_list_text ol").each do |ol|
      link = ol.css("li.title a").first
      title = link.inner_text

      # Page range looks like "23-40"; entries without one keep nil pages.
      pages = ol.css("li.pg")
      if pages.size > 0
        p = pages.first.try &.inner_text
        matches = /(\d+)-(\d+)/.match p
        if matches
          start_page = matches[1].to_i
          end_page = matches[2].to_i
        end
      end

      # Only links to the article PDF carry the article id we need.
      ol.css("a").each do |l|
        url = l.attribute_by("href").to_s
        matches = /\/article\/(\d+)\/pdf/.match url
        if matches
          a = Muse::Dl::Article.new matches[1]
          a.title = title
          a.start_page = start_page if start_page
          a.end_page = end_page if end_page
          @articles.push a
        end
      end
    end
  end
end
|
||||
end
|
|
@ -1,6 +1,44 @@
|
|||
require "./thing.cr"
|
||||
require "./infoparser.cr"
|
||||
require "./issue.cr"
|
||||
|
||||
module Muse::Dl
|
||||
class Journal < Muse::Dl::Thing
|
||||
# A journal landing page on Project MUSE.
#
# Built from the page's HTML: the constructor extracts the info box,
# summary, publisher and title through `InfoParser`, then collects an
# (unparsed) `Issue` for every "/issue/<id>" link in the list of
# available issues.
class Journal
  getter :info, :summary, :publisher, :issues, :title
  @info = Hash(String, String).new
  @summary : String
  @publisher : String
  @issues = [] of Muse::Dl::Issue
  @title : String

  private getter :h

  # Parses *html* and populates every field of the journal.
  def initialize(html)
    @h = Myhtml::Parser.new(html)
    @info = InfoParser.infobox(h)
    @summary = InfoParser.summary(h)
    @publisher = InfoParser.journal_publisher(h)
    @title = InfoParser.journal_title(h)
    parse_volumes(h)
  end

  # True only when the info box has an "Open Access" entry equal to "Yes".
  def open_access
    @info["Open Access"]? == "Yes"
  end

  # Appends one `Issue` (tagged with this journal's title) to `issues` for
  # each link under "#available_issues_list_text" whose href contains
  # "/issue/<digits>". Other links are ignored.
  def parse_volumes(myhtml : Myhtml::Parser)
    myhtml.css("#available_issues_list_text a").each do |anchor|
      href = anchor.attribute_by("href").to_s
      found = /\/issue\/(\d+)/.match(href)
      next unless found

      issue = Muse::Dl::Issue.new(found[1])
      issue.journal_title = @title
      @issues.push(issue)
    end
  end
end
|
||||
end
|
||||
|
|
|
@ -4,6 +4,7 @@ require "./fetch.cr"
|
|||
require "./book.cr"
|
||||
require "./journal.cr"
|
||||
require "./util.cr"
|
||||
require "file_utils"
|
||||
|
||||
module Muse::Dl
|
||||
VERSION = "1.1.2"
|
||||
|
@ -11,9 +12,15 @@ module Muse::Dl
|
|||
class Main
|
||||
def self.dl(parser : Parser)
|
||||
url = parser.url
|
||||
puts "Downloading #{url}"
|
||||
thing = Fetch.get_info(url) if url
|
||||
return unless thing
|
||||
|
||||
if (thing.open_access) && (parser.skip_oa)
|
||||
STDERR.puts "Skipping #{url}, available under Open Access"
|
||||
return
|
||||
end
|
||||
|
||||
if thing.is_a? Muse::Dl::Book
|
||||
unless thing.formats.includes? :pdf
|
||||
STDERR.puts "Book not available in PDF format, skipping: #{url}"
|
||||
|
@ -30,7 +37,6 @@ module Muse::Dl
|
|||
temp_stitched_file = nil
|
||||
pdf_builder = Pdftk.new(parser.tmp)
|
||||
|
||||
unless parser.input_pdf
|
||||
# Save each chapter
|
||||
thing.chapters.each do |chapter|
|
||||
begin
|
||||
|
@ -45,13 +51,9 @@ module Muse::Dl
|
|||
# Stitch the PDFs together
|
||||
temp_stitched_file = pdf_builder.stitch chapter_ids
|
||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||
else
|
||||
x = parser.input_pdf
|
||||
pdf_builder.add_metadata(File.open(x), parser.output, thing) if x
|
||||
end
|
||||
|
||||
temp_stitched_file.delete if temp_stitched_file
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors."
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||
|
||||
# Cleanup the chapter files
|
||||
|
@ -60,6 +62,69 @@ module Muse::Dl
|
|||
Fetch.cleanup(parser.tmp, c[0])
|
||||
end
|
||||
end
|
||||
elsif thing.is_a? Muse::Dl::Article
|
||||
# No bookmarks are needed since this is just a single article PDF
|
||||
begin
|
||||
Fetch.save_article(parser.tmp, thing.id, parser.cookie, nil, parser.strip_first)
|
||||
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
||||
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
||||
return
|
||||
end
|
||||
|
||||
# TODO: Move this code elsewhere
|
||||
source = Fetch.article_file_name(thing.id, parser.tmp)
|
||||
destination = "article-#{thing.id}.pdf"
|
||||
# Needed because of https://github.com/crystal-lang/crystal/issues/7777
|
||||
FileUtils.cp source, destination
|
||||
FileUtils.rm source if parser.cleanup
|
||||
elsif thing.is_a? Muse::Dl::Issue
|
||||
# NOTE: force_set_output replaces the output filename unconditionally, even when a custom --output was given
|
||||
parser.force_set_output Util.slug_filename "#{thing.journal_title} - #{thing.title}.pdf"
|
||||
|
||||
# If file exists and we can't clobber
|
||||
if File.exists?(parser.output) && parser.clobber == false
|
||||
STDERR.puts "Skipping #{url}, File already exists: #{parser.output}"
|
||||
return
|
||||
end
|
||||
temp_stitched_file = nil
|
||||
pdf_builder = Pdftk.new(parser.tmp)
|
||||
|
||||
thing.articles.each do |article|
|
||||
begin
|
||||
Fetch.save_article(parser.tmp, article.id, parser.cookie, article.title, parser.strip_first)
|
||||
rescue e : Muse::Dl::Errors::MuseCorruptPDF
|
||||
STDERR.puts "Got a 'Unable to construct chapter PDF' error from MUSE, skipping: #{url}"
|
||||
return
|
||||
end
|
||||
end
|
||||
article_ids = thing.articles.map { |a| a.id }
|
||||
|
||||
# Stitch the PDFs together
|
||||
temp_stitched_file = pdf_builder.stitch_articles article_ids
|
||||
pdf_builder.add_metadata(temp_stitched_file, parser.output, thing)
|
||||
|
||||
# temp_stitched_file.delete if temp_stitched_file
|
||||
puts "--dont-strip-first-page was on. Please validate PDF file for any errors." unless parser.strip_first
|
||||
puts "DL: #{url}. Saved final output to #{parser.output}"
|
||||
|
||||
# Cleanup the issue files
|
||||
if parser.cleanup
|
||||
thing.articles.each do |a|
|
||||
Fetch.cleanup_articles(parser.tmp, a.id)
|
||||
end
|
||||
end
|
||||
elsif thing.is_a? Muse::Dl::Journal
|
||||
thing.issues.each do |issue|
|
||||
begin
|
||||
# Update the issue
|
||||
issue.parse
|
||||
parser.url = issue.url
|
||||
Main.dl parser
|
||||
rescue e
|
||||
puts e.message
|
||||
puts "Faced an exception with previous issue, continuing"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -10,15 +10,15 @@ module Muse::Dl
|
|||
@strip_first = true
|
||||
@output = DEFAULT_FILE_NAME
|
||||
@url : String | Nil
|
||||
@input_pdf : String | Nil
|
||||
@clobber = false
|
||||
@input_list : String | Nil
|
||||
@cookie : String | Nil
|
||||
@h : Bool | Nil
|
||||
@skip_oa = false
|
||||
|
||||
DEFAULT_FILE_NAME = "tempfilename.pdf"
|
||||
|
||||
getter :bookmarks, :tmp, :cleanup, :output, :url, :input_pdf, :clobber, :input_list, :cookie, :strip_first
|
||||
getter :bookmarks, :tmp, :cleanup, :output, :url, :clobber, :input_list, :cookie, :strip_first, :skip_oa
|
||||
setter :url
|
||||
|
||||
# Update the output filename unless we have a custom one passed
|
||||
|
@ -26,6 +26,10 @@ module Muse::Dl
|
|||
@output = output_file unless @output != DEFAULT_FILE_NAME
|
||||
end
|
||||
|
||||
# Replaces the output filename with *output_file* unconditionally,
# regardless of what the current output filename is.
def force_set_output(output_file : String)
  @output = output_file
end
|
||||
|
||||
# Restores the output filename to the DEFAULT_FILE_NAME placeholder.
def reset_output_file
  @output = DEFAULT_FILE_NAME
end
|
||||
|
@ -41,7 +45,6 @@ module Muse::Dl
|
|||
|
||||
def initialize(arg : Array(String) = [] of String)
|
||||
@tmp = Dir.tempdir
|
||||
@input_pdf = nil
|
||||
|
||||
parser = OptionParser.new
|
||||
parser.banner = <<-EOT
|
||||
|
@ -56,10 +59,10 @@ module Muse::Dl
|
|||
parser.on(long_flag = "--tmp-dir PATH", description = "Temporary Directory to use") { |path| @tmp = path }
|
||||
parser.on(long_flag = "--output FILE", description = "Output Filename") { |file| @output = file }
|
||||
parser.on(long_flag = "--no-bookmarks", description = "Don't add bookmarks in the PDF") { @bookmarks = false }
|
||||
parser.on(long_flag = "--input-pdf INPUT", description = "Input Stitched PDF. Will not download anything") { |input| @input_pdf = input }
|
||||
parser.on(long_flag = "--clobber", description = "Overwrite the output file, if it already exists. Not compatible with input-pdf") { @clobber = true }
|
||||
parser.on(long_flag = "--dont-strip-first-page", description = "Disables first page from being stripped. Use carefully") { @strip_first = false }
|
||||
parser.on(long_flag = "--cookie COOKIE", description = "Cookie-header") { |cookie| @cookie = cookie }
|
||||
parser.on(long_flag = "--skip-open-access", description = "Don't download open access content") { @skip_oa = true }
|
||||
parser.on("-h", "--help", "Show this help") { @h = true; puts parser }
|
||||
|
||||
parser.unknown_args do |args|
|
||||
|
@ -70,7 +73,6 @@ module Muse::Dl
|
|||
end
|
||||
if File.exists? args[0]
|
||||
@input_list = args[0]
|
||||
@input_pdf = nil
|
||||
else
|
||||
@url = args[0]
|
||||
end
|
||||
|
|
90
src/pdftk.cr
90
src/pdftk.cr
|
@ -70,7 +70,6 @@ module Muse::Dl
|
|||
|
||||
def add_metadata(input_file : File, output_file : String, book : Book)
|
||||
# First we have to dump the current metadata
|
||||
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||
keywords = "Publisher:#{book.publisher}, Published:#{book.date}"
|
||||
|
||||
# Known Info keys, if they are present
|
||||
|
@ -80,34 +79,51 @@ module Muse::Dl
|
|||
end
|
||||
end
|
||||
|
||||
text = <<-EOT
|
||||
metadata_text = gen_metadata(book.title, keywords, book.summary.gsub(/\n\s+/, " "), book.author)
|
||||
write_metadata(input_file, output_file, metadata_text)
|
||||
end
|
||||
|
||||
def gen_metadata(title : String, keywords : String, subject : String, author : String | Nil = nil)
|
||||
metadata = <<-EOT
|
||||
InfoBegin
|
||||
InfoKey: Creator
|
||||
InfoValue: Project MUSE (https://muse.jhu.edu/)
|
||||
InfoValue:
|
||||
InfoBegin
|
||||
InfoKey: Producer
|
||||
InfoValue: Muse-DL/#{Muse::Dl::VERSION}
|
||||
InfoValue:
|
||||
InfoBegin
|
||||
InfoKey: Title
|
||||
InfoValue: #{book.title}
|
||||
InfoValue: #{title}
|
||||
InfoBegin
|
||||
InfoKey: Keywords
|
||||
InfoValue: #{keywords}
|
||||
InfoBegin
|
||||
InfoKey: Author
|
||||
InfoValue: #{book.author}
|
||||
InfoBegin
|
||||
InfoKey: Subject
|
||||
InfoValue: #{book.summary.gsub(/\n\s+/, " ")}
|
||||
InfoValue: #{subject}
|
||||
InfoBegin
|
||||
InfoKey: ModDate
|
||||
InfoValue:
|
||||
InfoBegin
|
||||
InfoKey: CreationDate
|
||||
InfoValue:
|
||||
|
||||
EOT
|
||||
|
||||
unless author.nil?
|
||||
metadata += <<-EOT
|
||||
InfoBegin
|
||||
InfoKey: Author
|
||||
InfoValue: #{author}
|
||||
EOT
|
||||
end
|
||||
|
||||
return metadata
|
||||
end
|
||||
|
||||
def write_metadata(input_file : File, output_file : String, text)
|
||||
metadata_text_file = File.tempfile("muse-dl-metadata-tmp", ".txt")
|
||||
File.write(metadata_text_file.path, text)
|
||||
|
||||
is_success = execute [input_file.path, "update_info_utf8", metadata_text_file.path, "output", output_file]
|
||||
if !is_success
|
||||
raise Muse::Dl::Errors::PDFOperationError.new("Error adding metadata to book.")
|
||||
|
@ -115,11 +131,42 @@ module Muse::Dl
|
|||
metadata_text_file.delete
|
||||
end
|
||||
|
||||
# Writes PDF metadata describing *issue* onto *input_file*, producing
# *output_file*.
#
# Keywords hold the journal title, date, volume and number, followed by
# whichever of the known info-box labels the issue provides. A missing
# title or summary is written as "NA". No author is set.
#
# The metadata text file is created (and handled) by `write_metadata`, so
# no temporary file is created here; the previous unused one was never
# deleted.
def add_metadata(input_file : File, output_file : String, issue : Issue)
  keywords = "Journal:#{issue.journal_title}, Published:#{issue.date},Volume:#{issue.volume},Number:#{issue.number}"
  ["ISSN", "Print ISSN", "DOI", "Language", "Open Access"].each do |label|
    keywords += ", #{label}:#{issue.info[label]}" if issue.info.has_key? label
  end

  # TODO: Move this to Issue class
  # Collapse a newline plus the indentation that follows it into one space.
  summary = issue.summary.try(&.gsub(/\n\s+/, " ")) || "NA"
  title = issue.title || "NA"

  # TODO: Add support for all authors in the PDF
  metadata = gen_metadata(title, keywords, summary)
  write_metadata(input_file, output_file, metadata)
end
|
||||
|
||||
def stitch(chapter_ids : Array(String))
|
||||
output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")
|
||||
# Do some sanity checks on each Chapter PDF
|
||||
chapter_ids.each do |id|
|
||||
raise Muse::Dl::Errors::MissingChapter.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||
raise Muse::Dl::Errors::MissingFile.new unless File.exists? Fetch.chapter_file_name(id, @tmp_file_path)
|
||||
raise Muse::Dl::Errors::CorruptFile.new unless File.size(Fetch.chapter_file_name(id, @tmp_file_path)) > 0
|
||||
end
|
||||
|
||||
|
@ -136,5 +183,28 @@ module Muse::Dl
|
|||
|
||||
return output_file
|
||||
end
|
||||
|
||||
# TODO: Merge with stitch
|
||||
# Concatenates the downloaded article PDFs for *article_ids*, in the given
# order, into a new temporary PDF and returns that file.
#
# Raises `Errors::MissingFile` if an article PDF does not exist,
# `Errors::CorruptFile` if one is empty, and `Errors::PDFOperationError`
# if the stitching command fails.
#
# TODO: Merge with stitch
def stitch_articles(article_ids : Array(String))
  # Resolve each path once; the checks and the stitch command below all use
  # the same list (assumes Fetch.article_file_name returns the same path
  # for the same arguments, as the original repeated calls already did).
  article_files = article_ids.map { |id| Fetch.article_file_name(id, @tmp_file_path) }

  # Do some sanity checks on each article PDF
  article_files.each do |path|
    raise Muse::Dl::Errors::MissingFile.new unless File.exists? path
    raise Muse::Dl::Errors::CorruptFile.new unless File.size(path) > 0
  end

  # Create the output file only after validation, so a failed check does
  # not leave an orphaned temp file behind.
  output_file = File.tempfile("muse-dl-stitched-tmp", ".pdf")

  # Now let's stitch them together
  args = article_files + ["cat", "output", output_file.path]
  is_success = execute args

  # TODO: Validate final file here
  if !is_success
    puts args
    # The caller never receives the file on failure, so remove it here.
    output_file.delete
    raise Muse::Dl::Errors::PDFOperationError.new("Error stitching articles together.")
  end

  return output_file
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -19,6 +19,13 @@ module Muse::Dl
|
|||
|
||||
private getter :h
|
||||
|
||||
# True only when the info box has an "Open Access" entry equal to "Yes";
# false when the entry is missing or holds any other value.
def open_access
  @info.has_key?("Open Access") && @info["Open Access"] == "Yes"
end
|
||||
|
||||
def initialize(html : String)
|
||||
@h = Myhtml::Parser.new html
|
||||
@info = InfoParser.infobox(h)
|
||||
|
|
|
@ -2,7 +2,7 @@ module Muse::Dl
|
|||
class Util
  # Generates a safe filename
  #
  # Strips surrounding whitespace from *input* and replaces each
  # right-to-left override (U+202E), `%`, `$`, `|`, `:`, `;`, `/`, `"`,
  # tab, carriage return, newline and backslash with "-".
  def self.slug_filename(input : String)
    # Only this expression is kept: the older variant without the double
    # quote was evaluated first and its result discarded.
    input.strip.tr("\u{202E}%$|:;/\"\t\r\n\\", "-")
  end
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue