Switches to the Web Archive, since the book is no longer accessible (#2)

* The author has removed the book from her website.
Gone, but not forgotten... by archive.org :)

* Specify UTF-8 with pup to work around it replacing ' with ’

* Oops, pup is called in two places. Let's fix both calls.
This commit is contained in:
John Wood 2020-07-29 01:00:24 +09:00 committed by GitHub
parent 85177f41a0
commit a4b44684e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 4 deletions

View File

@@ -13,7 +13,7 @@ LC=${LC:-""}
# Prefix a non-empty language code with "/" so it can be spliced into the URL path below.
if [[ "$LC" != "" ]]; then
LC="/$LC"
fi
# NOTE(review): the two assignments below look like the before/after pair of a diff
# (original site vs. a web.archive.org snapshot of the same page). Run as written,
# the second assignment overrides the first.
MAIN_STORY_URL="https://www.theickabog.com$LC/read-the-story/"
MAIN_STORY_URL="https://web.archive.org/web/20200713135719/https://www.theickabog.com$LC/read-the-story/"
echo "[+] Fetching $MAIN_STORY_URL"
@@ -31,14 +31,14 @@ echo "<html lang=$LANG><head><meta charset=UTF-8><title>$MAIN_TITLE</title></hea
# Append one chapter to "$HTML_FILE", fetching its page into html/ first when no
# non-empty cached copy exists.
# args = "$url" "$chapter" "$title"
#   $1 url     - chapter link; used unchanged when it starts with "http",
#                otherwise prefixed with a site origin
#   $2 chapter - chapter label; names the cache file html/$2.html
#   $3 title   - chapter title; written to the output as an <h1> heading
function download_chapter() {
# When the chapter label ends in "1", the chapter's title also becomes MAIN_TITLE.
[[ $2 =~ 1$ ]] && MAIN_TITLE=$3
# NOTE(review): the two URL assignments look like a diff's before/after pair
# (original site origin vs. web.archive.org). Run as written, the second one wins.
URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://www.theickabog.com$1" )
URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://web.archive.org$1" )
# -s is true only for an existing, non-empty file, so wget runs just when the cache is missing or empty.
[ -s "html/$2.html" ] || wget --quiet "$URL" -O "html/$2.html"
echo "<h1>$3</h1>" >> "$HTML_FILE"
# NOTE(review): likewise a before/after pair; the newer form adds -p and
# --charset UTF-8 (the commit message says UTF-8 is specified to stop pup replacing ' with ’).
# Run as written, the selected content would be appended to "$HTML_FILE" twice.
cat "html/$2.html" | pup 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
cat "html/$2.html" | pup -p --charset UTF-8 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
}
# Read "$MAIN_STORY_OUTPUT_FILE" (presumably the fetched main story page; confirm
# against the part of the script not shown here), select the links inside
# ul.chapters__list as JSON, and have jq emit one tab-separated row of
# chapter, title and url per link, ordered with sort_by on the match of the
# chapter label's trailing digits. Each row is passed to download_chapter.
# NOTE(review): the two pup stages look like a diff's before/after pair (the newer
# form adds -p and --charset UTF-8); run as written, the second pup would receive
# the first one's JSON output instead of HTML.
cat "$MAIN_STORY_OUTPUT_FILE" |
pup 'ul.chapters__list a json{}' |
pup -p --charset UTF-8 'ul.chapters__list a json{}' |
jq -r '[.[] | {url: .href, chapter: .children[0].children[0].children[0].children[0].text, title: .children[0].children[0].children[0].children[1].text}] | sort_by(.chapter | match("[0-9]+$")) | .[]|[.chapter, .title, .url] | @tsv' |
while IFS=$'\t' read -r chapter title url; do download_chapter "$url" "$chapter" "$title"; done