Switches to Web Archive, since book is no longer accessible (#2)

* The author has removed the book from her website. Gone, but not forgotten... by archive.org :)
* Specify UTF-8 with pup to work around it replacing ' with â€™
* Oops, pup gets called twice. Let's fix them both.
2024-09-07 22:46:32 +00:00 · 2020-07-29 01:00:24 +09:00 · 2020-07-29 01:00:24 +09:00 · a4b44684e4
commit a4b44684e4
parent 85177f41a0
1 changed files with 4 additions and 4 deletions
--- a/generate.sh
+++ b/generate.sh
@@ -13,7 +13,7 @@ LC=${LC:-""}
 if [[ "$LC" != "" ]]; then
    LC="/$LC"
 fi
-MAIN_STORY_URL="https://www.theickabog.com$LC/read-the-story/"
+MAIN_STORY_URL="https://web.archive.org/web/20200713135719/https://www.theickabog.com$LC/read-the-story/"

 echo "[+] Fetching $MAIN_STORY_URL"

@@ -31,14 +31,14 @@ echo "<html lang=$LANG><head><meta charset=UTF-8><title>$MAIN_TITLE</title></hea
 # args = "$url" "$chapter" "$title"
 function download_chapter() {
    [[ $2 =~ 1$ ]] && MAIN_TITLE=$3
-    URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://www.theickabog.com$1" )
+    URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://web.archive.org$1" )
    [ -s "html/$2.html" ] || wget --quiet "$URL" -O "html/$2.html"
    echo "<h1>$3</h1>" >> "$HTML_FILE"
-    cat "html/$2.html" | pup 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
+    cat "html/$2.html" | pup -p --charset UTF-8 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
 }

 cat "$MAIN_STORY_OUTPUT_FILE" |
-pup 'ul.chapters__list a json{}' |
+pup -p --charset UTF-8 'ul.chapters__list a json{}' |
 jq -r '[.[] | {url: .href, chapter: .children[0].children[0].children[0].children[0].text, title: .children[0].children[0].children[0].children[1].text}] | sort_by(.chapter | match("[0-9]+$")) | .[]|[.chapter, .title, .url] | @tsv' |
 while IFS=$'\t' read -r chapter title url; do download_chapter "$url" "$chapter" "$title"; done