Switches to Web Archive, since book is no longer accessible (#2)
* The author has removed the book from her website. Gone, but not forgotten... by archive.org :)
* Specify UTF-8 with pup to work around it replacing ' with ’
* Oops, pup gets called twice. Let's fix them both.

master
parent
85177f41a0
commit
a4b44684e4
|
@@ -13,7 +13,7 @@ LC=${LC:-""}
|
|||
if [[ "$LC" != "" ]]; then
|
||||
LC="/$LC"
|
||||
fi
|
||||
MAIN_STORY_URL="https://www.theickabog.com$LC/read-the-story/"
|
||||
MAIN_STORY_URL="https://web.archive.org/web/20200713135719/https://www.theickabog.com$LC/read-the-story/"
|
||||
|
||||
echo "[+] Fetching $MAIN_STORY_URL"
|
||||
|
||||
|
@@ -31,14 +31,14 @@ echo "<html lang=$LANG><head><meta charset=UTF-8><title>$MAIN_TITLE</title></hea
|
|||
# args = "$url" "$chapter" "$title"
|
||||
function download_chapter() {
|
||||
[[ $2 =~ 1$ ]] && MAIN_TITLE=$3
|
||||
URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://www.theickabog.com$1" )
|
||||
URL=$( [[ $1 =~ ^http ]] && echo "$1" || echo "https://web.archive.org$1" )
|
||||
[ -s "html/$2.html" ] || wget --quiet "$URL" -O "html/$2.html"
|
||||
echo "<h1>$3</h1>" >> "$HTML_FILE"
|
||||
cat "html/$2.html" | pup 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
|
||||
cat "html/$2.html" | pup -p --charset UTF-8 'article div.row:nth-child(2) div.entry-content' >> "$HTML_FILE"
|
||||
}
|
||||
|
||||
cat "$MAIN_STORY_OUTPUT_FILE" |
|
||||
pup 'ul.chapters__list a json{}' |
|
||||
pup -p --charset UTF-8 'ul.chapters__list a json{}' |
|
||||
jq -r '[.[] | {url: .href, chapter: .children[0].children[0].children[0].children[0].text, title: .children[0].children[0].children[0].children[1].text}] | sort_by(.chapter | match("[0-9]+$")) | .[]|[.chapter, .title, .url] | @tsv' |
|
||||
while IFS=$'\t' read -r chapter title url; do download_chapter "$url" "$chapter" "$title"; done
|
||||
|
||||
|
|
Loading…
Reference in New Issue