Improve generation by adapting code from @lesensei

- Auto generation of chapters, no manual updates
- Internationalization support, thanks to @lesensei
- Made qpdf, kindlegen, calibre optional dependencies
- Changed the cover image size to exactly A4
- Generated cover.pdf, and switched from pdftk to qpdf
  I like pdftk, but the installation is stupidly hard these days
  and qpdf seems to be much more lightweight.
  The disadvantage is losing the bookmarks in the PDF

Also ran into a few issues with pandoc while working on this; filed
them on the pandoc repo: https://github.com/jgm/pandoc/issues/created_by/captn3m0
This commit is contained in:
Nemo 2020-06-09 18:07:35 +05:30
parent 11f69c5370
commit 9b7b085805
7 changed files with 103 additions and 65 deletions

2
.gitignore vendored
View File

@ -2,3 +2,5 @@
*.epub
*.pdf
!cover.pdf
out/
metadata.xml

View File

@ -6,8 +6,9 @@ Generates ebooks for The Ickabog by J.K Rowling. Original text from https://www.
- `wget`
- [`pup`](https://github.com/ericchiang/pup)
- [`pandoc`](https://pandoc.org/)
- [`pdftk`]
- [`pandoc`](https://pandoc.org/) to generate EPUB and PDF files
- `qpdf` to add cover to PDF files. (optional)
- `kindlegen` or `calibre` installed to generate MOBI files. (optional)
## How to run
@ -29,6 +30,8 @@ The cover art is [Avanyu](http://edan.si.edu/saam/id/object/1979.144.85) by Juli
> Julian Martinez, Avanyu, ca. 1923, watercolor, ink, and pencil on paper, Smithsonian American Art Museum, Corbin-Henderson Collection, gift of Alice H. Rossin, 1979.144.85
Code for internationalization and automatic chapter updates via [@lesensei](https://github.com/lesensei/ickabog-ebook/commits/master)'s fork.
## License
The little code in this repository is licensed under the [MIT License](https://nemo.mit-license.org/). See LICENSE file for details.

BIN
cover.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 121 KiB

After

Width:  |  Height:  |  Size: 253 KiB

BIN
cover.ora

Binary file not shown.

BIN
cover.pdf

Binary file not shown.

View File

@ -1,77 +1,114 @@
#!/bin/bash
# Setup: create the working directories, fetch the story index page, read the
# page language and the book title from it, and start the combined HTML file.
#
# Environment:
#   LC - optional; when non-empty it is inserted as a path segment
#        ("/$LC") between the host name and "/read-the-story/".
set -euo pipefail
IFS=$'\n\t'
OUTPUT_DIR=out
# html/ holds the per-chapter downloads; $OUTPUT_DIR holds the index page and
# the generated files.
mkdir -p html
mkdir -p "$OUTPUT_DIR"
MAIN_STORY_OUTPUT_FILE="$OUTPUT_DIR/read-the-story.html"
HTML_FILE=ickabog.html
# NOTE(review): this first header is truncated and rewritten by the
# `echo ... > "$HTML_FILE"` at the end of this section, so it has no lasting
# effect.
echo "<html><head><title>The Ickabog</title></head><body>" > "$HTML_FILE"
LC=${LC:-""}
if [[ "$LC" != "" ]]; then
LC="/$LC"
fi
MAIN_STORY_URL="https://www.theickabog.com$LC/read-the-story/"
echo "[+] Fetching $MAIN_STORY_URL"
wget --quiet "$MAIN_STORY_URL" --output-document "$MAIN_STORY_OUTPUT_FILE"
# Language code is taken from the lang attribute of the index page's <html>.
# NOTE(review): LANG is also the standard locale environment variable; if it
# is exported in the calling environment, this assignment presumably changes
# the locale seen by every child process -- confirm this is intended.
LANG=$(cat "$MAIN_STORY_OUTPUT_FILE"| pup 'html attr{lang}')
echo "[+] Language set to $LANG"
# Turn the chapter-list links into TSV rows (chapter, title, url), keep the
# row(s) containing " 2" immediately before a tab, and print the title field.
# NOTE(review): presumably this selects chapter 2 because its title matches
# the book title -- confirm. Under pipefail, a grep with no match makes this
# assignment fail and the script exit.
MAIN_TITLE=$(cat "$MAIN_STORY_OUTPUT_FILE" | pup 'ul.chapters__list a json{}' | jq -r '[.[] | {url: .href, chapter: .children[0].children[0].children[0].children[0].text, title: .children[0].children[0].children[0].children[1].text}] | sort_by(.chapter) | .[]|[.chapter, .title, .url] | @tsv' | grep $' 2\t' | while IFS=$'\t' read -r chapter title url; do echo "$title"; done)
echo "[+] Title set to $MAIN_TITLE"
# (Re)create the combined HTML document with the detected language and title.
echo "<html lang=$LANG><head><meta charset=UTF-8><title>$MAIN_TITLE</title></head><body>" > "$HTML_FILE"
#######################################
# Fetch one chapter page (unless it is already cached) and append it to the
# combined HTML document.
# Globals:
#   HTML_FILE  (read)    - combined HTML document that is appended to
#   MAIN_TITLE (written) - set to $3 when the chapter label ends in "1"
# Arguments:
#   $1 - chapter URL: either absolute (starts with "http") or a path that is
#        appended to https://www.theickabog.com
#   $2 - chapter label; also used as the cache file name html/$2.html
#   $3 - chapter title, emitted as the <h1> heading
# Outputs:
#   Appends "<h1>$3</h1>" and the chapter's entry-content markup to $HTML_FILE.
#######################################
function download_chapter() {
  local url

  # NOTE(review): kept from the original. Every label ending in "1" matches
  # (1, 11, 21, ...), and when this function runs inside a pipeline the
  # assignment is lost with the subshell -- confirm it is still wanted.
  if [[ $2 =~ 1$ ]]; then
    MAIN_TITLE=$3
  fi

  # Accept both absolute URLs and site-relative paths.
  if [[ $1 =~ ^http ]]; then
    url=$1
  else
    url="https://www.theickabog.com$1"
  fi

  # Download only when there is no non-empty cached copy. The previous extra
  # download built the URL as "https://www.theickabog.com/$1", which is wrong
  # for both input forms and, by filling the cache, kept this one from running.
  [ -s "html/$2.html" ] || wget --quiet "$url" -O "html/$2.html"

  echo "<h1>$3</h1>" >> "$HTML_FILE"
  pup 'article div.row:nth-child(2) div.entry-content' < "html/$2.html" >> "$HTML_FILE"
}
# Build the book body from the chapter list on the index page.
#
# pup extracts every chapter link as JSON; jq reduces each link to
# (chapter label, title, url), orders the rows by the number at the end of the
# label, and emits one tab-separated row per chapter. Each row is handed to
# download_chapter, which downloads the page if needed and appends it to
# $HTML_FILE.
#
# The former hard-coded list of 21 chapters and the follow-up `seq` loop are
# gone: those calls passed only two arguments (fatal under `set -u`, since
# download_chapter reads $3), used slugs without a leading slash, and the loop
# appended every chapter a second time.
pup 'ul.chapters__list a json{}' < "$MAIN_STORY_OUTPUT_FILE" |
  jq -r '[.[] | {url: .href, chapter: .children[0].children[0].children[0].children[0].text, title: .children[0].children[0].children[0].children[1].text}] | sort_by(.chapter | match("[0-9]+$")) | .[]|[.chapter, .title, .url] | @tsv' |
  while IFS=$'\t' read -r chapter title url; do
    download_chapter "$url" "$chapter" "$title"
  done

# Close the document opened by the header written during setup.
echo "</body></html>" >> "$HTML_FILE"
# --- EPUB -------------------------------------------------------------------
# Extra Dublin Core metadata handed to pandoc via --epub-metadata.
# The delimiter is quoted so the body is written literally.
# (A dangling `pandoc ... \` fragment used to precede this here-document; its
# trailing backslash turned the `cat` line into pandoc arguments, so
# metadata.xml was never written as intended.)
cat <<'__METADATA__' > metadata.xml
<dc:creator opf:role="aut">J.K Rowling</dc:creator>
__METADATA__

pandoc --from=html \
  --output="$OUTPUT_DIR/ickabog.epub" \
  --epub-metadata=metadata.xml \
  --epub-cover-image=cover.jpg \
  --epub-chapter-level=1 \
  "$HTML_FILE"
echo "[+] Generated $OUTPUT_DIR/ickabog.epub"

# --- MOBI (optional) ----------------------------------------------------------
# Prefer kindlegen, fall back to calibre's ebook-convert, otherwise just
# report that no converter is installed.
if command -v kindlegen > /dev/null; then
  kindlegen "$OUTPUT_DIR/ickabog.epub" > /dev/null 2>&1
  echo "[+] Generated MOBI using kindlegen: $OUTPUT_DIR/ickabog.mobi"
elif command -v ebook-convert > /dev/null; then
  # NOTE(review): calibre documents --title for setting the title; confirm
  # that ebook-convert accepts "--metadata title=..." as used here.
  ebook-convert "$OUTPUT_DIR/ickabog.epub" \
    "$OUTPUT_DIR/ickabog.mobi" \
    --metadata title="$MAIN_TITLE" \
    > /dev/null 2>&1
  echo "[+] Generated MOBI using ebook-convert: $OUTPUT_DIR/ickabog.mobi"
else
  echo "[-] Could not generate MOBI, install kindlegen or calibre"
fi
# --- PDF via xelatex (optional) ---------------------------------------------
# Render the book with pandoc + xelatex, then prepend cover.pdf with qpdf when
# it is installed; without qpdf the cover-less PDF is published as-is.
#
# Everything is inside the xelatex check: previously the cover step ran even
# when xelatex was missing and aborted on the non-existent input file.
# The leftover pdftk call (no longer a dependency), the duplicate
# `-V lang=en-US`, and a second EPUB build into the working directory were
# removed.
if command -v xelatex > /dev/null; then
  pandoc --from=html \
    --pdf-engine=xelatex \
    --dpi=300 \
    -V book \
    --metadata title="$MAIN_TITLE" \
    --metadata author="J.K Rowling" \
    --output="$OUTPUT_DIR/ickabog-no-cover.pdf" \
    -V lang="$LANG" \
    -V geometry=margin=1.5cm \
    "$HTML_FILE"

  if command -v qpdf > /dev/null; then
    qpdf --empty --pages cover.pdf "$OUTPUT_DIR/ickabog-no-cover.pdf" -- "$OUTPUT_DIR/ickabog.pdf"
  else
    mv "$OUTPUT_DIR/ickabog-no-cover.pdf" "$OUTPUT_DIR/ickabog.pdf"
  fi
  echo "[+] Generated PDF using xelatex: $OUTPUT_DIR/ickabog.pdf"
else
  echo "[-] Could not generate PDF, install xelatex"
fi
# --- Large-type PDF via ConTeXt (optional) ----------------------------------
# Render an 18pt, zero-margin PDF with pandoc's context engine, then prepend
# cover.pdf with qpdf when it is installed.
#
# Fixes relative to the previous text of this section:
#   * the unconditional duplicate pandoc/context run and the leftover pdftk
#     call were removed, so the section is skipped when context is absent;
#   * the no-qpdf fallback now moves ickabog-large-no-cover.pdf (it used to
#     move the xelatex output, ickabog-no-cover.pdf);
#   * the success message is printed only when the PDF was actually built.
if command -v context > /dev/null; then
  pandoc --from=html --to=pdf \
    -V fontsize=18pt \
    --output="$OUTPUT_DIR/ickabog-large-no-cover.pdf" \
    --metadata title="$MAIN_TITLE" \
    --metadata author="J.K Rowling" \
    --pdf-engine=context \
    -V margin-left=0cm \
    -V margin-right=0cm \
    -V margin-top=0cm \
    -V margin-bottom=0cm \
    -V geometry=margin=0cm \
    -V lang="$LANG" \
    "$HTML_FILE"

  if command -v qpdf > /dev/null; then
    qpdf --empty --pages cover.pdf "$OUTPUT_DIR/ickabog-large-no-cover.pdf" -- "$OUTPUT_DIR/ickabog-large.pdf"
  else
    mv "$OUTPUT_DIR/ickabog-large-no-cover.pdf" "$OUTPUT_DIR/ickabog-large.pdf"
  fi
  echo "[+] Generated PDF using context: $OUTPUT_DIR/ickabog-large.pdf"
else
  echo "[-] Could not generate large PDF, install context"
fi

View File

@ -1,4 +0,0 @@
<dc:title id="epub-title-1">The Ickabog</dc:title>
<dc:date>2020-05-20</dc:date>
<dc:language>en-US</dc:language>
<dc:creator id="epub-creator-1" opf:role="aut">The Ickabog</dc:creator>