Fix Directory Structure Changes (#17)

* Improve wget invocation to get images as well
* Fix relative links with new directory structure
  - A few links are still broken and tables don't render correctly yet
  - Fixes #14
2018-12-20 23:23:37 +05:30 · 2018-12-20 23:23:37 +05:30 · 6461924bfa
parent 56dd68f63d
commit 6461924bfa
3 changed files with 68 additions and 34 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 html/
 .direnv
 *.epub
+*.pdf
 *.mobi
 vendor/
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -1,26 +1,38 @@
 #!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'

-# Cleanup
-rm -rf html
-mkdir -p html/index
-mkdir -p html/sre-book
-cd html
+TOC_URL="https://landing.google.com/sre/sre-book/toc/index.html"
+# Make sure that links are relative (--convert-links)
+# The /sre/ directories are no longer removed; the full remote path is kept
+# Save stuff in html/ directory
+# The landing.google.com host directory is kept (later steps use html/landing.google.com/...)
+# Enable recursion, timestamping (--mirror)
+# Images are hosted elsewhere, download them as well.
+# We need to go up a level from /toc/ where we start
+wget \
+    --convert-links \
+    --directory-prefix=html \
+    --page-requisites \
+    --adjust-extension \
+    --span-hosts \
+    --trust-server-names \
+    --backup-converted \
+    --mirror \
+    --no-verbose \
+    --recursive \
+    --domains=lh3.googleusercontent.com,landing.google.com https://landing.google.com/sre/sre-book/toc/index.html

-# Download
-wget --convert-links --mirror https://landing.google.com/sre/book/
-mv landing.google.com/sre/sre-book/* ./sre-book
-mv landing.google.com/sre/book/index.html ./index
-rm -rf landing.google.com
-cd ..
+MODE=${1:-}

-if [ $1 != "docker" ];then
+if [ "$MODE" != "docker" ];then
    bundle install
 fi

 ruby generate.rb

-pushd html/sre-book/chapters
-pandoc -f html -t epub -o ../../../google-sre.epub --epub-metadata=../../../metadata.xml --epub-cover-image=../../../cover.jpg sre.html
+pushd html/landing.google.com/sre/sre-book/toc
+pandoc -f html -t epub -o ../../../../../google-sre.epub --epub-metadata=../../../../../metadata.xml --epub-cover-image=../../../../../cover.jpg complete.html
 popd
 ebook-convert google-sre.epub google-sre.mobi
 ebook-convert google-sre.epub google-sre.pdf
--- a/generate.rb
+++ b/generate.rb
@ -1,17 +1,27 @@
 require 'nokogiri'
-require 'pp'
+require 'pathname'
 require 'fileutils'
+
 # First we get the list of all the book sections:

-chapter_links = Nokogiri::HTML(open("html/index/index.html"))
+Dir.chdir("html/landing.google.com/sre/sre-book/toc")
+chapter_links = Nokogiri::HTML(open("index.html"))
  .css('#drop-down a')
  .map {|l| l.attribute('href').value}

-html = ''
+html = <<EOT
+<!DOCTYPE html>
+<html>
+  <head>
+  <title>Site Reliability Engineering</title>
+  <meta charset="utf-8">
+  </head>
+  <body>
+EOT
 chapter_links.each do |chapter_link|
-  chapter_file = File.basename chapter_link
-  html += "<span class=\"hidden\" name=\"#{chapter_file}\"></span>"
-  doc = Nokogiri::HTML(open("html/index/#{chapter_link}"))
+  chapter_file = File.basename File.dirname chapter_link
+  html += "<span class=\"hidden\" id=\"#{chapter_file}\"></span>"
+  doc = Nokogiri::HTML(open(chapter_link))
  content = doc.css('.content')

  # this title is with additional 'chapter X' in front
@ -25,9 +35,18 @@ chapter_links.each do |chapter_link|
  content.css('a').each do |a|
    link = a.attribute('href')
    if link
-      matches = link.value.scan /^([\w-]+.html)#([\w-]+)$/
-      if matches.length == 1
-        a['href'] = '#' + matches[0][1]
+
+      matches = link.value.scan /^(\S*index.html)+(#[\w-]+)?/
+      # pp [link.value, matches] if link.value and link.value.include? 'lessons-learned'
+      if matches.length == 1 and matches[0].length == 2
+        # Self Links
+        if matches[0][0] =="index.html" and matches[0][1]
+          a['href'] = matches[0][1]
+        # If it points to start of a different chapter
+        else
+          chapter_slug = File.basename File.dirname matches[0][0]
+          a['href'] = "##{chapter_slug}"
+        end
      end
    end
  end
@ -36,11 +55,9 @@ chapter_links.each do |chapter_link|

  headers = (1..6).map {|x| "h#{x}"}

-  # headers.each_with_index
  content.css(headers.join(',')).each do |e|
    # If chapter heading
    if e == chapter_header
-      puts "Chapter Header"
      e.name = 'h1'
    else
      # Reduce everything by 1
@ -50,17 +67,17 @@ chapter_links.each do |chapter_link|
    end
  end

-  content.css('a').each do |a|
-    link = a.attribute('href')
-    if link
-      # Link to a direct chapter
-      matches = link.value.scan /^([\w-]+.html)$/
-      if matches.length == 1
-        a['href'] = '#' + matches[0][0]
-      end
+  content.css('img').each do |img|
+    img_file = img.attribute('src')
+    if img_file
+      chapter_directory = File.dirname chapter_link
+      absolute_image_path = Pathname.new File.absolute_path img_file, chapter_directory
+      cwd = Pathname.new Dir.pwd
+      img['src'] = absolute_image_path.relative_path_from cwd
    end
  end

+
  if content.children.css('section > h1').length > 0
    # remove additional parent section tag
    content = content.children.at_css('section')
@ -69,11 +86,15 @@ chapter_links.each do |chapter_link|
    content = content.children.at_css('div')
  end

+
+
  # replace h1 title
  content.at_css('h1').inner_html = title

  html += content.inner_html
 end

-File.open("html/sre-book/chapters/sre.html", 'w') { |file| file.write(html) }
+html+="</body></html>"
+
+File.open("complete.html", 'w') { |file| file.write(html) }
 puts "[html] Generated HTML file"