From 35976d1f3206721d94855e6a321cd0cb8937e211 Mon Sep 17 00:00:00 2001
From: Nemo <commits@captnemo.in>
Date: Tue, 03 Dec 2024 16:46:20 +0530
Subject: [PATCH] WIP: crystal port

---
 .gitignore |   2 ++
 epub.cr    | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 wat.cr     |  57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 202 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2eaa65f..0baa38a 100644
--- a/.gitignore
+++ a/.gitignore
@@ -16,3 +16,5 @@
 wat/*.opus
 wat/*.json
 !books/wat-audio.txt
+wat.epub
+wat/
diff --git a/epub.cr b/epub.cr
new file mode 100644
index 0000000..a93c1f3 100644
--- /dev/null
+++ a/epub.cr
@@ -1,0 +1,191 @@
+require "compress/zip"
+require "http/client"
+require "uri"
+require "xml"
+
+# Builds an EPUB 3 archive from a directory of XHTML chapter files.
+#
+# Every `*.html` file directly inside `input_dir` becomes one chapter, and
+# everything found in `input_dir/images` is bundled under `images/`. The
+# book metadata is hard-coded for "Wind and Truth".
+class EPUBGenerator
+  @html_files : Array(String)
+
+  # Records the paths, creates `input_dir/images` if it is missing and
+  # collects the chapter files in reading order.
+  def initialize(input_dir : String, output_file : String)
+    @input_dir = input_dir
+    @output_file = output_file
+    @images_dir = File.join(@input_dir, "images")
+    Dir.mkdir_p(@images_dir)
+
+    @metadata = <<-XML
+    <dc:identifier id="epub-id-1">1250319188</dc:identifier>
+    <dc:identifier id="epub-id-2">978-1250319180</dc:identifier>
+    <dc:title id="epub-title-1">Wind and Truth: Book Five of the Stormlight Archive</dc:title>
+    <dc:date>2024-12-06</dc:date>
+    <dc:language>en-US</dc:language>
+    <dc:creator id="epub-creator-1">Brandon Sanderson</dc:creator>
+    <meta property="dcterms:modified">2024-11-03T15:18:10Z</meta>
+    XML
+
+    # wat.cr names chapters "<n>.html", so a plain string sort would put
+    # "10.html" before "2.html". Sort numerically instead; names that are
+    # not numbers go last, in alphabetical order.
+    chapter_paths = Dir.glob(File.join(@input_dir, "*.html"))
+    @html_files = chapter_paths.sort_by do |path|
+      {File.basename(path, ".html").to_i? || Int32::MAX, path}
+    end
+    @title = "Wind and Truth: Book Five of the Stormlight Archive"
+  end
+
+  # Parses `xml` and returns a name => text map of the root element's
+  # child elements. Returns an empty hash when there is no root element.
+  private def parse_metadata(xml : String) : Hash(String, String)
+    metadata = Hash(String, String).new
+    root = XML.parse(xml).root
+    return metadata unless root
+
+    root.children.each do |child|
+      next unless child.element?
+      metadata[child.name] = child.content
+    end
+
+    metadata
+  end
+
+  # Downloads every remote `<img src="http(s)://...">` referenced by
+  # `html_file` into the images directory and returns the HTML with those
+  # URLs rewritten to the local copies. Raises if a download fails.
+  #
+  # NOTE(review): files are named image_<n> per HTML file, so images from
+  # different chapters overwrite each other - confirm before wiring this in.
+  private def download_images(html_file : String) : String
+    html = File.read(html_file)
+    image_urls = html.scan(/<img\s[^>]*src=["'](https?:\/\/[^"']+)["']/).map(&.[1]).uniq
+
+    image_urls.each_with_index do |url, index|
+      file_ext = File.extname(URI.parse(url).path)
+      local_file = File.join(@images_dir, "image_#{index + 1}#{file_ext}")
+
+      # Stream the response body straight to disk.
+      HTTP::Client.get(url) do |response|
+        raise "Failed to download #{url} (HTTP #{response.status_code})" unless response.success?
+        File.open(local_file, "w") { |f| IO.copy(response.body_io, f) }
+      end
+
+      # Strings are immutable in Crystal, so rebind instead of `gsub!`.
+      html = html.gsub(url, "images/#{File.basename(local_file)}")
+    end
+
+    html
+  end
+
+  # Returns the META-INF/container.xml document pointing at content.opf.
+  private def create_container
+    <<-XML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+      <rootfiles>
+        <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
+      </rootfiles>
+    </container>
+    XML
+  end
+
+  # Returns the OPF package document: metadata, a manifest with the
+  # navigation document and every chapter, and the spine in chapter order.
+  private def generate_content_opf
+    manifest = @html_files.map_with_index do |file, index|
+      <<-XML
+      <item id="chapter#{index + 1}" href="chapters/#{File.basename(file)}" media-type="application/xhtml+xml"/>
+      XML
+    end.join
+
+    spine = @html_files.map_with_index do |_, index|
+      <<-XML
+      <itemref idref="chapter#{index + 1}"/>
+      XML
+    end.join
+
+    <<-XML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="epub-id-1" prefix="ibooks: http://vocabulary.itunes.apple.com/rdf/ibooks/vocabulary-extensions-1.0/">
+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
+    #{@metadata}
+    </metadata>
+    <manifest>
+    <item id="nav" href="toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
+    #{manifest}
+    </manifest>
+    <spine>
+    #{spine}
+    </spine>
+    </package>
+    XML
+  end
+
+  # Returns the XHTML navigation document listing every chapter as
+  # "Chapter <n>".
+  private def generate_toc
+    toc_items = @html_files.map_with_index do |file, index|
+      <<-HTML
+      <li><a href="chapters/#{File.basename(file)}">Chapter #{index + 1}</a></li>
+      HTML
+    end.join
+
+    <<-HTML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE html>
+    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+      <head>
+        <title>Table of Contents</title>
+      </head>
+      <body>
+        <nav epub:type="toc" id="toc">
+          <h1>#{@title}</h1>
+          <ol>
+            #{toc_items}
+          </ol>
+        </nav>
+      </body>
+    </html>
+    HTML
+  end
+
+  # Writes the EPUB to the output file: mimetype first, then container.xml,
+  # the package document, the table of contents, chapters and images.
+  def generate
+    File.open(@output_file, "w") do |file|
+      Compress::Zip::Writer.open(file) do |zip|
+        # epubcheck PKG-007: the mimetype entry must contain exactly the
+        # string "application/epub+zip" and must not be compressed, so it is
+        # stored with the size and CRC-32 of that 20-byte string pre-set.
+        entry = Compress::Zip::Writer::Entry.new("mimetype")
+        entry.compression_method = Compress::Zip::CompressionMethod::STORED
+        entry.compressed_size = 20_u32
+        entry.uncompressed_size = 20_u32
+        entry.crc32 = 749429103_u32
+        zip.add(entry) do |io|
+          io.write_string("application/epub+zip".to_slice)
+          io.close
+        end
+
+        zip.add("META-INF/container.xml", create_container)
+        zip.add("content.opf", generate_content_opf)
+        zip.add("toc.xhtml", generate_toc)
+
+        # Block-form File.open closes each source file once it is copied.
+        @html_files.each do |chapter|
+          puts chapter
+          File.open(chapter) { |io| zip.add("chapters/#{File.basename(chapter)}", io) }
+        end
+
+        # Images live in @images_dir, not in ./images of the working directory.
+        Dir.glob(File.join(@images_dir, "*")).each do |image|
+          File.open(image) { |io| zip.add("images/#{File.basename(image)}", io) }
+        end
+      end
+    end
+  end
+end
diff --git a/wat.cr b/wat.cr
index 6b55e9b..4902d74 100644
--- a/wat.cr
+++ a/wat.cr
@@ -1,7 +1,9 @@
 require "http/client"
 require "dir"
 require "lexbor"
-Dir.mkdir_p("wat")
+require "./epub"
+
+Dir.mkdir_p("wat/chapters")
 BASE = "https://reactormag.com/read-wind-and-truth-by-brandon-sanderson-"
 
 LINKS = [
@@ -23,7 +25,7 @@
   "chapters-29-and-30/",
   "chapters-31-and-32/",
   "chapter-33/",
-  "interludes-3-and-4",
+  "interludes-3-and-4/",
 ]
 
 # Automatically adds all recent chapter
@@ -49,30 +51,78 @@
 
 # # Now we have all the files
 html = ""
+
 (1..(LINKS.size)).each do |i|
-  page_html = File.open("wat/#{i}.html").gets_to_end
+  page_html = File.read("wat/#{i}.html")
   page = Lexbor::Parser.new(page_html).css("article-content")
-  start = ending = false
+  # Set once the first heading or chapter-heading image has been seen.
+  start = false
 
   page.first.children.each do |e|
     if e.tag_name == "h3"
-      # Create a new Lexbor H1 node instead
-      e = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").root!
+      # Promote the <h3> heading to an <h1> by re-parsing its contents.
+      e2 = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").nodes(:h1).first
+      e2.inner_html = e.inner_html
+      e = e2
+    end
 
+    if e.tag_name == "h1" || e.tag_name == "h3"
       start = true
     end
+
     # Chapter Arch heading images
     if e.tag_name == "figure" && e["class"].includes?("wp-block-image")
       start = true
     end
 
-    ending = true if e.tag_text.includes?("Excerpted") && start
+    # Stop at the first element mentioning "Excerpted"; keep everything
+    # between the start marker and that point.
+    if start && e.tag_text.includes?("Excerpted")
+      break
+    elsif start
+      html += e.to_html
+    end
+  end
+end
 
-    e.remove! if !start || ending
+# Writes `body`, wrapped in the XHTML chapter template, to
+# wat/chapters/<index>.html.
+def write_chapter(index : Int32, body : String)
+  File.write("wat/chapters/#{index}.html", <<-XHTML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE html>
+    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+    <head>
+      <meta charset="utf-8" />
+      <meta name="generator" content="github/captn3m0/cosmere-ebooks" />
+      <title>Untitled</title>
+    </head>
+    <body epub:type="bodymatter">
+    <section id="section" class="level1 unnumbered">
+    #{body}
+    </section>
+    </body>
+    </html>
+    XHTML
+  )
+end
+
+file_chapter_index = 1
+split_html = ""
+Lexbor::Parser.new(html).nodes(:body).first.children.each do |ee|
+  # A chapter-heading image marks the start of the next chapter, so write
+  # out what has been collected so far.
+  if ee.tag_name == "figure" && ee["class"].includes?("wp-block-image") && split_html != ""
+    write_chapter(file_chapter_index, split_html)
+    split_html = ""
+    file_chapter_index += 1
   end
-  chapter_html = page.first.inner_html
-    .sub(/<h1/, "<h1 id='chapter-#{i - 1}'")
-  html += chapter_html
+  split_html += ee.to_html
 end
 
-File.write("books/wat2.html", html)
+# The loop only writes a chapter when the next one begins, so the final
+# chapter still has to be flushed here.
+write_chapter(file_chapter_index, split_html) unless split_html.empty?
+
+generator = EPUBGenerator.new("wat/chapters", "wat.epub")
+generator.generate
--
rgit 0.1.5