github.com/captn3m0/cosmere-books.git

author	Nemo <commits@captnemo.in>	2024-12-03 16:46:20.0 +05:30:00
committer	Nemo <commits@captnemo.in>	2024-12-03 16:46:20.0 +05:30:00
commit	35976d1f3206721d94855e6a321cd0cb8937e211 [patch]
tree	e5d0d549e97de8b793a33739a55d14e2c8506d00
parent	7133a653bc6d6ec1b7be56d0b2c77af7999c06fe
download	35976d1f3206721d94855e6a321cd0cb8937e211.tar.gz
WIP: crystal port

Diff

 .gitignore |   2 ++
 epub.cr    | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 wat.cr     |  57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 202 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2eaa65f..0baa38a 100644
--- a/.gitignore
+++ a/.gitignore
@@ -16,3 +16,5 @@
wat/*.opus
wat/*.json
!books/wat-audio.txt
wat.epub
wat/
diff --git a/epub.cr b/epub.cr
new file mode 100644
index 0000000..a93c1f3 100644
--- /dev/null
+++ a/epub.cr
@@ -1,0 +1,154 @@
require "compress/zip"
require "http/client"
require "uri"
require "xml"

# Packages a directory of chapter HTML files into an EPUB 3 archive.
#
# Every `*.html` file found directly inside the input directory becomes one
# chapter (sorted by file name). The archive layout is:
#
#   mimetype                 (stored, uncompressed, first entry)
#   META-INF/container.xml   (points at content.opf)
#   content.opf              (package metadata, manifest and spine)
#   toc.xhtml                (EPUB 3 navigation document)
#   chapters/<name>.html
#   images/<name>
#
# The book metadata (title, identifiers, author) is currently hard-coded.
class EPUBGenerator
  # Declared explicitly so the compiler does not have to infer instance
  # variable types from method-call chains such as `Dir.glob(...).sort`.
  @images_dir : String
  @metadata : String
  @html_files : Array(String)
  @title : String

  # *input_dir* is the directory holding the chapter `*.html` files;
  # *output_file* is the path of the EPUB to write.
  #
  # Creates `<input_dir>/images` as a side effect so downloaded images have
  # somewhere to live.
  def initialize(@input_dir : String, @output_file : String)
    @images_dir = File.join(@input_dir, "images")
    Dir.mkdir_p(@images_dir)

    @metadata = <<-XML
    <dc:identifier id="epub-id-1">1250319188</dc:identifier>
    <dc:identifier id="epub-id-2">978-1250319180</dc:identifier>
    <dc:title id="epub-title-1">Wind and Truth: Book Five of the Stormlight Archive</dc:title>
    <dc:date>2024-12-06</dc:date>
    <dc:language>en-US</dc:language>
    <dc:creator id="epub-creator-1">Brandon Sanderson</dc:creator>
    <meta property="dcterms:modified">2024-11-03T15:18:10Z</meta>
    XML

    @html_files = Dir.glob(File.join(@input_dir, "*.html")).sort
    @title = "Wind and Truth: Book Five of the Stormlight Archive"
  end

  # Parses *xml* and returns a map of `element name => text content` for the
  # element children of the document root. Returns an empty hash when the
  # document has no root element.
  private def parse_metadata(xml : String) : Hash(String, String)
    metadata = {} of String => String

    # `root` is nilable: an empty/invalid document has no root node.
    root = XML.parse(xml).root
    return metadata unless root

    root.children.each do |child|
      next unless child.element?
      metadata[child.name] = child.content
    end

    metadata
  end

  # Downloads every remote (`http`/`https`) `<img src>` referenced by
  # *html_file* into the images directory and returns the file's HTML with
  # those URLs rewritten to point at the local copies.
  #
  # The file on disk is not modified; the caller decides what to do with the
  # returned markup.
  private def download_images(html_file : String) : String
    html = File.read(html_file)
    chapter = File.basename(html_file, File.extname(html_file))

    # Capture group 1 is the URL. `uniq` avoids downloading an image twice
    # when a chapter references it more than once.
    image_urls = html.scan(/<img\s[^>]*src=["'](https?:\/\/[^"']+)["']/).map(&.[1]).uniq

    image_urls.each_with_index do |url, index|
      file_ext = File.extname(URI.parse(url).path)
      # All chapters share one images directory, so the chapter name is part
      # of the file name to keep chapters from overwriting each other.
      local_name = "#{chapter}_image_#{index + 1}#{file_ext}"
      local_file = File.join(@images_dir, local_name)

      # Stream the response body straight to disk.
      HTTP::Client.get(url) do |response|
        File.open(local_file, "w") { |f| IO.copy(response.body_io, f) }
      end

      # Strings are immutable, so rebind instead of mutating in place.
      # Chapters are stored under `chapters/` in the archive, hence `../`.
      html = html.gsub(url, "../images/#{local_name}")
    end

    html
  end

  # Returns the contents of `META-INF/container.xml`, which tells reading
  # systems that the package document is `content.opf`.
  private def create_container
    <<-XML
    <?xml version="1.0" encoding="UTF-8"?>
    <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
      <rootfiles>
        <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
      </rootfiles>
    </container>
    XML
  end

  # Returns the package document: metadata, a manifest entry for the
  # navigation document and for each chapter, and a spine listing the
  # chapters in order.
  #
  # NOTE(review): files under `images/` are added to the archive by
  # `generate` but are not listed in the manifest here.
  private def generate_content_opf
    manifest = @html_files.map_with_index do |file, index|
      %(<item id="chapter#{index + 1}" href="chapters/#{File.basename(file)}" media-type="application/xhtml+xml"/>)
    end.join("\n")

    spine = @html_files.map_with_index do |_, index|
      %(<itemref idref="chapter#{index + 1}"/>)
    end.join("\n")

    <<-XML
    <?xml version="1.0" encoding="UTF-8"?>
    <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="epub-id-1" prefix="ibooks: http://vocabulary.itunes.apple.com/rdf/ibooks/vocabulary-extensions-1.0/">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    #{@metadata}
    </metadata>
    <manifest>
    <item id="nav" href="toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    #{manifest}
    </manifest>
    <spine>
    #{spine}
    </spine>
    </package>
    XML
  end

  # Returns the navigation document (`toc.xhtml`) with one entry per chapter
  # file, labelled "Chapter N" in file order.
  private def generate_toc
    toc_items = @html_files.map_with_index do |file, index|
      %(<li><a href="chapters/#{File.basename(file)}">Chapter #{index + 1}</a></li>)
    end.join("\n")

    # The `epub` namespace must be declared for the `epub:type` attribute
    # below to be well-formed.
    <<-HTML
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
      <head>
        <title>Table of Contents</title>
      </head>
      <body>
        <nav epub:type="toc" id="toc">
          <h1>#{@title}</h1>
          <ol>
    #{toc_items}
          </ol>
        </nav>
      </body>
    </html>
    HTML
  end

  # Writes the EPUB archive to the output path given at construction time,
  # printing each chapter path to stdout as it is added.
  def generate
    File.open(@output_file, "w") do |file|
      Compress::Zip::Writer.open(file) do |zip|
        # ERROR(PKG-007): ./wat.epub(-1,-1): Mimetype file should only contain
        # the string "application/epub+zip" and should not be compressed.
        # Size and CRC-32 are those of the 20-byte mimetype string, supplied
        # up front because the entry is stored rather than deflated.
        entry = Compress::Zip::Writer::Entry.new("mimetype")
        entry.compression_method = Compress::Zip::CompressionMethod::STORED
        entry.compressed_size = 20_u32
        entry.uncompressed_size = 20_u32
        entry.crc32 = 749429103_u32
        zip.add(entry) do |io|
          io.write_string("application/epub+zip".to_slice)
          io.close
        end

        zip.add("META-INF/container.xml", create_container)
        zip.add("content.opf", generate_content_opf)
        zip.add("toc.xhtml", generate_toc)

        @html_files.each do |chapter|
          puts chapter
          zip.add("chapters/#{File.basename(chapter)}", File.open(chapter))
        end

        # Images live under the input directory, not the current directory.
        Dir.glob(File.join(@images_dir, "*")).sort.each do |image|
          zip.add("images/#{File.basename(image)}", File.open(image))
        end
      end
    end
  end
end
diff --git a/wat.cr b/wat.cr
index 6b55e9b..4902d74 100644
--- a/wat.cr
+++ a/wat.cr
@@ -1,7 +1,9 @@
require "http/client"
require "dir"
require "lexbor"
Dir.mkdir_p("wat")
require "./epub"

Dir.mkdir_p("wat/chapters")
BASE = "https://reactormag.com/read-wind-and-truth-by-brandon-sanderson-"

LINKS = [
@@ -23,7 +25,7 @@
  "chapters-29-and-30/",
  "chapters-31-and-32/",
  "chapter-33/",
  "interludes-3-and-4",
  "interludes-3-and-4/",
]

# Automatically adds all recent chapter
@@ -49,30 +51,63 @@

# # Now we have all the files
html = ""

(1..(LINKS.size)).each do |i|
  page_html = File.open("wat/#{i}.html").gets_to_end
  page = Lexbor::Parser.new(page_html).css("article-content")
  start = ending = false
  start = false

  page.first.children.each do |e|
    # puts e.tag_name
    if e.tag_name == "h3"
      # Create a new Lexbor H1 node instead
      e = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").root!
      e2 = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").nodes(:h1).first
      e2.inner_html = e.inner_html
      e = e2
    end

    if e.tag_name == "h1" || e.tag_name == "h3"
      start = true
    end

    # Chapter Arch heading images
    if e.tag_name == "figure" && e["class"].includes?("wp-block-image")
      start = true
    end

    ending = true if e.tag_text.includes?("Excerpted") && start
    if start && e.tag_text.includes?("Excerpted")
      break
    elsif start
      html += e.to_html
    end
  end
end

    e.remove! if !start || ending
file_chapter_index = 1
split_html = ""
Lexbor::Parser.new(html).nodes(:body).first.children.each do |ee|
  if ee.tag_name == "figure" && ee["class"].includes?("wp-block-image") && split_html != ""
    File.write("wat/chapters/#{file_chapter_index}.html", <<-XHTML

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
    <head>
      <meta charset="utf-8" />
      <meta name="generator" content="github/captn3m0/cosmere-ebooks" />
      <title>Untitled</title>
    </head>
    <body epub:type="bodymatter">
    <section id="section" class="level1 unnumbered">
    #{split_html}
    </section>
    </body>
    </html>
    XHTML
    )
    split_html = ""
    file_chapter_index += 1
  end
  chapter_html = page.first.inner_html
    .sub(/<h1/, "<h1 id='chapter-#{i - 1}'")
  html += chapter_html
  split_html += ee.to_html
end

File.write("books/wat2.html", html)
generator = EPUBGenerator.new("wat/chapters", "wat.epub")
generator.generate