From 35976d1f3206721d94855e6a321cd0cb8937e211 Mon Sep 17 00:00:00 2001
From: Nemo <commits@captnemo.in>
Date: Tue, 03 Dec 2024 16:46:20 +0530
Subject: [PATCH] WIP: crystal port

---
 .gitignore |   2 ++
 epub.cr    | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 wat.cr     |  67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 255 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2eaa65f..0baa38a 100644
--- a/.gitignore
+++ a/.gitignore
@@ -16,3 +16,5 @@
 wat/*.opus
 wat/*.json
 !books/wat-audio.txt
+wat.epub
+wat/
diff --git a/epub.cr b/epub.cr
new file mode 100644
index 0000000..a93c1f3 100644
--- /dev/null
+++ a/epub.cr
@@ -1,0 +1,197 @@
+require "compress/zip"
+require "http/client"
+require "xml"
+
+# Builds an EPUB 3 archive from a directory of XHTML chapter files.
+#
+# Chapters are read from `input_dir/*.html` and stored under `chapters/` in
+# the archive; files in `input_dir/images` are stored under `images/`.
+class EPUBGenerator
+  # Declared explicitly: it is initialized from a chained method call, which
+  # instance-variable type inference does not cover.
+  @html_files : Array(String)
+
+  # Records the input and output paths, creates `input_dir/images` if it is
+  # missing and collects the chapter files to package.
+  def initialize(input_dir : String, output_file : String)
+    @input_dir = input_dir
+    @output_file = output_file
+    @images_dir = File.join(@input_dir, "images")
+    Dir.mkdir_p(@images_dir)
+    # Running counter so images from different chapters get distinct names.
+    @image_count = 0
+
+    @metadata = <<-XML
+    <dc:identifier id="epub-id-1">1250319188</dc:identifier>
+    <dc:identifier id="epub-id-2">978-1250319180</dc:identifier>
+    <dc:title id="epub-title-1">Wind and Truth: Book Five of the Stormlight Archive</dc:title>
+    <dc:date>2024-12-06</dc:date>
+    <dc:language>en-US</dc:language>
+    <dc:creator id="epub-creator-1">Brandon Sanderson</dc:creator>
+    <meta property="dcterms:modified">2024-11-03T15:18:10Z</meta>
+    XML
+
+    # Chapter files are named 1.html, 2.html, ...; sort numerically so that
+    # 10.html does not come before 2.html. Non-numeric names sort last.
+    @html_files = Dir.glob(File.join(@input_dir, "*.html")).sort_by do |path|
+      {File.basename(path, ".html").to_i? || Int32::MAX, path}
+    end
+    @title = "Wind and Truth: Book Five of the Stormlight Archive"
+  end
+
+  # Returns a hash mapping the name of each element child of *xml*'s root
+  # to that child's text content; empty when the document has no root.
+  private def parse_metadata(xml : String) : Hash(String, String)
+    metadata = Hash(String, String).new
+    root = XML.parse(xml).root
+    return metadata unless root
+
+    root.children.each do |child|
+      next unless child.element?
+      metadata[child.name] = child.content
+    end
+
+    metadata
+  end
+
+  # Downloads every remote image referenced by an `<img>` tag in *html_file*
+  # into the images directory and returns the HTML with those URLs rewritten
+  # to the local copies. Raises if a download does not succeed.
+  private def download_images(html_file : String) : String
+    html = File.read(html_file)
+    matches = html.scan(/<img\s[^>]*src=["'](https?:\/\/[^"']+)["']/)
+    # Capture group 1 is the URL; each distinct URL is fetched only once.
+    image_urls = matches.map { |match| match[1] }.uniq
+
+    image_urls.each do |url|
+      @image_count += 1
+      file_ext = File.extname(URI.parse(url).path)
+      local_name = "image_#{@image_count}#{file_ext}"
+
+      HTTP::Client.get(url) do |response|
+        unless response.success?
+          raise "Failed to download #{url}: HTTP #{response.status_code}"
+        end
+        # Stream the body to disk; IO#write only accepts bytes, not an IO.
+        File.open(File.join(@images_dir, local_name), "w") do |f|
+          IO.copy(response.body_io, f)
+        end
+      end
+
+      # Strings are immutable, so rebind instead of mutating in place.
+      # Chapters live in chapters/, hence the ../ to reach images/.
+      html = html.gsub(url, "../images/#{local_name}")
+    end
+
+    html
+  end
+
+  # Returns the contents of `META-INF/container.xml`, which points readers
+  # at `content.opf`.
+  private def create_container
+    <<-XML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+      <rootfiles>
+        <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
+      </rootfiles>
+    </container>
+    XML
+  end
+
+  # Returns the package document: the book metadata, a manifest listing the
+  # navigation document and every chapter, and a spine in chapter order.
+  private def generate_content_opf
+    manifest = @html_files.map_with_index do |file, index|
+      <<-XML
+      <item id="chapter#{index + 1}" href="chapters/#{File.basename(file)}" media-type="application/xhtml+xml"/>
+      XML
+    end.join("\n")
+
+    spine = @html_files.map_with_index do |_, index|
+      <<-XML
+      <itemref idref="chapter#{index + 1}"/>
+      XML
+    end.join("\n")
+
+    <<-XML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="epub-id-1" prefix="ibooks: http://vocabulary.itunes.apple.com/rdf/ibooks/vocabulary-extensions-1.0/">
+      <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
+        #{@metadata}
+      </metadata>
+      <manifest>
+        <item id="toc" href="toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
+        #{manifest}
+      </manifest>
+      <spine>
+        #{spine}
+      </spine>
+    </package>
+    XML
+  end
+
+  # Returns the XHTML navigation document with one entry per chapter.
+  private def generate_toc
+    toc_items = @html_files.map_with_index do |file, index|
+      <<-HTML
+      <li><a href="chapters/#{File.basename(file)}">Chapter #{index + 1}</a></li>
+      HTML
+    end.join("\n")
+
+    <<-HTML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE html>
+    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+    <head>
+      <title>Table of Contents</title>
+    </head>
+    <body>
+      <nav epub:type="toc" id="toc">
+        <h1>#{@title}</h1>
+        <ol>
+          #{toc_items}
+        </ol>
+      </nav>
+    </body>
+    </html>
+    HTML
+  end
+
+  # Writes the EPUB archive to the output file: the `mimetype` entry first,
+  # then the container, package and navigation documents, the chapters and
+  # any files present in the images directory.
+  def generate
+    File.open(@output_file, "w") do |file|
+      Compress::Zip::Writer.open(file) do |zip|
+        # ERROR(PKG-007): ./wat.epub(-1,-1): Mimetype file should only contain
+        # the string "application/epub+zip" and should not be compressed.
+        entry = Compress::Zip::Writer::Entry.new("mimetype")
+        entry.compression_method = Compress::Zip::CompressionMethod::STORED
+        # Length and CRC-32 of the 20-byte mimetype string written below.
+        entry.compressed_size = 20_u32
+        entry.uncompressed_size = 20_u32
+        entry.crc32 = 749429103_u32
+        zip.add(entry) do |io|
+          io.write_string("application/epub+zip".to_slice)
+          io.close
+        end
+
+        zip.add("META-INF/container.xml", create_container)
+        zip.add("content.opf", generate_content_opf)
+        zip.add("toc.xhtml", generate_toc)
+
+        @html_files.each do |chapter|
+          puts chapter
+          zip.add("chapters/#{File.basename(chapter)}", File.open(chapter))
+        end
+
+        # Look in the images directory itself rather than a path relative to
+        # the current working directory.
+        Dir.glob(File.join(@images_dir, "*")).each do |image|
+          zip.add("images/#{File.basename(image)}", File.open(image))
+        end
+      end
+    end
+  end
+end
diff --git a/wat.cr b/wat.cr
index 6b55e9b..4902d74 100644
--- a/wat.cr
+++ a/wat.cr
@@ -1,7 +1,9 @@
 require "http/client"
 require "dir"
 require "lexbor"
-Dir.mkdir_p("wat")
+require "./epub"
+
+Dir.mkdir_p("wat/chapters")
 
 BASE = "https://reactormag.com/read-wind-and-truth-by-brandon-sanderson-"
 LINKS = [
@@ -23,7 +25,7 @@
   "chapters-29-and-30/",
   "chapters-31-and-32/",
   "chapter-33/",
-  "interludes-3-and-4",
+  "interludes-3-and-4/",
 ]
 
 # Automatically adds all recent chapter
@@ -49,27 +51,70 @@
 #
 # Now we have all the files
 html = ""
+
 (1..(LINKS.size)).each do |i|
   page_html = File.open("wat/#{i}.html").gets_to_end
   page = Lexbor::Parser.new(page_html).css("article-content")
-  start = ending = false
+  start = false
   page.first.children.each do |e|
     if e.tag_name == "h3"
-      # Create a new Lexbor H1 node instead
-      e = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").root!
+      # Swap the h3 for a freshly parsed h1 node with the same contents
+      e2 = Lexbor::Parser.new("<h1>#{e.inner_html}</h1>").nodes(:h1).first
+      e2.inner_html = e.inner_html
+      e = e2
+    end
+    if e.tag_name == "h1" || e.tag_name == "h3"
       start = true
     end
+    # Chapter Arch heading images
     if e.tag_name == "figure" && e["class"].includes?("wp-block-image")
       start = true
     end
 
-    ending = true if e.tag_text.includes?("Excerpted") && start
+    # Collect everything from the first heading/figure up to (not including)
+    # the first element whose text contains "Excerpted".
+    if start && e.tag_text.includes?("Excerpted")
+      break
+    elsif start
+      html += e.to_html
+    end
+  end
+end
 
-    e.remove! if !start || ending
+# Writes *body* to wat/chapters/<index>.html wrapped in an XHTML page.
+def write_chapter(index : Int32, body : String)
+  File.write("wat/chapters/#{index}.html", <<-XHTML
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE html>
+    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
+    <head>
+      <meta charset="utf-8" />
+      <meta name="generator" content="github/captn3m0/cosmere-ebooks" />
+      <title>Untitled</title>
+    </head>
+    <body epub:type="bodymatter">
+    <section id="section" class="level1 unnumbered">
+    #{body}
+    </section>
+    </body>
+    </html>
+    XHTML
+  )
+end
+
+file_chapter_index = 1
+split_html = ""
+Lexbor::Parser.new(html).nodes(:body).first.children.each do |ee|
+  # A chapter-arch figure starts a new chapter: flush the one collected so far.
+  if ee.tag_name == "figure" && ee["class"].includes?("wp-block-image") && !split_html.empty?
+    write_chapter(file_chapter_index, split_html)
+    split_html = ""
+    file_chapter_index += 1
   end
-  chapter_html = page.first.inner_html
-    .sub(/<h1/, "<h1 id='chapter-#{i - 1}'")
-  html += chapter_html
+  split_html += ee.to_html
 end
+# The last chapter has no following figure to trigger a write: flush it here.
+write_chapter(file_chapter_index, split_html) unless split_html.empty?
 
-File.write("books/wat2.html", html)
+generator = EPUBGenerator.new("wat/chapters", "wat.epub")
+generator.generate
-- 
rgit 0.1.5