class Bookmark:
    """A single outline (bookmark) entry of a PDF.

    Attributes:
        page: Page number the bookmark points at.
        title: Text shown for the bookmark.
        level: Nesting depth of the bookmark; defaults to 1.

    Bookmarks order themselves by ``page`` only, so a list of them can be
    sorted into page order with ``sorted()`` / ``list.sort()``.
    """

    def __init__(self, page, title, level=1):
        self.page = page
        self.title = title
        self.level = level

    def __lt__(self, other):
        """Return True when this bookmark points at an earlier page."""
        # Let Python produce its usual TypeError for foreign operands instead
        # of failing with an AttributeError on ``other.page``.
        if not isinstance(other, Bookmark):
            return NotImplemented
        return self.page < other.page

    def __repr__(self):
        # Rendered as a plain list: [page, title, level].
        return str([self.page, self.title, self.level])
- - -def fib(n): - """Fibonacci example function - - Args: - n (int): integer - - Returns: - int: n-th Fibonacci number - """ - assert n > 0 - a, b = 1, 1 - for i in range(n - 1): - a, b = b, a + b - return a - - # ---- CLI ---- # The functions defined in this section are wrappers around the main Python # API allowing them to be called directly from the terminal as a CLI @@ -72,28 +34,28 @@ def parse_args(args): Returns: :obj:`argparse.Namespace`: command line parameters namespace """ - parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration") + parser = argparse.ArgumentParser(description="Stitch PDF files together") parser.add_argument( "--version", action="version", version="pystitcher {ver}".format(ver=__version__), ) - parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT") + parser.add_argument(dest="input", help="Input Spine markdown file", type=argparse.FileType('r', encoding='UTF-8'), metavar="spine.md") + parser.add_argument(dest="output", help="Output PDF file", type=str, metavar="output.pdf") parser.add_argument( "-v", "--verbose", dest="loglevel", - help="set loglevel to INFO", + help="log more things", action="store_const", const=logging.INFO, ) + parser.add_argument( - "-vv", - "--very-verbose", - dest="loglevel", - help="set loglevel to DEBUG", - action="store_const", - const=logging.DEBUG, + "--no-cleanup", + dest="no_cleanup", + help="log more things", + action=argparse.BooleanOptionalAction, ) return parser.parse_args(args) @@ -111,26 +73,16 @@ def setup_logging(loglevel): def main(args): - """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion - - Instead of returning the value from :func:`fib`, it prints the result to the - ``stdout`` in a nicely formatted message. - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--verbose", "42"]``). 
class Stitcher:
    """Stitch the PDF files listed in a markdown "spine" into one PDF.

    The spine is rendered to HTML and walked element by element: ``h1``,
    ``h2`` and ``h3`` headings become bookmarks of level 1-3, and every link
    (``a``) names a PDF file to append, bookmarked one level below the most
    recent heading.  The first ``h1`` also becomes the document title.

    NOTE(review): constructing a Stitcher changes the process working
    directory to the spine's directory, so link targets, the temporary files,
    ``PDFtkBox.jar`` and a relative output path are all resolved there.
    """

    def __init__(self, inputBuffer):
        """Parse the spine and collect the input files and bookmarks.

        Args:
            inputBuffer: Open text file object holding the spine; its
                ``name`` attribute is used to locate the spine's directory.
        """
        self.files = []
        self.currentPage = 1
        self.title = None
        self.bookmarks = []
        self.currentLevel = None
        self.oldBookmarks = []
        self.dir = os.path.dirname(os.path.abspath(inputBuffer.name))
        # Link targets in the spine are opened relative to the spine itself.
        os.chdir(self.dir)

        text = inputBuffer.read()
        rendered = markdown.markdown(text, extensions=['attr_list'])
        document = html5lib.parseFragment(rendered, namespaceHTMLElements=False)
        for e in document.iter():
            self.iter(e)

    def _get_pdf_number_of_pages(self, filename):
        """Return the number of pages of the PDF at ``filename``.

        Raises:
            FileNotFoundError: if the file is missing or not readable.
        """
        # Raise explicitly rather than assert: asserts vanish under ``-O``.
        if not (os.path.isfile(filename) and os.access(filename, os.R_OK)):
            raise FileNotFoundError(
                "File {} doesn't exist or isn't readable".format(filename))
        with open(filename, "rb") as handle:
            return PdfFileReader(handle).numPages

    def iter(self, element):
        """Process one parsed spine element.

        Headings record a bookmark at the current page and set the current
        level; links record a bookmark one level below the current heading,
        advance the current page by the linked PDF's page count and queue the
        file for merging.  Any other element is ignored.
        """
        heading_levels = {'h1': 1, 'h2': 2, 'h3': 3}
        tag = element.tag
        b = None
        if tag in heading_levels:
            level = heading_levels[tag]
            # Only the first top-level heading names the document.
            if level == 1 and self.title is None:
                self.title = element.text
            b = Bookmark(self.currentPage, element.text, level)
            self.currentLevel = level
        elif tag == 'a':
            file = element.attrib.get('href')
            # A link that appears before any heading has no enclosing level;
            # treat it as top level instead of failing on ``None + 1``.
            enclosing = 0 if self.currentLevel is None else self.currentLevel
            b = Bookmark(self.currentPage, element.text, enclosing + 1)
            self.currentPage += self._get_pdf_number_of_pages(file)
            self.files.append(file)
        if b is not None:
            self.bookmarks.append(b)

    def _add_bookmark(self, targetFileHandle, title, level, page):
        """Write one bookmark record to the metadata file handle."""
        targetFileHandle.writelines([
            "BookmarkBegin\n",
            "BookmarkTitle: " + title + "\n",
            "BookmarkLevel: " + str(level) + "\n",
            "BookmarkPageNumber: " + str(page) + "\n",
            "BookmarkZoom: FitHeight\n",
        ])

    def _generate_metadata(self, filename, flatten_inner_bookmarks=True):
        """Write the title and all bookmarks to ``filename``.

        Bookmarks found inside the input PDFs (``self.oldBookmarks``) are
        re-levelled below the spine bookmark returned by
        ``_get_level_from_page_number`` and merged with the spine bookmarks
        in page order.  ``self.bookmarks`` itself is left untouched, so
        calling this twice does not duplicate entries.

        Args:
            filename: Path of the metadata text file to (over)write.
            flatten_inner_bookmarks: When true, every imported bookmark sits
                exactly one level below its spine bookmark; otherwise its own
                level is added instead.
        """
        imported = []
        for old in self.oldBookmarks:
            outer_level = self._get_level_from_page_number(old.page)
            increment = 1 if flatten_inner_bookmarks else old.level
            # Imported page numbers are shifted by one here - presumably
            # PyPDF2 reports them zero-based; confirm against its docs.
            imported.append(
                Bookmark(old.page + 1, old.title, outer_level + increment))

        # Stable sort by page: same ordering as Bookmark.__lt__ gives.
        merged = sorted(self.bookmarks + imported, key=lambda b: b.page)

        with open(filename, 'w') as target:
            if self.title:
                target.write("InfoBegin\n")
                target.write("InfoKey: Title\n")
                target.write("InfoValue: " + self.title + "\n")
            for b in merged:
                self._add_bookmark(target, b.title, b.level, b.page)

    def _generate_concat_command(self, temp_filename):
        """Return a ``pdftk`` argv that concatenates the input files.

        Not called by any other method visible in this class.
        """
        return ["pdftk"] + self.files + ['cat', 'output', temp_filename]

    def _generate_temp_pdf(self, temp_filename):
        """Merge the inputs into ``temp_filename`` and read its outline."""
        self._merge(self.files, temp_filename)
        self._parse_old_bookmarks(temp_filename)

    def _get_level_from_page_number(self, page):
        """Return the level of the first spine bookmark at or after ``page``.

        Falls back to the last bookmark's level when ``page`` lies beyond
        every bookmark, and to 0 when there are no bookmarks at all, so the
        caller can always add an increment to the result.

        NOTE(review): picking the first bookmark with ``b.page >= page`` looks
        like it may select the *next* bookmark rather than the enclosing one
        - confirm the intended rule before changing it.
        """
        for b in self.bookmarks:
            if b.page >= page:
                return b.level
        if self.bookmarks:
            return self.bookmarks[-1].level
        return 0

    def _iterate_old_bookmarks(self, pdf, bookmarks, level=1):
        """Recursively collect outline entries into ``self.oldBookmarks``.

        ``bookmarks`` is either a (possibly nested) list of outline entries
        or a single entry; each list nesting increases ``level`` by one.
        """
        if isinstance(bookmarks, list):
            for inner_bookmark in bookmarks:
                self._iterate_old_bookmarks(pdf, inner_bookmark, level + 1)
        else:
            pageNumber = pdf.getDestinationPageNumber(bookmarks)
            self.oldBookmarks.append(
                Bookmark(pageNumber, bookmarks.title, level))

    def _parse_old_bookmarks(self, filename):
        """Load the outline of the PDF at ``filename`` into ``oldBookmarks``."""
        # Start fresh so a repeated generate() does not accumulate duplicates.
        self.oldBookmarks = []
        with open(filename, "rb") as handle:
            p = PdfFileReader(handle)
            self._iterate_old_bookmarks(p, p.getOutlines())

    def _update_metadata(self, old_filename, metadata_file, outputBuffer):
        """Run PDFtkBox to apply ``metadata_file`` and write the output PDF.

        Raises:
            subprocess.CalledProcessError: if the tool exits with a non-zero
                status (previously the failure was silently ignored).
        """
        # NOTE(review): 'PDFtkBox.jar' is looked up in the working directory.
        subprocess.run(
            ['java', '-jar', 'PDFtkBox.jar', old_filename, "update_info",
             metadata_file, 'output', outputBuffer],
            check=True)

    def _merge(self, paths, output):
        """Concatenate the PDFs in ``paths``, in order, into ``output``.

        Raises:
            FileNotFoundError: if one of the input files does not exist.
        """
        from contextlib import ExitStack

        writer = PdfFileWriter()
        # Inputs stay open until the writer has flushed its output (kept from
        # the original flow); ExitStack then closes them all.
        with ExitStack() as stack:
            for inputFile in paths:
                if not os.path.isfile(inputFile):
                    raise FileNotFoundError(
                        "File {} doesn't exist".format(inputFile))
                reader = PdfFileReader(
                    stack.enter_context(open(inputFile, 'rb')))
                for index in range(reader.getNumPages()):
                    writer.addPage(reader.getPage(index))

            with open(output, 'wb') as stream:
                writer.write(stream)

    def generate(self, outputBuffer, delete_temp_files=False):
        """Produce the stitched, bookmarked PDF.

        Args:
            outputBuffer: Path of the PDF to write (passed to PDFtkBox as-is).
            delete_temp_files: When true, remove the intermediate
                ``metadata.txt`` and ``temp.pdf`` afterwards - also when one
                of the steps fails.
        """
        METADATA_FILENAME = 'metadata.txt'
        TEMP_PDF_FILENAME = 'temp.pdf'

        try:
            self._generate_temp_pdf(TEMP_PDF_FILENAME)
            self._generate_metadata(METADATA_FILENAME)
            self._update_metadata(
                TEMP_PDF_FILENAME, METADATA_FILENAME, outputBuffer)
        finally:
            if delete_temp_files:
                for leftover in (METADATA_FILENAME, TEMP_PDF_FILENAME):
                    # A failed step may not have created the file yet.
                    if os.path.exists(leftover):
                        os.remove(leftover)