2021-05-26 13:54:36 +00:00
|
|
|
import os
|
2021-06-27 12:03:49 +00:00
|
|
|
import logging
|
|
|
|
import shutil
|
|
|
|
import tempfile
|
|
|
|
import urllib.request
|
|
|
|
import validators
|
|
|
|
|
2021-05-26 13:54:36 +00:00
|
|
|
import html5lib
|
2021-06-27 12:03:49 +00:00
|
|
|
import markdown
|
|
|
|
|
2024-08-12 10:58:04 +00:00
|
|
|
from pypdf import PdfWriter, PdfReader
|
|
|
|
from pypdf.generic import Fit
|
2021-05-28 15:59:02 +00:00
|
|
|
from pystitcher import __version__
|
2021-06-27 12:03:49 +00:00
|
|
|
from .bookmark import Bookmark
|
2021-05-26 13:54:36 +00:00
|
|
|
|
2021-05-26 14:43:59 +00:00
|
|
|
# Module-level logger, named after this module via __name__.
_logger = logging.getLogger(__name__)
|
2021-05-26 13:54:36 +00:00
|
|
|
|
|
|
|
""" Main Stitcher class """
|
|
|
|
class Stitcher:
    """
    Stitch several PDF files into one, driven by a markdown "spine" file.

    The spine is converted to HTML; headings (h1-h3) become bookmarks and
    links (a) become the PDF files to merge, in document order. Metadata
    keys in the spine (fit, rotate, start, end, author, title, subject,
    keywords, existing_bookmarks) supply defaults and document metadata.
    """

    # Default destination fit mode for generated bookmarks
    DEFAULT_FIT = '/FitV'
    # Do not rotate by default
    DEFAULT_ROTATE = 0
    # Start at page 1 by default
    DEFAULT_START = 1
    # End at the final page by default
    DEFAULT_END = None
    # Existing bookmarks in the input files are dropped unless configured
    EXISTING_BOOKMARKS_DEFAULT = 'remove'
    # Heading tag -> bookmark level
    _HEADING_LEVELS = {'h1': 1, 'h2': 2, 'h3': 3}
    # Message used when an input PDF cannot be found
    _ERROR_PATH = "File {} doesn't exist or isn't readable"

    def __init__(self, inputBuffer):
        """
        Parse the spine and build self.files and self.bookmarks.

        inputBuffer must provide .name (path of the spine file) and
        .read() (its markdown text).

        Side effect: changes the process working directory to the
        directory of the spine file, so that relative links resolve.
        """
        self.files = []
        self.currentPage = 1
        self.title = None
        self.bookmarks = []
        self.currentLevel = 0
        self.oldBookmarks = []
        self.dir = os.path.dirname(os.path.abspath(inputBuffer.name))

        # TODO: This is a hack
        os.chdir(self.dir)

        text = inputBuffer.read()
        md = markdown.Markdown(extensions=['attr_list', 'meta'])
        html = md.convert(text)
        self.attributes = md.Meta

        self.defaultFit = self._getAttribute('fit', self.DEFAULT_FIT)
        self.defaultRotate = self._getAttribute('rotate', self.DEFAULT_ROTATE)
        self.defaultStart = self._getAttribute('start', self.DEFAULT_START)
        self.defaultEnd = self._getAttribute('end', self.DEFAULT_END)

        document = html5lib.parseFragment(html, namespaceHTMLElements=False)
        for e in document.iter():
            self.iter(e)

    def _cacheURL(self, url):
        """
        Check if file has been cached locally and if
        not cached, download from provided URL. Return
        download filename

        The local filename is the basename of the URL, relative to the
        current working directory.
        """
        localName = os.path.basename(url)
        if os.path.exists(localName):
            _logger.info("Locally cached PDF found at %s", localName)
        else:
            _logger.info("Downloading PDF from remote URL %s", url)
            with urllib.request.urlopen(url) as response, open(localName, 'wb') as downloadedFile:
                shutil.copyfileobj(response, downloadedFile)
        return localName

    def _get_pdf_number_of_pages(self, filename):
        """
        Get the number of pages in a PDF file

        Raises AssertionError if the file is missing or unreadable.
        """
        assert os.path.isfile(filename) and os.access(filename, os.R_OK), \
            "File {} doesn't exist or isn't readable".format(filename)
        # Close the handle as soon as the page count has been read
        with open(filename, "rb") as handle:
            return PdfReader(handle).get_num_pages()

    def _getAttribute(self, key, default=None):
        """
        Return the first value of a spine metadata attribute,
        or `default` (None unless given) when the key is absent
        """
        return self.attributes.get(key, [default])[0]

    def _getMetadata(self):
        """
        Build the metadata dictionary written to the output PDF.

        Producer and Creator are always set. Author, Title, Subject and
        Keywords are taken from the spine metadata when present; the
        title falls back to the first h1 heading.
        """
        generator = "pystitcher/%s" % __version__
        meta = {'/Producer': generator, '/Creator': generator}

        author = self._getAttribute('author')
        if author:
            meta["/Author"] = author

        title = self._getAttribute('title') or self.title
        if title:
            meta["/Title"] = title

        subject = self._getAttribute('subject')
        if subject:
            meta["/Subject"] = subject

        keywords = self._getAttribute('keywords')
        if keywords:
            meta["/Keywords"] = keywords

        return meta

    def iter(self, element):
        """
        Iterate through the elements in the spine HTML
        and generate self.bookmarks + self.files

        Called once per element. Headings add a bookmark at the current
        page and set the current level; links add a file (plus a
        bookmark one level below the current heading) and advance the
        current page by the number of pages taken from that file.
        Any other element is ignored.
        """
        tag = element.tag
        b = None
        if tag in self._HEADING_LEVELS:
            level = self._HEADING_LEVELS[tag]
            # The first h1 doubles as the fallback document title
            if level == 1 and self.title is None:
                self.title = element.text
            fit = element.attrib.get('fit', self.defaultFit)
            b = Bookmark(self.currentPage, element.text, level, fit)
            self.currentLevel = level
        elif tag == 'a':
            pdfPath = element.attrib.get('href')
            if validators.url(pdfPath):
                pdfPath = self._cacheURL(pdfPath)
            fit = element.attrib.get('fit', self.defaultFit)
            rotate = int(element.attrib.get('rotate', self.defaultRotate))
            start = int(element.attrib.get('start', self.defaultStart))
            # Only open the PDF to count pages when neither the link nor
            # the spine metadata specifies an end page
            endValue = element.attrib.get('end', self.defaultEnd)
            if endValue is None:
                endValue = self._get_pdf_number_of_pages(pdfPath)
            end = int(endValue)
            filters = (rotate, start, end)
            b = Bookmark(self.currentPage, element.text, self.currentLevel + 1, fit)
            self.files.append((pdfPath, self.currentPage, filters))
            self.currentPage += (end - start) + 1

        if b:
            self.bookmarks.append(b)

    def _existingBookmarkConfig(self):
        """
        Return the `existing_bookmarks` spine setting ('remove' by default)
        """
        return self._getAttribute('existing_bookmarks', self.EXISTING_BOOKMARKS_DEFAULT)

    def _removeExistingBookmarks(self):
        """ True when bookmarks of the input PDFs should be dropped """
        return self._existingBookmarkConfig() == 'remove'

    def _flattenBookmarks(self):
        """ True when bookmarks of the input PDFs should be flattened """
        return self._existingBookmarkConfig() == 'flatten'

    def _add_existing_bookmarks(self):
        """
        Adds the existing bookmarks into the
        self.bookmarks list

        Each old bookmark is nested under the spine bookmark that
        covers its page. When flattening, every old bookmark sits one
        level below that spine bookmark; otherwise its own level is
        used as the offset.
        """
        self.bookmarks.sort()
        bookmarks = self.bookmarks.copy()

        if not self._removeExistingBookmarks():
            flatten = self._flattenBookmarks()
            for b in self.oldBookmarks:
                outer_level = self._get_level_from_page_number(b.page + 1)
                increment = 2 if flatten else b.level
                level = outer_level + increment - 1
                bookmarks.append(Bookmark(b.page + 1, b.title, level, b.fit))

        bookmarks.sort()
        self.bookmarks = bookmarks

    def _get_level_from_page_number(self, page):
        """
        Gets the last bookmark level at a given page number
        on the combined PDF

        Walks self.bookmarks in order and returns the level of the last
        bookmark whose page is not after `page`. If even the first
        bookmark is after `page`, the first bookmark's level is
        returned. Returns 0 when there are no bookmarks at all.
        """
        if not self.bookmarks:
            return 0
        previousBookmarkLevel = self.bookmarks[0].level
        for b in self.bookmarks:
            if b.page > page:
                return previousBookmarkLevel
            previousBookmarkLevel = b.level
        return previousBookmarkLevel

    def _iterate_old_bookmarks(self, pdf, startPage, bookmarks, level=1):
        """
        Recursive method to read the old bookmarks (which are nested)
        and push them to self.oldBookmarks

        `bookmarks` is either a (possibly nested) list or a single
        outline entry. Each list nesting increases the level by one.
        Entries whose destination page cannot be resolved are skipped.
        """
        if isinstance(bookmarks, list):
            for inner_bookmark in bookmarks:
                self._iterate_old_bookmarks(pdf, startPage, inner_bookmark, level + 1)
            return

        localPageNumber = pdf.get_destination_page_number(bookmarks)
        if localPageNumber is None:
            # The destination does not point at a page of this file
            return
        # NOTE(review): this offset does not account for a `start` filter
        # greater than 1 on the file - confirm whether that is intended.
        globalPageNumber = startPage + localPageNumber - 1
        b = Bookmark(globalPageNumber, bookmarks.title, level, self.defaultFit)
        self.oldBookmarks.append(b)

    def _insert_bookmarks(self, old_filename, outputFilename):
        """
        Insert the bookmarks into the PDF file
        Ref: https://stackoverflow.com/a/18867646

        Reads the merged PDF `old_filename`, adds self.bookmarks as a
        nested outline plus the document metadata, and writes the
        result to `outputFilename`.
        # TODO: Interleave this into the merge method somehow
        """
        # Stack of (bookmark, outline reference) for the current ancestors
        stack = []
        # The reader's file must stay open until the writer has finished
        with open(old_filename, 'rb') as source:
            pdfOutput = PdfWriter()
            pdfOutput.clone_document_from_reader(PdfReader(source))

            for b in self.bookmarks:
                # Trim the stack till the top is useful (stack.level < b.level)
                while stack and stack[-1][0].level >= b.level:
                    stack.pop()
                # If stack has something, use it
                parentRef = stack[-1][1] if stack else None
                # Keyword arguments keep each value on its intended
                # parameter; the coordinates travel as the Fit arguments.
                outlineRef = pdfOutput.add_outline_item(
                    title=b.title,
                    page_number=b.page - 1,
                    parent=parentRef,
                    fit=Fit(b.fit, tuple(b.cords)),
                )
                stack.append((b, outlineRef))

            pdfOutput.add_metadata(self._getMetadata())
            with open(outputFilename, 'wb') as target:
                pdfOutput.write(target)

    def _merge(self, output):
        """
        Merge the PDF files together in order
        and iterate through the old bookmarks
        as we're reading them

        `output` is a writable binary file object; it is always closed
        before returning. Raises AssertionError naming the first input
        file that does not exist.
        """
        handles = []
        try:
            # Check every input up front so nothing is merged on failure
            for inputFile, _startPage, _filters in self.files:
                assert os.path.isfile(inputFile), self._ERROR_PATH.format(inputFile)

            writer = PdfWriter()
            for inputFile, startPage, filters in self.files:
                # Inputs stay open until the writer has written the pages
                handle = open(inputFile, 'rb')
                handles.append(handle)
                reader = PdfReader(handle)
                # Recursively iterate through the old bookmarks
                self._iterate_old_bookmarks(reader, startPage, reader.outline)
                rotate, start, end = filters
                for page in range(start, end + 1):
                    writer.add_page(reader.get_page(page - 1).rotate(rotate))

            writer.write(output)
        finally:
            for handle in handles:
                handle.close()
            output.close()

    def generate(self, outputFilename, cleanup=False):
        """
        Main entrypoint to generate the final PDF

        Merges the inputs into a temporary PDF, then writes the final
        file with bookmarks and metadata to `outputFilename`. The
        temporary file is deleted when `cleanup` is true; otherwise its
        path is printed.
        """
        tempPdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        self._merge(tempPdf)
        # Only read the additional bookmarks if we're not removing them
        if not self._removeExistingBookmarks():
            self._add_existing_bookmarks()
        self._insert_bookmarks(tempPdf.name, outputFilename)

        if cleanup:
            _logger.info("Deleting temporary files")
            os.remove(tempPdf.name)
        else:
            # Why print? Because this is not logging, this is output
            print("Temporary PDF file saved as ", tempPdf.name)
|