mirror of https://github.com/captn3m0/pystitcher
Merge pull request #11 from captn3m0/feature/external_url
Add basic implementation of external URL fetching of PDFs
pull/18/head
commit
af4752bee1
|
@ -41,11 +41,13 @@ python_requires = >=3.6
|
|||
# PyPDF3: Read and write PDF files
|
||||
# Markdown: Render input markdown file to HTML
|
||||
# html5: Parse HTML file to generate bookmarks
|
||||
# validators: Validate URL for fetching external PDF
|
||||
install_requires =
|
||||
importlib-metadata; python_version<"3.8"
|
||||
PyPDF3>=1.0.4
|
||||
Markdown>=3.3.4
|
||||
html5>=0.0.9
|
||||
validators>=0.18.1
|
||||
|
||||
[options.packages.find]
|
||||
where = src
|
||||
|
|
|
@ -1,12 +1,17 @@
|
|||
import os
|
||||
import markdown
|
||||
from .bookmark import Bookmark
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import urllib.request
|
||||
import validators
|
||||
|
||||
import html5lib
|
||||
import markdown
|
||||
|
||||
from PyPDF3 import PdfFileWriter, PdfFileReader
|
||||
from PyPDF3.generic import FloatObject
|
||||
from pystitcher import __version__
|
||||
import tempfile
|
||||
import logging
|
||||
from .bookmark import Bookmark
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -45,6 +50,20 @@ class Stitcher:
|
|||
for e in document.iter():
|
||||
self.iter(e)
|
||||
|
||||
"""
|
||||
Check if file has been cached locally and if
|
||||
not cached, download from provided URL. Return
|
||||
download filename
|
||||
"""
|
||||
def _cacheURL(self, url):
    """
    Return the local filename for a remote PDF, downloading it if needed.

    The cache name is the last path component of ``url``
    (``os.path.basename(url)``), resolved relative to the current
    working directory. If a file with that name already exists it is
    reused as-is; otherwise the URL is fetched and stored under that
    name.

    :param url: URL of the remote PDF.
    :return: the local filename (relative to the working directory).
    :raises urllib.error.URLError: if the URL cannot be fetched.
    :raises OSError: if the local file cannot be written.
    """
    filename = os.path.basename(url)

    if os.path.exists(filename):
        _logger.info("Locally cached PDF found at %s", filename)
        return filename

    _logger.info("Downloading PDF from remote URL %s", url)
    # Stream into a sibling ".part" file and only move it into place once
    # the transfer has finished. Writing straight to the final name would
    # leave a truncated file behind on a failed download, and the next run
    # would wrongly treat it as a valid cached copy.
    partial = filename + ".part"
    try:
        with urllib.request.urlopen(url) as response, open(partial, 'wb') as downloadedFile:
            shutil.copyfileobj(response, downloadedFile)
        os.replace(partial, filename)
    finally:
        # After a successful os.replace the partial file no longer exists,
        # so this only cleans up after a failed or interrupted download.
        if os.path.exists(partial):
            os.remove(partial)
    return filename
|
||||
|
||||
"""
|
||||
Get the number of pages in a PDF file
|
||||
"""
|
||||
|
@ -98,10 +117,13 @@ class Stitcher:
|
|||
self.currentLevel = 3
|
||||
elif(tag =='a'):
|
||||
file = element.attrib.get('href')
|
||||
if(validators.url(file)):
|
||||
file = self._cacheURL(file)
|
||||
fit = element.attrib.get('fit', self.defaultFit)
|
||||
rotate = int(element.attrib.get('rotate', self.defaultRotate))
|
||||
start = int(element.attrib.get('start', self.defaultStart))
|
||||
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
|
||||
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
|
||||
if self.defaultEnd is None else self.defaultEnd))
|
||||
filters = (rotate, start, end)
|
||||
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
|
||||
self.files.append((file, self.currentPage, filters))
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
existing_bookmarks: remove
|
||||
author: Wiki, the Cat
|
||||
subject: A book about adventures of Wiki, the cat.
|
||||
keywords: wiki,potato,jelly
|
||||
# Super Potato Book
|
||||
|
||||
# Volume 1
|
||||
|
||||
[Part 1](1.pdf)
|
||||
|
||||
# Volume 2
|
||||
|
||||
[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
|
||||
|
||||
# Volume 3
|
||||
|
||||
[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)
|
Loading…
Reference in New Issue