Merge pull request #11 from captn3m0/feature/external_url

Add basic implementation of external URL fetching of PDFs
This commit is contained in:
Vonter 2021-06-27 20:51:10 +05:30 committed by GitHub
commit af4752bee1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 5 deletions

View File

@ -41,11 +41,13 @@ python_requires = >=3.6
# PyPDF3: Read and write PDF files
# Markdown: Render input markdown file to HTML
# html5: Parse HTML file to generate bookmarks
# validators: Validate URL for fetching external PDF
install_requires =
importlib-metadata; python_version<"3.8"
PyPDF3>=1.0.4
Markdown>=3.3.4
html5>=0.0.9
validators>=0.18.1
[options.packages.find]
where = src

View File

@ -1,12 +1,17 @@
import os
import markdown
from .bookmark import Bookmark
import logging
import shutil
import tempfile
import urllib.request
import validators
import html5lib
import markdown
from PyPDF3 import PdfFileWriter, PdfFileReader
from PyPDF3.generic import FloatObject
from pystitcher import __version__
import tempfile
import logging
from .bookmark import Bookmark
_logger = logging.getLogger(__name__)
@ -45,6 +50,20 @@ class Stitcher:
for e in document.iter():
self.iter(e)
"""
Check if file has been cached locally and if
not cached, download from provided URL. Return
download filename
"""
def _cacheURL(self, url):
    """Return the local filename for ``url``, downloading it if needed.

    The cache name is ``os.path.basename(url)``, a relative path that is
    therefore resolved against the current working directory. If a file
    with that name already exists it is reused and nothing is fetched.

    Otherwise the response body is streamed into a ``.part`` sibling file
    and only moved onto the cache name once the transfer has finished, so
    an interrupted or failed download never leaves a truncated file behind
    that a later run would mistake for a valid cached PDF.

    :param url: remote URL of the PDF to fetch
    :return: the cache filename (basename of ``url``)
    :raises: any exception from ``urllib.request.urlopen`` or from the
        file operations is propagated to the caller unchanged
    """
    cachedName = os.path.basename(url)

    if os.path.exists(cachedName):
        _logger.info("Locally cached PDF found at %s", cachedName)
        return cachedName

    _logger.info("Downloading PDF from remote URL %s", url)
    partialName = cachedName + '.part'
    try:
        with urllib.request.urlopen(url) as response, open(partialName, 'wb') as partialFile:
            shutil.copyfileobj(response, partialFile)
        # The file is closed at this point; os.replace swaps it into place
        # in a single step so the cache name only ever holds a full download.
        os.replace(partialName, cachedName)
    finally:
        # After a successful replace the partial name no longer exists;
        # on any failure this discards the incomplete download.
        if os.path.exists(partialName):
            os.remove(partialName)
    return cachedName
"""
Get the number of pages in a PDF file
"""
@ -98,10 +117,13 @@ class Stitcher:
self.currentLevel = 3
elif(tag =='a'):
file = element.attrib.get('href')
if(validators.url(file)):
file = self._cacheURL(file)
fit = element.attrib.get('fit', self.defaultFit)
rotate = int(element.attrib.get('rotate', self.defaultRotate))
start = int(element.attrib.get('start', self.defaultStart))
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
if self.defaultEnd is None else self.defaultEnd))
filters = (rotate, start, end)
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
self.files.append((file, self.currentPage, filters))

View File

@ -0,0 +1,17 @@
existing_bookmarks: remove
author: Wiki, the Cat
subject: A book about adventures of Wiki, the cat.
keywords: wiki,potato,jelly
# Super Potato Book
# Volume 1
[Part 1](1.pdf)
# Volume 2
[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
# Volume 3
[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)