Merge pull request #11 from captn3m0/feature/external_url

Add basic implementation of external URL fetching of PDFs
Vonter 2021-06-27 20:51:10 +05:30 committed by GitHub
commit af4752bee1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 5 deletions


@@ -41,11 +41,13 @@ python_requires = >=3.6
 # PyPDF3: Read and write PDF files
 # Markdown: Render input markdown file to HTML
 # html5: Parse HTML file to generate bookmarks
+# validators: Validate URL for fetching external PDF
 install_requires =
     importlib-metadata; python_version<"3.8"
     PyPDF3>=1.0.4
     Markdown>=3.3.4
     html5>=0.0.9
+    validators>=0.18.1
 
 [options.packages.find]
 where = src


@@ -1,12 +1,17 @@
 import os
-import markdown
-from .bookmark import Bookmark
+import logging
+import shutil
+import tempfile
+import urllib.request
+import validators
 import html5lib
+import markdown
 from PyPDF3 import PdfFileWriter, PdfFileReader
 from PyPDF3.generic import FloatObject
 from pystitcher import __version__
-import tempfile
-import logging
+from .bookmark import Bookmark
 
 _logger = logging.getLogger(__name__)
@@ -45,6 +50,20 @@ class Stitcher:
         for e in document.iter():
             self.iter(e)
 
+    """
+    Check if file has been cached locally and if
+    not cached, download from provided URL. Return
+    download filename
+    """
+    def _cacheURL(self, url):
+        if not os.path.exists(os.path.basename(url)):
+            _logger.info("Downloading PDF from remote URL %s", url)
+            with urllib.request.urlopen(url) as response, open(os.path.basename(url), 'wb') as downloadedFile:
+                shutil.copyfileobj(response, downloadedFile)
+        else:
+            _logger.info("Locally cached PDF found at %s", os.path.basename(url))
+        return os.path.basename(url)
+
     """
     Get the number of pages in a PDF file
     """
@@ -98,10 +117,13 @@
             self.currentLevel = 3
         elif(tag =='a'):
             file = element.attrib.get('href')
+            if(validators.url(file)):
+                file = self._cacheURL(file)
             fit = element.attrib.get('fit', self.defaultFit)
             rotate = int(element.attrib.get('rotate', self.defaultRotate))
             start = int(element.attrib.get('start', self.defaultStart))
-            end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
+            end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
+                                         if self.defaultEnd is None else self.defaultEnd))
             filters = (rotate, start, end)
             b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
             self.files.append((file, self.currentPage, filters))
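`validators.url()` is truthy only for syntactically valid absolute URLs, so ordinary local paths in the spec bypass the download entirely. A small sketch of that gating, assuming the `validators` package added in setup.cfg (the example URL is made up):

```python
import validators

for href in ["1.pdf", "https://example.com/files/sample.pdf"]:
    if validators.url(href):
        # a real URL validates, so it would be downloaded and cached
        print(href, "-> remote, fetch and cache")
    else:
        # a bare filename fails validation (falsy result), so it is used as-is
        print(href, "-> local file, used directly")
```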


@@ -0,0 +1,17 @@
+existing_bookmarks: remove
+author: Wiki, the Cat
+subject: A book about adventures of Wiki, the cat.
+keywords: wiki,potato,jelly
+
+# Super Potato Book
+
+# Volume 1
+[Part 1](1.pdf)
+
+# Volume 2
+[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
+
+# Volume 3
+[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)
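After stitching this spec, the local part stays where it is and the two remote parts should appear in the working directory under their basenames. A rough, hypothetical post-run check (not part of the change):

```python
import os

# 1.pdf is the local part; the other two are the basenames under which
# _cacheURL would store the downloaded remote parts.
for name in ["1.pdf", "pdf-sample.pdf", "sample-pdf_9359.pdf"]:
    print(name, "exists:", os.path.exists(name))
```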