Add external URL fetching of PDFs
Also reorder imports according to PEP 8
This commit is contained in:
parent
ebc9c1e0cf
commit
31faa1a36c
|
@ -1,12 +1,17 @@
|
||||||
import os
|
import os
|
||||||
import markdown
|
import logging
|
||||||
from .bookmark import Bookmark
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import urllib.request
|
||||||
|
import validators
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
|
import markdown
|
||||||
|
|
||||||
from PyPDF3 import PdfFileWriter, PdfFileReader
|
from PyPDF3 import PdfFileWriter, PdfFileReader
|
||||||
from PyPDF3.generic import FloatObject
|
from PyPDF3.generic import FloatObject
|
||||||
from pystitcher import __version__
|
from pystitcher import __version__
|
||||||
import tempfile
|
from .bookmark import Bookmark
|
||||||
import logging
|
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -45,6 +50,20 @@ class Stitcher:
|
||||||
for e in document.iter():
|
for e in document.iter():
|
||||||
self.iter(e)
|
self.iter(e)
|
||||||
|
|
||||||
|
"""
|
||||||
|
Check if file has been cached locally and if
|
||||||
|
not cached, download from provided URL. Return
|
||||||
|
download filename
|
||||||
|
"""
|
||||||
|
def _cacheURL(self, url):
    """
    Return the local filename of the PDF behind `url`, downloading it
    into the current working directory unless a file with that name
    already exists there.

    The filename is the last component of the URL *path*; any query
    string or fragment is ignored so it cannot leak into the filename.

    Raises ValueError when no filename can be derived from the URL
    (for example when the path ends with a slash). Errors raised by
    urllib while downloading propagate unchanged.
    """
    # Local import: URL parsing is only needed by this method.
    from urllib.parse import urlparse

    filename = os.path.basename(urlparse(url).path)
    if not filename:
        raise ValueError("Cannot derive a local filename from URL %s" % url)

    if os.path.exists(filename):
        _logger.info("Locally cached PDF found at %s", filename)
        return filename

    _logger.info("Downloading PDF from remote URL %s", url)
    # Download to a side file first and rename it only once complete, so
    # an interrupted transfer is never mistaken for a cached PDF later.
    partial = filename + '.part'
    try:
        with open(partial, 'wb') as downloadedFile, urllib.request.urlopen(url) as response:
            shutil.copyfileobj(response, downloadedFile)
        os.replace(partial, filename)
    finally:
        # After a successful rename the side file is gone; on failure
        # remove whatever was written so far.
        if os.path.exists(partial):
            os.remove(partial)
    return filename
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Get the number of pages in a PDF file
|
Get the number of pages in a PDF file
|
||||||
"""
|
"""
|
||||||
|
@ -98,10 +117,13 @@ class Stitcher:
|
||||||
self.currentLevel = 3
|
self.currentLevel = 3
|
||||||
elif(tag =='a'):
|
elif(tag =='a'):
|
||||||
file = element.attrib.get('href')
|
file = element.attrib.get('href')
|
||||||
|
if(validators.url(file)):
|
||||||
|
file = self._cacheURL(file)
|
||||||
fit = element.attrib.get('fit', self.defaultFit)
|
fit = element.attrib.get('fit', self.defaultFit)
|
||||||
rotate = int(element.attrib.get('rotate', self.defaultRotate))
|
rotate = int(element.attrib.get('rotate', self.defaultRotate))
|
||||||
start = int(element.attrib.get('start', self.defaultStart))
|
start = int(element.attrib.get('start', self.defaultStart))
|
||||||
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
|
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
|
||||||
|
if self.defaultEnd is None else self.defaultEnd))
|
||||||
filters = (rotate, start, end)
|
filters = (rotate, start, end)
|
||||||
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
|
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
|
||||||
self.files.append((file, self.currentPage, filters))
|
self.files.append((file, self.currentPage, filters))
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
existing_bookmarks: remove
|
||||||
|
author: Wiki, the Cat
|
||||||
|
subject: A book about adventures of Wiki, the cat.
|
||||||
|
keywords: wiki,potato,jelly
|
||||||
|
# Super Potato Book
|
||||||
|
|
||||||
|
# Volume 1
|
||||||
|
|
||||||
|
[Part 1](1.pdf)
|
||||||
|
|
||||||
|
# Volume 2
|
||||||
|
|
||||||
|
[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
|
||||||
|
|
||||||
|
# Volume 3
|
||||||
|
|
||||||
|
[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)
|
Loading…
Reference in New Issue