mirror of
https://github.com/captn3m0/pystitcher.git
synced 2024-09-16 16:40:18 +00:00
Merge pull request #11 from captn3m0/feature/external_url
Add basic implementation of external URL fetching of PDFs
This commit is contained in:
commit
af4752bee1
@ -41,11 +41,13 @@ python_requires = >=3.6
|
|||||||
# PyPDF3: Read and write PDF files
|
# PyPDF3: Read and write PDF files
|
||||||
# Markdown: Render input markdown file to HTML
|
# Markdown: Render input markdown file to HTML
|
||||||
# html5: Parse HTML file to generate bookmarks
|
# html5: Parse HTML file to generate bookmarks
|
||||||
|
# validators: Validate URL for fetching external PDF
|
||||||
install_requires =
|
install_requires =
|
||||||
importlib-metadata; python_version<"3.8"
|
importlib-metadata; python_version<"3.8"
|
||||||
PyPDF3>=1.0.4
|
PyPDF3>=1.0.4
|
||||||
Markdown>=3.3.4
|
Markdown>=3.3.4
|
||||||
html5>=0.0.9
|
html5>=0.0.9
|
||||||
|
validators>=0.18.1
|
||||||
|
|
||||||
[options.packages.find]
|
[options.packages.find]
|
||||||
where = src
|
where = src
|
||||||
|
@ -1,12 +1,17 @@
|
|||||||
import os
|
import os
|
||||||
import markdown
|
import logging
|
||||||
from .bookmark import Bookmark
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import urllib.request
|
||||||
|
import validators
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
|
import markdown
|
||||||
|
|
||||||
from PyPDF3 import PdfFileWriter, PdfFileReader
|
from PyPDF3 import PdfFileWriter, PdfFileReader
|
||||||
from PyPDF3.generic import FloatObject
|
from PyPDF3.generic import FloatObject
|
||||||
from pystitcher import __version__
|
from pystitcher import __version__
|
||||||
import tempfile
|
from .bookmark import Bookmark
|
||||||
import logging
|
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -45,6 +50,20 @@ class Stitcher:
|
|||||||
for e in document.iter():
|
for e in document.iter():
|
||||||
self.iter(e)
|
self.iter(e)
|
||||||
|
|
||||||
|
"""
|
||||||
|
Check if file has been cached locally and if
|
||||||
|
not cached, download from provided URL. Return
|
||||||
|
download filename
|
||||||
|
"""
|
||||||
|
def _cacheURL(self, url):
    """
    Return the local filename for the PDF at ``url``, downloading it
    into the current working directory when no file of that name
    exists there yet.

    The filename is the last component of the URL *path*; query
    strings and fragments are ignored so that
    ``https://host/a.pdf?token=x`` is cached as ``a.pdf``.

    Raises ValueError when the URL path has no final component
    (for example ``https://host/dir/``), because there is no name
    to cache the download under.
    """
    # Local import keeps this method self-contained; the module only
    # imports urllib.request at file level.
    from urllib.parse import urlsplit

    filename = os.path.basename(urlsplit(url).path)
    if not filename:
        raise ValueError("Cannot derive a local filename from URL %s" % url)

    if os.path.exists(filename):
        _logger.info("Locally cached PDF found at %s", filename)
        return filename

    _logger.info("Downloading PDF from remote URL %s", url)
    # Download to a side file and rename it only once the copy has
    # finished, so an interrupted transfer never leaves a truncated
    # file that a later run would mistake for a valid cache entry.
    partial = filename + '.part'
    try:
        with urllib.request.urlopen(url) as response, open(partial, 'wb') as downloadedFile:
            shutil.copyfileobj(response, downloadedFile)
        os.replace(partial, filename)
    finally:
        # After a successful rename the side file no longer exists;
        # it is only still present here when the download failed.
        if os.path.exists(partial):
            os.remove(partial)
    return filename
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Get the number of pages in a PDF file
|
Get the number of pages in a PDF file
|
||||||
"""
|
"""
|
||||||
@ -98,10 +117,13 @@ class Stitcher:
|
|||||||
self.currentLevel = 3
|
self.currentLevel = 3
|
||||||
elif(tag =='a'):
|
elif(tag =='a'):
|
||||||
file = element.attrib.get('href')
|
file = element.attrib.get('href')
|
||||||
|
if(validators.url(file)):
|
||||||
|
file = self._cacheURL(file)
|
||||||
fit = element.attrib.get('fit', self.defaultFit)
|
fit = element.attrib.get('fit', self.defaultFit)
|
||||||
rotate = int(element.attrib.get('rotate', self.defaultRotate))
|
rotate = int(element.attrib.get('rotate', self.defaultRotate))
|
||||||
start = int(element.attrib.get('start', self.defaultStart))
|
start = int(element.attrib.get('start', self.defaultStart))
|
||||||
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
|
end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
|
||||||
|
if self.defaultEnd is None else self.defaultEnd))
|
||||||
filters = (rotate, start, end)
|
filters = (rotate, start, end)
|
||||||
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
|
b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
|
||||||
self.files.append((file, self.currentPage, filters))
|
self.files.append((file, self.currentPage, filters))
|
||||||
|
17
tests/book-external-url.md
Normal file
17
tests/book-external-url.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
existing_bookmarks: remove
|
||||||
|
author: Wiki, the Cat
|
||||||
|
subject: A book about adventures of Wiki, the cat.
|
||||||
|
keywords: wiki,potato,jelly
|
||||||
|
# Super Potato Book
|
||||||
|
|
||||||
|
# Volume 1
|
||||||
|
|
||||||
|
[Part 1](1.pdf)
|
||||||
|
|
||||||
|
# Volume 2
|
||||||
|
|
||||||
|
[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
|
||||||
|
|
||||||
|
# Volume 3
|
||||||
|
|
||||||
|
[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)
|
Loading…
Reference in New Issue
Block a user