Merge pull request #11 from captn3m0/feature/external_url

Add basic implementation of external URL fetching of PDFs
Vonter 2021-06-27 20:51:10 +05:30 committed by GitHub
commit af4752bee1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 5 deletions


@@ -41,11 +41,13 @@ python_requires = >=3.6
 # PyPDF3: Read and write PDF files
 # Markdown: Render input markdown file to HTML
 # html5: Parse HTML file to generate bookmarks
+# validators: Validate URL for fetching external PDF
 install_requires =
     importlib-metadata; python_version<"3.8"
     PyPDF3>=1.0.4
     Markdown>=3.3.4
     html5>=0.0.9
+    validators>=0.18.1
 
 [options.packages.find]
 where = src


@@ -1,12 +1,17 @@
 import os
-import markdown
-from .bookmark import Bookmark
+import logging
+import shutil
+import tempfile
+import urllib.request
+import validators
 import html5lib
+import markdown
 from PyPDF3 import PdfFileWriter, PdfFileReader
 from PyPDF3.generic import FloatObject
 from pystitcher import __version__
-import tempfile
-import logging
+from .bookmark import Bookmark
 
 _logger = logging.getLogger(__name__)
@@ -45,6 +50,20 @@ class Stitcher:
         for e in document.iter():
             self.iter(e)
 
+    """
+    Check if file has been cached locally and if
+    not cached, download from provided URL. Return
+    download filename
+    """
+    def _cacheURL(self, url):
+        if not os.path.exists(os.path.basename(url)):
+            _logger.info("Downloading PDF from remote URL %s", url)
+            with urllib.request.urlopen(url) as response, open(os.path.basename(url), 'wb') as downloadedFile:
+                shutil.copyfileobj(response, downloadedFile)
+        else:
+            _logger.info("Locally cached PDF found at %s", os.path.basename(url))
+        return os.path.basename(url)
+
     """
     Get the number of pages in a PDF file
     """
@@ -98,10 +117,13 @@
             self.currentLevel = 3
         elif(tag =='a'):
             file = element.attrib.get('href')
+            if(validators.url(file)):
+                file = self._cacheURL(file)
             fit = element.attrib.get('fit', self.defaultFit)
             rotate = int(element.attrib.get('rotate', self.defaultRotate))
             start = int(element.attrib.get('start', self.defaultStart))
-            end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file) if self.defaultEnd is None else self.defaultEnd))
+            end = int(element.attrib.get('end', self._get_pdf_number_of_pages(file)
+                                         if self.defaultEnd is None else self.defaultEnd))
             filters = (rotate, start, end)
             b = Bookmark(self.currentPage, element.text, self.currentLevel+1, fit)
             self.files.append((file, self.currentPage, filters))
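`validators.url()` is truthy only for syntactically valid absolute URLs, so ordinary local paths in the spec bypass the download entirely. A small sketch of that gating, assuming the `validators` package added in setup.cfg (the example URL is made up):

```python
import validators

for href in ["1.pdf", "https://example.com/files/sample.pdf"]:
    if validators.url(href):
        # a real URL validates, so it would be downloaded and cached
        print(href, "-> remote, fetch and cache")
    else:
        # a bare filename fails validation (falsy result), so it is used as-is
        print(href, "-> local file, used directly")
```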


@@ -0,0 +1,17 @@
+existing_bookmarks: remove
+author: Wiki, the Cat
+subject: A book about adventures of Wiki, the cat.
+keywords: wiki,potato,jelly
+
+# Super Potato Book
+
+# Volume 1
+[Part 1](1.pdf)
+
+# Volume 2
+[Part 2](https://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf)
+
+# Volume 3
+[Part 3](https://juventudedesporto.cplp.org/files/sample-pdf_9359.pdf)
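After stitching this spec, the local part stays where it is and the two remote parts should appear in the working directory under their basenames. A rough, hypothetical post-run check (not part of the change):

```python
import os

# 1.pdf is the local part; the other two are the basenames under which
# _cacheURL would store the downloaded remote parts.
for name in ["1.pdf", "pdf-sample.pdf", "sample-pdf_9359.pdf"]:
    print(name, "exists:", os.path.exists(name))
```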