mirror of https://github.com/captn3m0/Scripts.git
Script to read epubs in console.
This commit is contained in:
parent
ee0314b046
commit
e21f8b5739
|
@ -0,0 +1,369 @@
|
|||
#!/usr/bin/env python
|
||||
'''
|
||||
python/curses epub reader. Requires BeautifulSoup
|
||||
|
||||
Keyboard commands:
|
||||
Esc/q - quit
|
||||
Tab/Left/Right - toggle between TOC and chapter views
|
||||
TOC view:
|
||||
Up - up a line
|
||||
Down - down a line
|
||||
PgUp - up a page
|
||||
PgDown - down a page
|
||||
Chapter view:
|
||||
Up - up a page
|
||||
Down - down a page
|
||||
PgUp - up a line
|
||||
PgDown - down a line
|
||||
i - open images on page in web browser
|
||||
'''
|
||||
|
||||
import curses.wrapper, curses.ascii
|
||||
import formatter, htmllib, locale, os, StringIO, re, readline, tempfile, zipfile
|
||||
import base64, webbrowser
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
try:
|
||||
from fabulous import image
|
||||
import PIL
|
||||
except ImportError:
|
||||
images = False
|
||||
else:
|
||||
images = True
|
||||
|
||||
locale.setlocale(locale.LC_ALL, 'en_US.utf-8')
|
||||
|
||||
basedir = ''
|
||||
|
||||
def run(screen, program, *args):
    ''' Suspend curses and run an external program until it exits.

    Terminal modes are restored to normal for the child and switched
    back to curses modes afterwards.
    '''
    # Give the child a sane tty: line-buffered input, working keypad, echo.
    curses.nocbreak()
    screen.keypad(0)
    curses.echo()
    pid = os.fork()
    if not pid:
        try:
            os.execvp(program, (program,) + args)
        finally:
            # execvp only returns on failure; make sure the child never
            # falls back into the parent's code path.
            os._exit(127)
    os.wait()[0]
    # Back to curses modes.
    curses.noecho()
    screen.keypad(1)
    curses.cbreak()
|
||||
|
||||
def open_image(screen, name, s):
|
||||
''' show images with PIL and fabulous '''
|
||||
if not images:
|
||||
screen.addstr(0, 0, "missing PIL or fabulous", curses.A_REVERSE)
|
||||
return
|
||||
|
||||
ext = os.path.splitext(name)[1]
|
||||
|
||||
screen.erase()
|
||||
screen.refresh()
|
||||
curses.setsyx(0, 0)
|
||||
image_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
||||
image_file.write(s)
|
||||
image_file.close()
|
||||
try:
|
||||
print image.Image(image_file.name)
|
||||
except:
|
||||
print image_file.name
|
||||
finally:
|
||||
os.unlink(image_file.name)
|
||||
|
||||
def textify(html_snippet, img_size=(80, 45), maxcol=72):
    ''' Render an HTML snippet to plain text.

    Images become ``[img="path" "alt"]`` placeholders so the curses view
    can locate and open them later.  ``img_size`` is accepted for
    interface compatibility but is not used by this renderer.
    '''
    class _Parser(htmllib.HTMLParser):
        def anchor_end(self):
            # drop link state; we render anchors as plain text
            self.anchor = None

        def handle_image(self, source, alt, ismap, alight, width, height):
            global basedir
            placeholder = '[img="{0}{1}" "{2}"]'.format(basedir, source, alt)
            self.handle_data(placeholder)

    class _Formatter(formatter.AbstractFormatter):
        pass

    class _Writer(formatter.DumbWriter):
        def __init__(self, fl, maxcol=72):
            formatter.DumbWriter.__init__(self, fl)
            self.maxcol = maxcol

        def send_label_data(self, data):
            # list labels flow inline, separated by a space
            self.send_flowing_data(data)
            self.send_flowing_data(' ')

    out = StringIO.StringIO()
    html_parser = _Parser(_Formatter(_Writer(out, maxcol)))
    html_parser.feed(html_snippet)
    html_parser.close()

    return out.getvalue()
|
||||
|
||||
def table_of_contents(fl):
    ''' Yield (title, src) pairs for an open epub ZipFile.

    The first pair is the book title with src None; the remaining pairs
    follow the spine (reading) order.  Titles come from the NCX table of
    contents when available, otherwise an empty string.  Sets the
    module-level ``basedir`` as a side effect.
    '''
    global basedir

    # the container points at the OPF package file
    soup = BeautifulSoup(fl.read('META-INF/container.xml'))
    opf = dict(soup.find('rootfile').attrs)['full-path']

    # every href in the OPF is relative to the OPF's directory
    basedir = os.path.dirname(opf)
    if basedir:
        basedir = '{0}/'.format(basedir)

    soup = BeautifulSoup(fl.read(opf))

    # title
    yield (soup.find('dc:title').text, None)

    # manifest: all files, not in order
    x, ncx = {}, None
    for item in soup.find('manifest').findAll('item'):
        d = dict(item.attrs)
        x[d['id']] = '{0}{1}'.format(basedir, d['href'])
        if d['media-type'] == 'application/x-dtbncx+xml':
            ncx = '{0}{1}'.format(basedir, d['href'])

    # spine: reading order, not all files
    y = []
    for item in soup.find('spine').findAll('itemref'):
        y.append(x[dict(item.attrs)['idref']])

    z = {}
    if ncx:
        # get titles from the toc
        soup = BeautifulSoup(fl.read(ncx))

        for navpoint in soup('navpoint'):
            k = navpoint.content.get('src', None)
            # guard first: a navpoint without src used to crash on
            # None.split(); then strip off any anchor text
            if k:
                k = k.split('#')[0]
            if k:
                z[k] = navpoint.navlabel.text

    # output
    for section in y:
        if section in z:
            yield (z[section].encode('utf-8'), section.encode('utf-8'))
        else:
            yield (u'', section.encode('utf-8').strip())
|
||||
|
||||
def list_chaps(screen, chaps, start, length):
    ''' Paint one page of the table of contents.

    Draws ``chaps[start:start+length]`` from the top of the screen and
    returns the screen row of the last entry drawn, or -1 when nothing
    was drawn (previously this raised NameError on an empty slice).
    '''
    last_row = -1
    for i, (title, src) in enumerate(chaps[start:start+length]):
        try:
            if start == 0:
                # row 0 of the book is the title entry -- emphasize it
                screen.addstr(i, 0, ' {0}'.format(title), curses.A_BOLD)
            else:
                screen.addstr(i, 0, '{0:-5} {1}'.format(start, title))
        except curses.error:
            # best-effort drawing: writes past the screen edge are skipped
            pass
        last_row = i
        start += 1
    screen.refresh()
    return last_row
|
||||
|
||||
def check_epub(fl):
    ''' Return True when *fl* names an existing file with a .epub
    extension (case-insensitive); otherwise fall through to None. '''
    is_file = os.path.isfile(fl)
    has_epub_ext = os.path.splitext(fl)[1].lower() == '.epub'
    if is_file and has_epub_ext:
        return True
|
||||
|
||||
def dump_epub(fl, maxcol=float("+inf")):
|
||||
if not check_epub(fl):
|
||||
return
|
||||
fl = zipfile.ZipFile(fl, 'r')
|
||||
chaps = [i for i in table_of_contents(fl)]
|
||||
for title, src in chaps:
|
||||
print title
|
||||
print '-' * len(title)
|
||||
if src:
|
||||
soup = BeautifulSoup(fl.read(src))
|
||||
print textify(
|
||||
unicode(soup.find('body')).encode('utf-8'),
|
||||
maxcol=maxcol,
|
||||
)
|
||||
print '\n'
|
||||
|
||||
def curses_epub(screen, fl):
    ''' Interactive curses viewer: a TOC screen and a chapter screen.

    Key bindings are listed in the module docstring.  Each chapter's
    scroll position is remembered (``chaps_pos``) for the whole session.
    Returns (quitting) on Esc or 'q' from either screen.
    '''
    if not check_epub(fl):
        return

    #curses.mousemask(curses.BUTTON1_CLICKED)

    fl = zipfile.ZipFile(fl, 'r')
    chaps = [i for i in table_of_contents(fl)]
    chaps_pos = [0 for i in chaps]  # per-chapter scroll offset, in lines
    start = 0        # index of the first TOC entry currently on screen
    cursor_row = 0   # cursor position relative to the top of the screen

    # toc
    while True:
        curses.curs_set(1)
        maxy, maxx = screen.getmaxyx()

        # the terminal may have shrunk since the last draw
        if cursor_row >= maxy:
            cursor_row = maxy - 1

        len_chaps = list_chaps(screen, chaps, start, maxy)
        screen.move(cursor_row, 0)
        ch = screen.getch()

        # quit
        if ch == curses.ascii.ESC:
            return
        try:
            if chr(ch) == 'q':
                return
        except:
            # chr() raises for special keys (arrows etc.) outside 0-255
            pass

        # up/down line
        if ch in [curses.KEY_DOWN]:
            if start < len(chaps) - maxy:
                # scroll the list; the cursor keeps its screen row
                start += 1
                screen.clear()
            elif cursor_row < maxy - 1 and cursor_row < len_chaps:
                cursor_row += 1
        elif ch in [curses.KEY_UP]:
            if start > 0:
                start -= 1
                screen.clear()
            elif cursor_row > 0:
                cursor_row -= 1

        # up/down page
        elif ch in [curses.KEY_NPAGE]:
            if start + maxy - 1 < len(chaps):
                start += maxy - 1
                if len_chaps < maxy:
                    # clamp so the last page stays full
                    start = len(chaps) - maxy
                screen.clear()
        elif ch in [curses.KEY_PPAGE]:
            if start > 0:
                start -= maxy - 1
                if start < 0:
                    start = 0
                screen.clear()

        # to chapter
        elif ch in [curses.ascii.HT, curses.KEY_RIGHT, curses.KEY_LEFT]:
            if chaps[start + cursor_row][1]:
                html = fl.read(chaps[start + cursor_row][1])
                soup = BeautifulSoup(html)
                chap = textify(
                    unicode(soup.find('body')).encode('utf-8'),
                    img_size=screen.getmaxyx(),
                    maxcol=screen.getmaxyx()[1]
                ).split('\n')
            else:
                # the title row (src None) has no document to show
                chap = ''
            screen.clear()
            curses.curs_set(0)

            # chapter
            while True:
                maxy, maxx = screen.getmaxyx()
                # NOTE(review): this local shadows the module-level
                # `images` feature flag set at import time
                images = []
                for i, line in enumerate(chap[
                    chaps_pos[start + cursor_row]:
                    chaps_pos[start + cursor_row] + maxy
                ]):
                    try:
                        screen.addstr(i, 0, line)
                        # collect image placeholders visible on this page
                        mch = re.search('\[img="([^"]+)" "([^"]*)"\]', line)
                        if mch:
                            images.append(mch.group(1))
                    except:
                        # best-effort draw; off-screen writes are ignored
                        pass
                screen.refresh()
                ch = screen.getch()

                # quit
                if ch == curses.ascii.ESC:
                    return
                try:
                    if chr(ch) == 'q':
                        return
                except:
                    pass

                # to TOC
                if ch in [curses.ascii.HT, curses.KEY_RIGHT, curses.KEY_LEFT]:
                    screen.clear()
                    break

                # up/down page
                elif ch in [curses.KEY_DOWN]:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += maxy - 1
                        screen.clear()
                elif ch in [curses.KEY_UP]:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= maxy - 1
                        if chaps_pos[start + cursor_row] < 0:
                            chaps_pos[start + cursor_row] = 0
                        screen.clear()

                # up/down line
                elif ch in [curses.KEY_NPAGE]:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += 1
                        screen.clear()
                elif ch in [curses.KEY_PPAGE]:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= 1
                        screen.clear()

                #elif ch in [curses.KEY_MOUSE]:
                #    id, x, y, z, bstate = curses.getmouse()
                #    line = screen.instr(y, 0)
                #    mch = re.search('\[img="([^"]+)" "([^"]*)"\]', line)
                #    if mch:
                #        img_fl = mch.group(1)

                else:
                    try:
                        if chr(ch) == 'i':
                            # open every image visible on this page
                            for img in images:
                                err = open_image(screen, img, fl.read(img))
                                if err:
                                    screen.addstr(0, 0, err, curses.A_REVERSE)

                        # edit html
                        elif chr(ch) == 'e':

                            # NOTE(review): `html` looks unbound here when
                            # the current entry had no src -- confirm
                            tmpfl = tempfile.NamedTemporaryFile(delete=False)
                            tmpfl.write(html)
                            tmpfl.close()
                            run(screen, 'vim', tmpfl.name)
                            with open(tmpfl.name) as changed:
                                new_html = changed.read()
                            os.unlink(tmpfl.name)
                            if new_html != html:
                                pass
                                # write to zipfile?

                            # go back to TOC
                            screen.clear()
                            break

                    except (ValueError, IndexError):
                        pass
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    # Reuse the module docstring verbatim as the --help description.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    cli.add_argument('-d', '--dump', action='store_true',
                     help='dump EPUB to text')
    cli.add_argument('-c', '--cols', action='store', type=int,
                     default=float("+inf"),
                     help='Number of columns to wrap; default is no wrapping.')
    cli.add_argument('EPUB', help='view EPUB')
    opts = cli.parse_args()

    if opts.EPUB:
        if opts.dump:
            dump_epub(opts.EPUB, opts.cols)
        else:
            try:
                curses.wrapper(curses_epub, opts.EPUB)
            except KeyboardInterrupt:
                pass
|
Loading…
Reference in New Issue