scripts/epub.py

#!/usr/bin/env python
'''
python/curses epub reader. Requires Python 2 and BeautifulSoup 3.

Keyboard commands:
    Esc/q          - quit
    Tab/Left/Right - toggle between TOC and chapter views

TOC view:
    Up     - up a line
    Down   - down a line
    PgUp   - up a page
    PgDown - down a page

Chapter view:
    Up     - up a page
    Down   - down a page
    PgUp   - up a line
    PgDown - down a line
    i      - show images on the current page (requires PIL and fabulous)
    e      - open the chapter's html source in vim
'''
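# Usage (book.epub is a placeholder path; the flags are defined in the
# argparse block at the bottom of this file):
#   python scripts/epub.py book.epub           # interactive curses reader
#   python scripts/epub.py -d book.epub        # dump the whole book as text
#   python scripts/epub.py -d -c 72 book.epub  # dump, wrapped at 72 columns
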
import curses.wrapper, curses.ascii
import formatter, htmllib, locale, os, StringIO, re, readline, tempfile, zipfile
import base64, webbrowser

from BeautifulSoup import BeautifulSoup

try:
    from fabulous import image
    import PIL
except ImportError:
    images = False
else:
    images = True

locale.setlocale(locale.LC_ALL, 'en_US.utf-8')
basedir = ''
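
# run() drops out of curses mode, forks and execs an external program (vim,
# for the 'e' command), waits for it to exit, then restores the curses tty
# settings.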
def run(screen, program, *args):
    curses.nocbreak()
    screen.keypad(0)
    curses.echo()
    pid = os.fork()
    if not pid:
        os.execvp(program, (program,) + args)
    os.wait()[0]
    curses.noecho()
    screen.keypad(1)
    curses.cbreak()

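# open_image() renders an image inline in the terminal via fabulous, falling
# back to printing the temp file's path if rendering fails. It returns an
# error string when PIL/fabulous are missing so the caller can display it.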
def open_image(screen, name, s):
    ''' show images with PIL and fabulous '''
    if not images:
        # let the caller display the error in the status line
        return "missing PIL or fabulous"
    ext = os.path.splitext(name)[1]

    screen.erase()
    screen.refresh()
    curses.setsyx(0, 0)
    image_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
    image_file.write(s)
    image_file.close()
    try:
        print image.Image(image_file.name)
    except:
        print image_file.name
    finally:
        os.unlink(image_file.name)

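# textify() drives Python 2's htmllib stack: Parser emits parse events,
# Formatter buffers and spaces them, and Writer (a DumbWriter) word-wraps
# the output at maxcol. Images are replaced with an inline
# [img="path" "alt"] marker that the curses view scans for later.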
def textify(html_snippet, img_size=(80, 45), maxcol=72):
    ''' text dump of html '''
    class Parser(htmllib.HTMLParser):
        def anchor_end(self):
            self.anchor = None
        def handle_image(self, source, alt, ismap, align, width, height):
            global basedir
            self.handle_data(
                '[img="{0}{1}" "{2}"]'.format(basedir, source, alt)
            )

    class Formatter(formatter.AbstractFormatter):
        pass

    class Writer(formatter.DumbWriter):
        def __init__(self, fl, maxcol=72):
            formatter.DumbWriter.__init__(self, fl)
            self.maxcol = maxcol
        def send_label_data(self, data):
            self.send_flowing_data(data)
            self.send_flowing_data(' ')

    o = StringIO.StringIO()
    p = Parser(Formatter(Writer(o, maxcol)))
    p.feed(html_snippet)
    p.close()
    return o.getvalue()

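# An EPUB is a zip file: META-INF/container.xml points at the OPF package
# file, whose <manifest> lists every document and whose <spine> gives the
# reading order. Chapter titles come from the NCX table of contents when
# one is present.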
def table_of_contents(fl):
    global basedir

    # find the opf file
    soup = BeautifulSoup(fl.read('META-INF/container.xml'))
    opf = dict(soup.find('rootfile').attrs)['full-path']

    basedir = os.path.dirname(opf)
    if basedir:
        basedir = '{0}/'.format(basedir)

    soup = BeautifulSoup(fl.read(opf))

    # title
    yield (soup.find('dc:title').text, None)

    # all files, not in order
    x, ncx = {}, None
    for item in soup.find('manifest').findAll('item'):
        d = dict(item.attrs)
        x[d['id']] = '{0}{1}'.format(basedir, d['href'])
        if d['media-type'] == 'application/x-dtbncx+xml':
            ncx = '{0}{1}'.format(basedir, d['href'])

    # reading order, not all files
    y = []
    for item in soup.find('spine').findAll('itemref'):
        y.append(x[dict(item.attrs)['idref']])

    z = {}
    if ncx:
        # get titles from the toc
        soup = BeautifulSoup(fl.read(ncx))
        for navpoint in soup('navpoint'):
            k = navpoint.content.get('src', None)
            if k:
                # strip off any anchor text
                k = k.split('#')[0]
                z[k] = navpoint.navlabel.text

    # output
    for section in y:
        if section in z:
            yield (z[section].encode('utf-8'), section.encode('utf-8'))
        else:
            yield (u'', section.encode('utf-8').strip())

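# list_chaps() paints one screenful of the TOC; the book title (row 0 of the
# full list) is drawn bold, every other row is prefixed with its number.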
def list_chaps(screen, chaps, start, length):
    for i, (title, src) in enumerate(chaps[start:start+length]):
        try:
            if start == 0:
                screen.addstr(i, 0, ' {0}'.format(title), curses.A_BOLD)
            else:
                screen.addstr(i, 0, '{0:-5} {1}'.format(start, title))
        except:
            pass
        start += 1
    screen.refresh()
    return i

def check_epub(fl):
    return os.path.isfile(fl) and os.path.splitext(fl)[1].lower() == '.epub'

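# dump_epub() is the non-interactive path: it walks the TOC and prints each
# chapter's textified body to stdout.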
def dump_epub(fl, maxcol=float("+inf")):
    if not check_epub(fl):
        return
    fl = zipfile.ZipFile(fl, 'r')
    chaps = [i for i in table_of_contents(fl)]
    for title, src in chaps:
        print title
        print '-' * len(title)
        if src:
            soup = BeautifulSoup(fl.read(src))
            print textify(
                unicode(soup.find('body')).encode('utf-8'),
                maxcol=maxcol,
            )
        print '\n'

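# curses_epub() is the interactive reader. `start` is the first TOC row on
# screen, `cursor_row` the highlighted row, and chaps_pos remembers each
# chapter's scroll offset so you can hop between chapters without losing
# your place.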
def curses_epub(screen, fl):
    if not check_epub(fl):
        return

    #curses.mousemask(curses.BUTTON1_CLICKED)

    fl = zipfile.ZipFile(fl, 'r')
    chaps = [i for i in table_of_contents(fl)]
    chaps_pos = [0 for i in chaps]
    start = 0
    cursor_row = 0
    html = None

    # toc
    while True:
        curses.curs_set(1)
        maxy, maxx = screen.getmaxyx()

        if cursor_row >= maxy:
            cursor_row = maxy - 1

        len_chaps = list_chaps(screen, chaps, start, maxy)
        screen.move(cursor_row, 0)
        ch = screen.getch()

        # quit
        if ch == curses.ascii.ESC:
            return
        try:
            if chr(ch) == 'q':
                return
        except:
            pass

        # up/down line
        if ch in [curses.KEY_DOWN]:
            if start < len(chaps) - maxy:
                start += 1
                screen.clear()
            elif cursor_row < maxy - 1 and cursor_row < len_chaps:
                cursor_row += 1
        elif ch in [curses.KEY_UP]:
            if start > 0:
                start -= 1
                screen.clear()
            elif cursor_row > 0:
                cursor_row -= 1

        # up/down page
        elif ch in [curses.KEY_NPAGE]:
            if start + maxy - 1 < len(chaps):
                start += maxy - 1
                if len_chaps < maxy:
                    start = len(chaps) - maxy
                screen.clear()
        elif ch in [curses.KEY_PPAGE]:
            if start > 0:
                start -= maxy - 1
                if start < 0:
                    start = 0
                screen.clear()

        # to chapter
        elif ch in [curses.ascii.HT, curses.KEY_RIGHT, curses.KEY_LEFT]:
            if chaps[start + cursor_row][1]:
                html = fl.read(chaps[start + cursor_row][1])
                soup = BeautifulSoup(html)
                chap = textify(
                    unicode(soup.find('body')).encode('utf-8'),
                    img_size=screen.getmaxyx(),
                    maxcol=screen.getmaxyx()[1]
                ).split('\n')
            else:
                html = None
                chap = ''
            screen.clear()
            curses.curs_set(0)

            # chapter
            while True:
                maxy, maxx = screen.getmaxyx()
                images = []
                for i, line in enumerate(chap[
                    chaps_pos[start + cursor_row]:
                    chaps_pos[start + cursor_row] + maxy
                ]):
                    try:
                        screen.addstr(i, 0, line)
                        mch = re.search(r'\[img="([^"]+)" "([^"]*)"\]', line)
                        if mch:
                            images.append(mch.group(1))
                    except:
                        pass
                screen.refresh()
                ch = screen.getch()

                # quit
                if ch == curses.ascii.ESC:
                    return
                try:
                    if chr(ch) == 'q':
                        return
                except:
                    pass

                # to TOC
                if ch in [curses.ascii.HT, curses.KEY_RIGHT, curses.KEY_LEFT]:
                    screen.clear()
                    break

                # up/down page
                elif ch in [curses.KEY_DOWN]:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += maxy - 1
                        screen.clear()
                elif ch in [curses.KEY_UP]:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= maxy - 1
                        if chaps_pos[start + cursor_row] < 0:
                            chaps_pos[start + cursor_row] = 0
                        screen.clear()

                # up/down line
                elif ch in [curses.KEY_NPAGE]:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += 1
                        screen.clear()
                elif ch in [curses.KEY_PPAGE]:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= 1
                        screen.clear()

                #elif ch in [curses.KEY_MOUSE]:
                #    id, x, y, z, bstate = curses.getmouse()
                #    line = screen.instr(y, 0)
                #    mch = re.search(r'\[img="([^"]+)" "([^"]*)"\]', line)
                #    if mch:
                #        img_fl = mch.group(1)

                else:
                    try:
                        # show images found on this page
                        if chr(ch) == 'i':
                            for img in images:
                                err = open_image(screen, img, fl.read(img))
                                if err:
                                    screen.addstr(0, 0, err, curses.A_REVERSE)

                        # edit the chapter html in vim
                        elif chr(ch) == 'e' and html:
                            tmpfl = tempfile.NamedTemporaryFile(delete=False)
                            tmpfl.write(html)
                            tmpfl.close()
                            run(screen, 'vim', tmpfl.name)
                            with open(tmpfl.name) as changed:
                                new_html = changed.read()
                            os.unlink(tmpfl.name)
                            if new_html != html:
                                pass
                                # write to zipfile?

                            # go back to TOC
                            screen.clear()
                            break
                    except (ValueError, IndexError):
                        pass

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument('-d', '--dump', action='store_true',
                        help='dump EPUB to text')
    parser.add_argument('-c', '--cols', action='store', type=int,
                        default=float("+inf"),
                        help='number of columns to wrap; default is no wrapping')
    parser.add_argument('EPUB', help='view EPUB')
    args = parser.parse_args()

    if args.EPUB:
        if args.dump:
            dump_epub(args.EPUB, args.cols)
        else:
            try:
                curses.wrapper(curses_epub, args.EPUB)
            except KeyboardInterrupt:
                pass