From b652cbcab3e236bf70889494f871b5fe1a2d4622 Mon Sep 17 00:00:00 2001 From: Abhay Rana Date: Sun, 15 Mar 2015 17:55:47 +0530 Subject: [PATCH] Helps in CTF problems --- pdf-parser.py | 1031 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1031 insertions(+) create mode 100755 pdf-parser.py diff --git a/pdf-parser.py b/pdf-parser.py new file mode 100755 index 0000000..1a86cc4 --- /dev/null +++ b/pdf-parser.py @@ -0,0 +1,1031 @@ +#!/usr/bin/python + +__description__ = 'pdf-parser, use it to parse a PDF document' +__author__ = 'Didier Stevens' +__version__ = '0.4.3' +__date__ = '2013/09/18' +__minimum_python_version__ = (2, 5, 1) +__maximum_python_version__ = (3, 3, 0) + +""" +Source code put in public domain by Didier Stevens, no Copyright +https://DidierStevens.com +Use at your own risk + +History: + 2008/05/02: continue + 2008/05/03: continue + 2008/06/02: streams + 2008/10/19: refactor, grep & extract functionality + 2008/10/20: reference + 2008/10/21: cleanup + 2008/11/12: V0.3 dictionary parser + 2008/11/13: option elements + 2008/11/14: continue + 2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco) + 2009/05/11: V0.3.1 updated usage, added --verbose and --extract + 2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco) + 2009/07/18: bugfix EqualCanonical + 2009/07/24: V0.3.3 Added --hash option + 2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput + 2009/07/28: V0.3.4 Added ASCII85Decode support + 2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation + 2009/08/30: V0.3.6 TestPythonVersion + 2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option + 2010/01/09: Fixed parsing of incomplete startxref + 2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option + 2011/12/17: fixed bugs empty objects + 2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt) + 2013/01/11: V0.3.10 Extract and dump bug fixes by 
#Convert 2 Bytes If Python 3
def C2BIP3(string):
    """Return *string* as a byte string suitable for binary file/ZIP APIs.

    On Python 2 the argument is returned unchanged (str is already a byte
    string).  On Python 3 a text string whose characters are all in the
    range 0-255 is converted character-by-character to ``bytes``.

    Values that already are ``bytes``/``bytearray`` are passed through on
    Python 3: iterating them yields ints, and calling ord() on an int raised
    TypeError in the previous implementation (e.g. when dumping data that
    zlib had already returned as bytes).

    Raises ValueError on Python 3 if a character is above U+00FF.
    """
    if sys.version_info[0] > 2:
        # bytes/bytearray only need to exist as names on Python 3, where
        # this branch runs.
        if isinstance(string, (bytes, bytearray)):
            return bytes(string)
        return bytes([ord(x) for x in string])
    else:
        return string
def IsNumeric(str):
    """Return a truthy match object if *str* consists only of ASCII digits.

    The previous pattern ('^[0-9]+') only anchored the start, so tokens such
    as '12abc' or '1.5' were reported as numeric and subsequently handed to
    integer conversion by the callers.  The whole token must now be digits.

    The parameter keeps its original name (it shadows the builtin ``str``)
    so keyword callers are unaffected.  Returns None when there is no match.
    """
    # \Z instead of $ so a trailing newline can never be accepted.
    return re.match(r'^[0-9]+\Z', str)
class cPDFParser:
    """Token-level PDF parser that yields one top-level element per call.

    GetObject() pulls tokens from a cPDFTokenizer and groups them into
    comments, indirect objects, xref sections, trailers, startxref entries
    and (when *extract* is set) a trailing malformed blob.
    """

    def __init__(self, file, verbose=False, extract=None):
        """Create a parser for *file* (a path or URL handed to cPDFTokenizer).

        verbose -- print 'todo N' diagnostics for tokens that are skipped.
        extract -- when truthy, unparseable trailing data is returned as a
                   cPDFElementMalformed instead of being skipped.
        """
        self.context = CONTEXT_NONE
        self.content = []
        self.oPDFTokenizer = cPDFTokenizer(file)
        self.verbose = verbose
        self.extract = extract

    def _ParseDecimal(self, text):
        """Return *text* as an int if it is made only of ASCII digits, else None."""
        # Explicit digit test: int() alone would also accept '+1', ' 1' or
        # (on newer Pythons) '1_0', none of which are PDF integers here.
        if text and all(c in '0123456789' for c in text):
            return int(text, 10)
        return None

    def _UngetToken(self, token):
        """Push *token* back on the tokenizer unless it is the EOF marker (None)."""
        if token is not None:
            self.oPDFTokenizer.unget(token)

    def GetObject(self):
        """Return the next PDF element, or None when the input is exhausted.

        The return value is one of the cPDFElement* instances.  Object and
        generation numbers as well as the startxref offset are converted
        with a strict base-10 parse; the earlier eval() turned '010' into 8
        on Python 2, raised SyntaxError on Python 3 and executed text taken
        from the (untrusted) document.  A token that merely starts with a
        digit is no longer treated as a number.  Hitting end-of-file in the
        middle of a construct no longer raises TypeError.
        """
        while True:
            # Whitespace is significant inside an object (stream data), so it
            # is only skipped outside of CONTEXT_OBJ.
            if self.context == CONTEXT_OBJ:
                self.token = self.oPDFTokenizer.Token()
            else:
                self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if self.token:
                if self.token[0] == CHAR_DELIMITER:
                    if self.token[1][0] == '%':
                        if self.context == CONTEXT_OBJ:
                            self.content.append(self.token)
                        else:
                            return cPDFElementComment(self.token[1])
                    elif self.token[1] == '/':
                        # Glue '/' and the following regular token into one name token.
                        self.token2 = self.oPDFTokenizer.Token()
                        if self.token2 is not None and self.token2[0] == CHAR_REGULAR:
                            if self.context != CONTEXT_NONE:
                                self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
                            elif self.verbose:
                                print('todo 1: %s' % (self.token[1] + self.token2[1]))
                        else:
                            self._UngetToken(self.token2)
                            if self.context != CONTEXT_NONE:
                                self.content.append(self.token)
                            elif self.verbose:
                                print('todo 2: %d %s' % (self.token[0], repr(self.token[1])))
                    elif self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 3: %d %s' % (self.token[0], repr(self.token[1])))
                elif self.token[0] == CHAR_WHITESPACE:
                    if self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 4: %d %s' % (self.token[0], repr(self.token[1])))
                else:
                    if self.context == CONTEXT_OBJ:
                        if self.token[1] == 'endobj':
                            self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementIndirectObject
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_TRAILER:
                        if self.token[1] == 'startxref' or self.token[1] == 'xref':
                            self.oPDFElementTrailer = cPDFElementTrailer(self.content)
                            # The keyword starts the next element; re-read it next call.
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementTrailer
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_XREF:
                        if self.token[1] == 'trailer' or self.token[1] == 'xref':
                            self.oPDFElementXref = cPDFElementXref(self.content)
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementXref
                        else:
                            self.content.append(self.token)
                    else:
                        objectId = self._ParseDecimal(self.token[1])
                        if objectId is not None:
                            # Look ahead for '<id> <generation> obj'.
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            objectVersion = None
                            if self.token2 is not None:
                                objectVersion = self._ParseDecimal(self.token2[1])
                            if objectVersion is not None:
                                self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                                if self.token3 is not None and self.token3[1] == 'obj':
                                    self.objectId = objectId
                                    self.objectVersion = objectVersion
                                    self.context = CONTEXT_OBJ
                                else:
                                    # Not an object header: restore the look-ahead
                                    # (last read is pushed first, it is a stack).
                                    self._UngetToken(self.token3)
                                    self._UngetToken(self.token2)
                                    if self.verbose:
                                        print('todo 6: %d %s' % (self.token[0], repr(self.token[1])))
                            else:
                                self._UngetToken(self.token2)
                                if self.verbose:
                                    print('todo 7: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.token[1] == 'trailer':
                            self.context = CONTEXT_TRAILER
                            self.content = [self.token]
                        elif self.token[1] == 'xref':
                            self.context = CONTEXT_XREF
                            self.content = [self.token]
                        elif self.token[1] == 'startxref':
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            offset = None
                            if self.token2 is not None:
                                offset = self._ParseDecimal(self.token2[1])
                            if offset is not None:
                                return cPDFElementStartxref(offset)
                            else:
                                self._UngetToken(self.token2)
                                if self.verbose:
                                    print('todo 9: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.extract:
                            # Swallow everything that is left as one malformed blob.
                            self.bytes = ''
                            while self.token:
                                self.bytes += self.token[1]
                                self.token = self.oPDFTokenizer.Token()
                            return cPDFElementMalformed(self.bytes)
                        elif self.verbose:
                            print('todo 10: %d %s' % (self.token[0], repr(self.token[1])))
            else:
                break
def StreamContains(self, keyword, filter, casesensitive, regex):
    """Tell whether the object's stream data contains *keyword*.

    keyword       -- text (or regular expression when *regex* is set).
    filter        -- search the decoded stream; falls back to the raw
                     stream when the object declares no filters.
    casesensitive -- match case exactly.
    regex         -- treat *keyword* as a regular expression.

    Returns False when the object has no stream or no stream data could be
    extracted.  For *regex* the result of re.search() is returned (a match
    object or None); otherwise a bool.
    """
    if not self.ContainsStream():
        return False
    streamData = self.Stream(filter)
    if filter and streamData == 'No filters':
        streamData = self.Stream(False)
    if not isinstance(streamData, str):
        if hasattr(streamData, 'decode'):
            # Python 3: a decoder may hand back bytes; map them 1:1 to text
            # so the str keyword can be searched for.
            streamData = streamData.decode('latin-1')
        else:
            # Stream() returns the list of filter names when it never saw a
            # complete stream; there is nothing to search in that case
            # (previously this crashed on .lower()).
            return False
    if regex:
        flags = 0
        if not casesensitive:
            flags = re.I
        return re.search(keyword, streamData, flags)
    elif casesensitive:
        return keyword in streamData
    else:
        return keyword.lower() in streamData.lower()
def Decompress(self, data, filters):
    """Apply the stream *filters* to *data* in order and return the result.

    data    -- raw stream data.
    filters -- list of filter names as found in the /Filter entry; both the
               full names and the abbreviated forms are recognised.

    Returns the decoded data.  On failure a human readable message string is
    returned instead (callers print it), 'Unsupported filter: ...' for
    unknown filters and 'No filters' when *filters* is empty.
    """
    for filterName in filters:
        if EqualCanonical(filterName, '/FlateDecode') or EqualCanonical(filterName, '/Fl'):
            try:
                data = FlateDecode(data)
            except zlib.error:
                # sys.exc_info() instead of "except X, e" / "except X as e":
                # the former is a SyntaxError on Python 3, the latter on 2.5.
                error = sys.exc_info()[1]
                message = 'FlateDecode decompress failed'
                if len(data) > 0:
                    firstByte = data[0]
                    if not isinstance(firstByte, int):
                        # str input: indexing yields a 1-character string.
                        firstByte = ord(firstByte)
                    # Low nibble of the zlib header is the compression
                    # method; 8 means deflate.
                    if firstByte & 0x0F != 8:
                        message += ', unexpected compression method: %02x' % firstByte
                return message + '. zlib.error %s' % error
        elif EqualCanonical(filterName, '/ASCIIHexDecode') or EqualCanonical(filterName, '/AHx'):
            try:
                data = ASCIIHexDecode(data)
            except Exception:
                return 'ASCIIHexDecode decompress failed'
        elif EqualCanonical(filterName, '/ASCII85Decode') or EqualCanonical(filterName, '/A85'):
            try:
                data = ASCII85Decode(data.rstrip('>'))
            except Exception:
                return 'ASCII85Decode decompress failed'
        elif EqualCanonical(filterName, '/LZWDecode') or EqualCanonical(filterName, '/LZW'):
            try:
                data = LZWDecode(data)
            except Exception:
                return 'LZWDecode decompress failed'
        # '/RL' is the standard abbreviation; '/R' is kept because earlier
        # versions of this tool accepted it.
        elif EqualCanonical(filterName, '/RunLengthDecode') or EqualCanonical(filterName, '/RL') or EqualCanonical(filterName, '/R'):
            try:
                data = RunLengthDecode(data)
            except Exception:
                return 'RunLengthDecode decompress failed'
#        elif i.startswith('/CC')                        # CCITTFaxDecode
#        elif i.startswith('/DCT')                       # DCTDecode
        else:
            return 'Unsupported filter: %s' % repr(filters)
    if len(filters) == 0:
        return 'No filters'
    else:
        return data
def ParseDictionary(self, tokens):
    """Parse a '<< ... >>' token run into a list of (key, value) pairs.

    tokens -- list of (class, text) tokens; tokens[0] must be '<<'.

    Returns a tuple (dictionary, remaining):
      * dictionary is a list of (key, value) tuples where value is a list of
        strings, or a nested dictionary list; None if tokens does not start
        with '<<' (or is empty).
      * remaining is the unconsumed token list; when the dictionary was
        closed normally its first element is the closing '>>' token.

    If the tokens run out before the closing '>>' (unbalanced input) the
    entries collected so far are returned instead of falling off the end of
    the function, which used to return a bare None and crashed callers that
    unpack or index the result.
    """
    state = 0 # start
    dictionary = []
    key = None
    value = []
    while tokens != []:
        if state == 0:
            if self.isOpenDictionary(tokens[0]):
                state = 1
            else:
                return None, tokens
        elif state == 1:
            # Expecting a key (or the end of the dictionary).
            if self.isOpenDictionary(tokens[0]):
                pass
            elif self.isCloseDictionary(tokens[0]):
                return dictionary, tokens
            elif tokens[0][0] != CHAR_WHITESPACE:
                key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                value = []
                state = 2
        elif state == 2:
            # Collecting the value that belongs to key.
            if self.isOpenDictionary(tokens[0]):
                # Nested dictionary: the recursive call stops on its own '>>',
                # which is skipped by the slice at the bottom of the loop.
                value, tokens = self.ParseDictionary(tokens)
                dictionary.append((key, value))
                state = 1
            elif self.isCloseDictionary(tokens[0]):
                dictionary.append((key, value))
                return dictionary, tokens
            elif value == [] and tokens[0][0] == CHAR_WHITESPACE:
                pass
            elif value == [] and tokens[0][1] == '[':
                value.append(tokens[0][1])
            elif value != [] and value[0] == '[' and tokens[0][1] != ']':
                value.append(tokens[0][1])
            elif value != [] and value[0] == '[' and tokens[0][1] == ']':
                value.append(tokens[0][1])
                dictionary.append((key, value))
                value = []
                state = 1
            elif value != [] and tokens[0][1][0] == '/':
                # A name after a complete value starts the next key.
                dictionary.append((key, value))
                key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                value = []
                state = 2
            else:
                value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
        tokens = tokens[1:]
    # Tokens exhausted without a closing '>>'.
    if state == 0:
        return None, tokens
    if state == 2:
        # Keep the entry that was being collected.
        dictionary.append((key, value))
    return dictionary, tokens
def FormatOutput(data, raw):
    """Format *data* for printing.

    data -- a string, or a list of (class, text) tokens.  A list may also
            hold plain strings (Stream() returns its list of filter names
            when it finds no complete stream).
    raw  -- when true return the text as-is (token texts concatenated),
            otherwise return repr(data).

    In raw mode only tuple items are indexed for their text; plain string
    items are joined unchanged instead of being reduced to their second
    character.
    """
    if not raw:
        return repr(data)
    if isinstance(data, list):
        parts = []
        for item in data:
            if isinstance(item, tuple):
                parts.append(item[1])
            else:
                parts.append(item)
        return ''.join(parts)
    return data
def RunLengthDecode(data):
    """Decode RunLengthDecode-filtered stream *data*.

    The encoded form is a sequence of runs, each introduced by a length
    byte L:
      * 0..127   -- copy the following L + 1 bytes literally;
      * 129..255 -- repeat the following single byte 257 - L times;
      * 128      -- end of data.

    Differences from the earlier implementation:
      * a length byte of 0 (copy one byte) no longer terminates decoding;
      * input without the 128 end-of-data marker, or truncated in the
        middle of a run, returns what could be decoded instead of raising
        TypeError from ord('').

    Returns a string of the same type as *data*.
    """
    pieces = []
    position = 0
    size = len(data)
    while position < size:
        # One-element slice (not an index) so str and bytes behave alike.
        runLength = ord(data[position:position + 1])
        position += 1
        if runLength == 128:
            break
        if runLength < 128:
            count = runLength + 1
            pieces.append(data[position:position + count])
            position += count
        else:
            pieces.append(data[position:position + 1] * (257 - runLength))
            position += 1
    # data[0:0] is the empty string of the input's own type.
    return data[0:0].join(pieces)
conditions: + +class LZWDecoder(object): + def __init__(self, fp): + self.fp = fp + self.buff = 0 + self.bpos = 8 + self.nbits = 9 + self.table = None + self.prevbuf = None + return + + def readbits(self, bits): + v = 0 + while 1: + # the number of remaining bits we can get from the current buffer. + r = 8-self.bpos + if bits <= r: + # |-----8-bits-----| + # |-bpos-|-bits-| | + # | |----r----| + v = (v<>(r-bits)) & ((1< __maximum_python_version__: + if enforceMaximumVersion: + print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) + print('Please use Python version %d.%d.%d' % __maximum_python_version__) + sys.exit() + else: + print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) + print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__) + if sys.version_info[0:3] < __minimum_python_version__: + if enforceMinimumVersion: + print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) + print('Please use Python version %d.%d.%d' % __maximum_python_version__) + sys.exit() + else: + print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) + print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__) + +if __name__ == '__main__': + TestPythonVersion() + Main()