# 2016-05-27 14:43:15 +00:00
#!/usr/bin/python2
# 2015-03-15 12:25:47 +00:00
# Module metadata.  __description__ and __version__ are appended to the usage
# and version texts built in Main(); the two version tuples are compared with
# sys.version_info in TestPythonVersion().
__description__ = ' pdf-parser, use it to parse a PDF document '
__author__ = ' Didier Stevens '
__version__ = ' 0.4.3 '
__date__ = ' 2013/09/18 '
# Interpreter versions as (major, minor, micro) tuples.
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 3, 0)
"""
Source code put in public domain by Didier Stevens, no Copyright
https://DidierStevens.com
Use at your own risk

History:
  2008/05/02: continue
  2008/05/03: continue
  2008/06/02: streams
  2008/10/19: refactor, grep & extract functionality
  2008/10/20: reference
  2008/10/21: cleanup
  2008/11/12: V0.3 dictionary parser
  2008/11/13: option elements
  2008/11/14: continue
  2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco)
  2009/05/11: V0.3.1 updated usage, added --verbose and --extract
  2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco)
  2009/07/18: bugfix EqualCanonical
  2009/07/24: V0.3.3 Added --hash option
  2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput
  2009/07/28: V0.3.4 Added ASCII85Decode support
  2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation
  2009/08/30: V0.3.6 TestPythonVersion
  2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option
  2010/01/09: Fixed parsing of incomplete startxref
  2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option
  2011/12/17: fixed bugs empty objects
  2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt)
  2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option
  2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range
  2013/02/16: V0.4.0 added http/https support; added error handling for missing file or URL; added support for ZIP file with password 'infected'
  2013/03/13: V0.4.1 fixes for Python 3
  2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
  2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
  2013/09/18: V0.4.3 fixed regression bug -w option

Todo:
  - handle printf todo
  - fix PrettyPrint
  - support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir
"""
import binascii
import hashlib
import optparse
import re
import sys
import zipfile
import zlib

# StringIO and the URL opener live in different modules on Python 2 and 3;
# urllib23 is the common alias used by the rest of the file.
if sys.version_info[0] >= 3:
    from io import StringIO
    import urllib.request
    urllib23 = urllib.request
else:
    from cStringIO import StringIO
    import urllib2
    urllib23 = urllib2
# Character classes returned by CharacterClass() and used as the first item
# of every token tuple.
CHAR_WHITESPACE, CHAR_DELIMITER, CHAR_REGULAR = 1, 2, 3

# Parser contexts: which kind of element cPDFParser is currently collecting.
CONTEXT_NONE, CONTEXT_OBJ, CONTEXT_XREF, CONTEXT_TRAILER = 1, 2, 3, 4

# Values of the .type attribute of the cPDFElement* classes.
(PDF_ELEMENT_COMMENT,
 PDF_ELEMENT_INDIRECT_OBJECT,
 PDF_ELEMENT_XREF,
 PDF_ELEMENT_TRAILER,
 PDF_ELEMENT_STARTXREF,
 PDF_ELEMENT_MALFORMED) = 1, 2, 3, 4, 5, 6
#Convert 2 Bytes If Python 3
def C2BIP3(string):
    """Return *string* as a bytes object on Python 3, unchanged on Python 2.

    Every character is mapped to the byte with the same ordinal, so all
    ordinals must be below 256.  A value that already is a bytes object is
    returned as is; iterating it would yield integers and make ord() fail.
    """
    if sys.version_info[0] > 2:
        if isinstance(string, bytes):
            return string
        return bytes([ord(x) for x in string])
    else:
        return string
def CopyWithoutWhiteSpace(content):
    """Return a new list with the tokens of *content* that are not whitespace.

    *content* is a sequence of (character class, text) tuples.
    """
    return [token for token in content if token[0] != CHAR_WHITESPACE]
def Obj2Str(content):
    """Return the non-whitespace tokens of *content* as one string.

    The text of each token is passed through repr() and the surrounding
    quote characters are cut off, so unprintable characters show up escaped.
    """
    return ' '.join([repr(token[1])[1:-1] for token in CopyWithoutWhiteSpace(content)])
class cPDFDocument:
    """Byte-wise reader over a PDF taken from a file, a ZIP archive or a URL."""

    def __init__(self, file):
        """Open *file*; on failure print the error and exit the program.

        *file* is a local path, the path of a ZIP archive (its first member
        is opened with the password 'infected') or an http/https URL.
        """
        self.file = file
        if file.lower().startswith('http://') or file.lower().startswith('https://'):
            try:
                # urlopen() only accepts a timeout from Python 2.6 on.
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except urllib23.URLError:
                # URLError is the base class of HTTPError, so unreachable
                # hosts are reported the same way as HTTP error statuses.
                print(' Error accessing URL %s ' % file)
                print(sys.exc_info()[1])
                sys.exit()
        elif file.lower().endswith('.zip'):
            try:
                self.zipfile = zipfile.ZipFile(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except Exception:
                print(' Error opening file %s ' % file)
                print(sys.exc_info()[1])
                sys.exit()
        else:
            try:
                self.infile = open(file, 'rb')
            except Exception:
                print(' Error opening file %s ' % file)
                print(sys.exc_info()[1])
                sys.exit()
        # Bytes pushed back with unget(); byte() hands them out first (LIFO).
        self.ungetted = []
        # Index of the byte returned last, -1 before the first read.
        self.position = -1

    def byte(self):
        """Return the next byte as an integer, or None at the end of the input.

        The input is closed when its end is reached.
        """
        if len(self.ungetted) != 0:
            self.position += 1
            return self.ungetted.pop()
        inbyte = self.infile.read(1)
        # read() returns an empty value at the end of the input only; a space
        # byte is ordinary data and must not end the document.
        if not inbyte:
            self.infile.close()
            return None
        self.position += 1
        return ord(inbyte)

    def unget(self, byte):
        """Push *byte* back so that the next call of byte() returns it."""
        self.position -= 1
        self.ungetted.append(byte)
def CharacterClass(byte):
    """Return the character class constant for the integer byte value *byte*."""
    # NUL, TAB, LF, FF, CR and space
    if byte in (0, 9, 10, 12, 13, 32):
        return CHAR_WHITESPACE
    # ( ) < > [ ] { } / %
    if byte in (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25):
        return CHAR_DELIMITER
    return CHAR_REGULAR
def IsNumeric(str):
    """Return a match object when *str* starts with a decimal digit, else None.

    Only the start of the string is checked: '12abc' is accepted as well.
    """
    return re.match('^[0-9]+', str)
class cPDFTokenizer:
    """Split the bytes of a PDF document into (character class, text) tokens."""

    def __init__(self, file):
        self.oPDF = cPDFDocument(file)
        # Tokens pushed back with unget(); Token() hands them out first (LIFO).
        self.ungetted = []

    def _PushBackOrFinish(self):
        """Give the look-ahead byte back, or drop the document at end of input."""
        if self.byte is not None:
            self.oPDF.unget(self.byte)
        else:
            self.oPDF = None

    def _ReadRun(self, charClass):
        """Collect the run of bytes of class *charClass* that starts at self.byte."""
        file_str = StringIO()
        while self.byte is not None and CharacterClass(self.byte) == charClass:
            file_str.write(chr(self.byte))
            self.byte = self.oPDF.byte()
        self._PushBackOrFinish()
        self.token = file_str.getvalue()
        return (charClass, self.token)

    def _ReadSingleOrDouble(self, single, double):
        """Return the delimiter *double* if the current byte is repeated, else *single*."""
        first = self.byte
        self.byte = self.oPDF.byte()
        if self.byte == first:
            return (CHAR_DELIMITER, double)
        self._PushBackOrFinish()
        return (CHAR_DELIMITER, single)

    def _ReadComment(self):
        """Collect a comment: from '%' up to and including the end of line."""
        file_str = StringIO()
        while self.byte is not None:
            file_str.write(chr(self.byte))
            if self.byte == 10 or self.byte == 13:
                self.byte = self.oPDF.byte()
                break
            self.byte = self.oPDF.byte()
        if self.byte == 10:
            # a line feed right after the end-of-line byte belongs to the comment
            file_str.write(chr(self.byte))
        else:
            self._PushBackOrFinish()
        self.token = file_str.getvalue()
        return (CHAR_DELIMITER, self.token)

    def Token(self):
        """Return the next token as a (character class, text) tuple, or None at the end."""
        if len(self.ungetted) != 0:
            return self.ungetted.pop()
        if self.oPDF is None:
            return None
        self.byte = self.oPDF.byte()
        if self.byte is None:
            self.oPDF = None
            return None
        charClass = CharacterClass(self.byte)
        if charClass == CHAR_WHITESPACE or charClass == CHAR_REGULAR:
            return self._ReadRun(charClass)
        if self.byte == 0x3C:
            return self._ReadSingleOrDouble('<', '<<')
        if self.byte == 0x3E:
            return self._ReadSingleOrDouble('>', '>>')
        if self.byte == 0x25:
            return self._ReadComment()
        return (CHAR_DELIMITER, chr(self.byte))

    def TokenIgnoreWhiteSpace(self):
        """Return the next token that is not whitespace, or None at the end."""
        token = self.Token()
        while token is not None and token[0] == CHAR_WHITESPACE:
            token = self.Token()
        return token

    def unget(self, byte):
        """Push the token *byte* back so that the next call of Token() returns it."""
        self.ungetted.append(byte)
class cPDFParser:
    """Group the tokens of a PDF document into top-level elements."""

    def __init__(self, file, verbose=False, extract=None):
        self.context = CONTEXT_NONE
        # Tokens collected for the element that is being built.
        self.content = []
        self.oPDFTokenizer = cPDFTokenizer(file)
        self.verbose = verbose
        self.extract = extract

    def _LeadingInt(self, text):
        """Return the decimal digits at the start of *text* as an integer.

        IsNumeric() only checks the start of a token, so the same digits are
        converted here.  int() replaces the former eval(): the text comes
        from the document, and a leading zero must not be read as octal.
        """
        return int(re.match('[0-9]+', text).group(0))

    def GetObject(self):
        """Return the next element of the document, or None at the end of the input.

        The result is a cPDFElementComment, cPDFElementIndirectObject,
        cPDFElementTrailer, cPDFElementXref, cPDFElementStartxref or, with the
        extract option, a cPDFElementMalformed holding the remaining input.
        """
        while True:
            # Whitespace is only significant inside an indirect object.
            if self.context == CONTEXT_OBJ:
                self.token = self.oPDFTokenizer.Token()
            else:
                self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if self.token:
                if self.token[0] == CHAR_DELIMITER:
                    if self.token[1][0] == '%':
                        if self.context == CONTEXT_OBJ:
                            self.content.append(self.token)
                        else:
                            return cPDFElementComment(self.token[1])
                    elif self.token[1] == '/':
                        # Glue '/' and the following regular token into one name token.
                        self.token2 = self.oPDFTokenizer.Token()
                        if self.token2 and self.token2[0] == CHAR_REGULAR:
                            if self.context != CONTEXT_NONE:
                                self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
                            elif self.verbose:
                                print(' todo 1: %s ' % (self.token[1] + self.token2[1]))
                        else:
                            if self.token2:
                                self.oPDFTokenizer.unget(self.token2)
                            if self.context != CONTEXT_NONE:
                                self.content.append(self.token)
                            elif self.verbose:
                                print(' todo 2: %d %s ' % (self.token[0], repr(self.token[1])))
                    elif self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print(' todo 3: %d %s ' % (self.token[0], repr(self.token[1])))
                elif self.token[0] == CHAR_WHITESPACE:
                    if self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print(' todo 4: %d %s ' % (self.token[0], repr(self.token[1])))
                else:
                    if self.context == CONTEXT_OBJ:
                        if self.token[1] == 'endobj':
                            self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementIndirectObject
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_TRAILER:
                        if self.token[1] == 'startxref' or self.token[1] == 'xref':
                            self.oPDFElementTrailer = cPDFElementTrailer(self.content)
                            # the keyword starts the next element
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementTrailer
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_XREF:
                        if self.token[1] == 'trailer' or self.token[1] == 'xref':
                            self.oPDFElementXref = cPDFElementXref(self.content)
                            # the keyword starts the next element
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementXref
                        else:
                            self.content.append(self.token)
                    else:
                        if IsNumeric(self.token[1]):
                            # Look for "<id> <version> obj"; the look-ahead
                            # tokens can be None at the end of the input.
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if self.token2 and IsNumeric(self.token2[1]):
                                self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                                if self.token3 and self.token3[1] == 'obj':
                                    self.objectId = self._LeadingInt(self.token[1])
                                    self.objectVersion = self._LeadingInt(self.token2[1])
                                    self.context = CONTEXT_OBJ
                                else:
                                    if self.token3:
                                        self.oPDFTokenizer.unget(self.token3)
                                    self.oPDFTokenizer.unget(self.token2)
                                    if self.verbose:
                                        print(' todo 6: %d %s ' % (self.token[0], repr(self.token[1])))
                            else:
                                if self.token2:
                                    self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print(' todo 7: %d %s ' % (self.token[0], repr(self.token[1])))
                        elif self.token[1] == 'trailer':
                            self.context = CONTEXT_TRAILER
                            self.content = [self.token]
                        elif self.token[1] == 'xref':
                            self.context = CONTEXT_XREF
                            self.content = [self.token]
                        elif self.token[1] == 'startxref':
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if self.token2 and IsNumeric(self.token2[1]):
                                return cPDFElementStartxref(self._LeadingInt(self.token2[1]))
                            else:
                                if self.token2:
                                    self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print(' todo 9: %d %s ' % (self.token[0], repr(self.token[1])))
                        elif self.extract:
                            # Unknown keyword outside any element: hand the
                            # rest of the input over as malformed content.
                            self.bytes = ''
                            while self.token:
                                self.bytes += self.token[1]
                                self.token = self.oPDFTokenizer.Token()
                            return cPDFElementMalformed(self.bytes)
                        elif self.verbose:
                            print(' todo 10: %d %s ' % (self.token[0], repr(self.token[1])))
            else:
                break
        return None
class cPDFElementComment:
    """A comment found outside of an indirect object."""

    def __init__(self, comment):
        self.type = PDF_ELEMENT_COMMENT
        # Text of the comment token as delivered by the tokenizer.
        self.comment = comment
#        if re.match('^%PDF-[0-9]\.[0-9]', self.token[1]):
#            print(repr(self.token[1]))
#        elif re.match('^%%EOF', self.token[1]):
#            print(repr(self.token[1]))
class cPDFElementXref:
    """A cross-reference section; content holds its tokens, 'xref' keyword first."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_XREF
        self.content = content
class cPDFElementTrailer:
    """A trailer; content holds its tokens, 'trailer' keyword first."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_TRAILER
        self.content = content
def IIf(expr, truepart, falsepart):
    """Return *truepart* if *expr* is true, otherwise *falsepart*.

    Both parts are ordinary arguments and therefore always evaluated.
    """
    if expr:
        return truepart
    return falsepart
class cPDFElementIndirectObject:
    """An indirect object: id, version and the tokens between obj and endobj."""

    def __init__(self, id, version, content):
        self.type = PDF_ELEMENT_INDIRECT_OBJECT
        self.id = id
        self.version = version
        # List of (character class, text) tokens, whitespace included.
        self.content = content

    def GetType(self):
        """Return the token after /Type in the top-level dictionary, or '' if absent."""
        content = CopyWithoutWhiteSpace(self.content)
        dictionary = 0
        for i in range(0, len(content)):
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '<<':
                dictionary += 1
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '>>':
                dictionary -= 1
            if dictionary == 1 and content[i][0] == CHAR_DELIMITER and EqualCanonical(content[i][1], '/Type') and i < len(content) - 1:
                return content[i + 1][1]
        return ''

    def GetReferences(self):
        """Return a list of (id, version, 'R') text tuples, one per reference."""
        content = CopyWithoutWhiteSpace(self.content)
        references = []
        # A reference is "<number> <number> R", so it cannot end before index 2.
        for i in range(2, len(content)):
            if content[i][0] != CHAR_REGULAR or content[i][1] != 'R':
                continue
            oId = content[i - 2]
            oVersion = content[i - 1]
            if oId[0] == CHAR_REGULAR and IsNumeric(oId[1]) and oVersion[0] == CHAR_REGULAR and IsNumeric(oVersion[1]):
                references.append((oId[1], oVersion[1], content[i][1]))
        return references

    def References(self, index):
        """Return True if the object references the object with id text *index*."""
        for ref in self.GetReferences():
            if ref[0] == index:
                return True
        return False

    def ContainsStream(self):
        """Return the tokens in front of the stream keyword, or False without one."""
        for i in range(0, len(self.content)):
            if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
                return self.content[0:i]
        return False

    def Contains(self, keyword):
        """Return True if *keyword* occurs, ignoring case, in front of the stream.

        Names are canonicalized before the comparison.
        """
        parts = []
        for token in self.content:
            if token[1] == 'stream':
                break
            parts.append(Canonicalize(token[1]))
        return keyword.upper() in ''.join(parts).upper()

    def StreamContains(self, keyword, filter, casesensitive, regex):
        """Search the stream of the object for *keyword*.

        With *filter* the stream is passed through its filters first; a stream
        without filters is searched as is.  With *regex* the keyword is a
        regular expression and the result is that of re.search().
        """
        if not self.ContainsStream():
            return False
        streamData = self.Stream(filter)
        if filter and streamData == ' No filters ':
            streamData = self.Stream(False)
        if regex:
            return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
        elif casesensitive:
            return keyword in streamData
        else:
            return keyword.lower() in streamData.lower()

    def Stream(self, filter=True):
        """Return the data between stream and endstream.

        With *filter* the data is passed through Decompress().  When no
        complete stream is found, the list of filter names collected so far is
        returned instead (an empty list if there are none).
        """
        state = 'start'
        countDirectories = 0
        data = ''
        filters = []
        for token in self.content:
            tokenType = token[0]
            tokenText = token[1]
            if state == 'start':
                if tokenType == CHAR_DELIMITER and tokenText == '<<':
                    countDirectories += 1
                if tokenType == CHAR_DELIMITER and tokenText == '>>':
                    countDirectories -= 1
                if countDirectories == 1 and tokenType == CHAR_DELIMITER and EqualCanonical(tokenText, '/Filter'):
                    state = 'filter'
                elif countDirectories == 0 and tokenType == CHAR_REGULAR and tokenText == 'stream':
                    state = 'stream-whitespace'
            elif state == 'filter':
                if tokenType == CHAR_DELIMITER and tokenText[0] == '/':
                    filters = [tokenText]
                    state = 'search-stream'
                elif tokenType == CHAR_DELIMITER and tokenText == '[':
                    state = 'filter-list'
            elif state == 'filter-list':
                if tokenType == CHAR_DELIMITER and tokenText[0] == '/':
                    filters.append(tokenText)
                elif tokenType == CHAR_DELIMITER and tokenText == ']':
                    state = 'search-stream'
            elif state == 'search-stream':
                if tokenType == CHAR_REGULAR and tokenText == 'stream':
                    state = 'stream-whitespace'
            elif state == 'stream-whitespace':
                # the whitespace token right after the stream keyword is not data
                if tokenType != CHAR_WHITESPACE:
                    data += tokenText
                state = 'stream-concat'
            elif state == 'stream-concat':
                if tokenType == CHAR_REGULAR and tokenText == 'endstream':
                    if filter:
                        return self.Decompress(data, filters)
                    else:
                        return data
                else:
                    data += tokenText
            else:
                return ' Unexpected filter state '
        return filters

    def Decompress(self, data, filters):
        """Pass *data* through the named *filters* in order and return the result.

        A message string is returned instead of the data when a filter fails,
        when a filter is not supported or when *filters* is empty.
        """
        for filterName in filters:
            if EqualCanonical(filterName, '/FlateDecode') or EqualCanonical(filterName, '/Fl'):
                try:
                    data = FlateDecode(data)
                except zlib.error:
                    # sys.exc_info() instead of "except ..., e", which is a
                    # syntax error on Python 3
                    e = sys.exc_info()[1]
                    message = ' FlateDecode decompress failed '
                    if len(data) > 0 and ord(data[0]) & 0x0F != 8:
                        message += ' , unexpected compression method: %02x ' % ord(data[0])
                    return message + ' . zlib.error %s ' % e
            elif EqualCanonical(filterName, '/ASCIIHexDecode') or EqualCanonical(filterName, '/AHx'):
                try:
                    data = ASCIIHexDecode(data)
                except Exception:
                    return ' ASCIIHexDecode decompress failed '
            elif EqualCanonical(filterName, '/ASCII85Decode') or EqualCanonical(filterName, '/A85'):
                try:
                    data = ASCII85Decode(data.rstrip('>'))
                except Exception:
                    return ' ASCII85Decode decompress failed '
            elif EqualCanonical(filterName, '/LZWDecode') or EqualCanonical(filterName, '/LZW'):
                try:
                    data = LZWDecode(data)
                except Exception:
                    return ' LZWDecode decompress failed '
            # /RL is the standard abbreviation; /R is kept because it was accepted before
            elif EqualCanonical(filterName, '/RunLengthDecode') or EqualCanonical(filterName, '/RL') or EqualCanonical(filterName, '/R'):
                try:
                    data = RunLengthDecode(data)
                except Exception:
                    return ' RunLengthDecode decompress failed '
#            elif i.startswith('/CC')                        # CCITTFaxDecode
#            elif i.startswith('/DCT')                       # DCTDecode
            else:
                return ' Unsupported filter: %s ' % repr(filters)
        if len(filters) == 0:
            return ' No filters '
        else:
            return data
class cPDFElementStartxref:
    """A startxref element; index is the number that follows the keyword."""

    def __init__(self, index):
        self.type = PDF_ELEMENT_STARTXREF
        self.index = index
class cPDFElementMalformed:
    """Input that could not be parsed; content is its text."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_MALFORMED
        self.content = content
def TrimLWhiteSpace(data):
    """Return the token list *data* without its leading whitespace tokens."""
    start = 0
    # count first and slice once instead of copying the list per token
    while start < len(data) and data[start][0] == CHAR_WHITESPACE:
        start += 1
    return data[start:]
def TrimRWhiteSpace(data):
    """Return the token list *data* without its trailing whitespace tokens."""
    end = len(data)
    # count first and slice once instead of copying the list per token
    while end > 0 and data[end - 1][0] == CHAR_WHITESPACE:
        end -= 1
    return data[:end]
class cPDFParseDictionary:
    """Parse the tokens of a PDF dictionary into nested (key, value) lists."""

    def __init__(self, content, nocanonicalizedoutput):
        """Parse *content*; self.parsed is None unless it is wrapped in << and >>."""
        self.content = content
        self.nocanonicalizedoutput = nocanonicalizedoutput
        dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
        if dataTrimmed == []:
            self.parsed = None
        elif self.isOpenDictionary(dataTrimmed[0]) and self.isCloseDictionary(dataTrimmed[-1]):
            self.parsed = self.ParseDictionary(dataTrimmed)[0]
        else:
            self.parsed = None

    def isOpenDictionary(self, token):
        """Return True if *token* is the delimiter <<."""
        return token[0] == CHAR_DELIMITER and token[1] == '<<'

    def isCloseDictionary(self, token):
        """Return True if *token* is the delimiter >>."""
        return token[0] == CHAR_DELIMITER and token[1] == '>>'

    def ParseDictionary(self, tokens):
        """Parse one dictionary from the start of *tokens*.

        Returns (dictionary, remaining tokens).  The dictionary is a list of
        (key, value) tuples; a value is a list of token texts or, for a nested
        dictionary, another such list of tuples.  The remaining tokens start
        at the closing >>.  (None, tokens) is returned when *tokens* does not
        start with <<.
        """
        state = 0 # start
        dictionary = []
        while tokens != []:
            if state == 0:
                if self.isOpenDictionary(tokens[0]):
                    state = 1
                else:
                    return None, tokens
            elif state == 1:
                # expecting a key
                if self.isOpenDictionary(tokens[0]):
                    pass
                elif self.isCloseDictionary(tokens[0]):
                    return dictionary, tokens
                elif tokens[0][0] != CHAR_WHITESPACE:
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
            elif state == 2:
                # collecting the value of key
                if self.isOpenDictionary(tokens[0]):
                    value, tokens = self.ParseDictionary(tokens)
                    dictionary.append((key, value))
                    state = 1
                elif self.isCloseDictionary(tokens[0]):
                    dictionary.append((key, value))
                    return dictionary, tokens
                elif value == [] and tokens[0][0] == CHAR_WHITESPACE:
                    pass
                elif value == [] and tokens[0][1] == '[':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] != ']':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] == ']':
                    value.append(tokens[0][1])
                    dictionary.append((key, value))
                    value = []
                    state = 1
                elif value != [] and tokens[0][1][0] == '/':
                    # a name after a complete value is the next key
                    dictionary.append((key, value))
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
                else:
                    value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
            tokens = tokens[1:]
        # The tokens ran out before the closing >> (unbalanced dictionary).
        # Return what was parsed; falling off the end returned None and made
        # the callers fail when they unpacked or indexed the result.
        if state == 2:
            dictionary.append((key, value))
        return dictionary, tokens

    def retrieve(self):
        """Return the parsed dictionary (None if the content was no dictionary)."""
        return self.parsed

    def PrettyPrintSub(self, prefix, dictionary):
        """Print *dictionary*, every line starting with *prefix*; nested dictionaries get a longer prefix."""
        if dictionary is not None:
            print(' %s << ' % prefix)
            for e in dictionary:
                if e[1] == []:
                    print(' %s %s ' % (prefix, e[0]))
                elif isinstance(e[1][0], str):
                    value = ''.join(e[1]).strip()
                    reprValue = repr(value)
                    # show the repr() only when it differs from plain quoting,
                    # i.e. when the value holds characters that need escaping
                    if "'" + value + "'" != reprValue:
                        value = reprValue
                    print(' %s %s %s ' % (prefix, e[0], value))
                else:
                    print(' %s %s ' % (prefix, e[0]))
                    self.PrettyPrintSub(prefix + ' ', e[1])
            print(' %s >> ' % prefix)

    def PrettyPrint(self, prefix):
        """Print the parsed dictionary, every line starting with *prefix*."""
        self.PrettyPrintSub(prefix, self.parsed)
def FormatOutput(data, raw):
    """Return *data* prepared for printing.

    With *raw*, a token list is turned back into the text it was read from
    (the token texts concatenated without separator) and any other value is
    returned unchanged.  Without *raw* the repr() of *data* is returned.
    """
    if raw:
        if isinstance(data, list):
            return ''.join([token[1] for token in data])
        else:
            return data
    else:
        return repr(data)
def PrintObject(object, options):
    """Print the indirect object *object* as selected by the command line *options*.

    Prints id, version, type, references and the dictionary; depending on the
    options also the (filtered) stream or the content, and writes the stream
    to the file named by options.dump.
    """
    print(' obj %d %d ' % (object.id, object.version))
    print(' Type: %s ' % ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput))
    print(' Referencing: %s ' % ' , '.join([' %s %s %s ' % reference for reference in object.GetReferences()]))
    dataPrecedingStream = object.ContainsStream()
    oPDFParseDictionary = None
    if dataPrecedingStream:
        print(' Contains stream ')
        if options.debug:
            print(' %s ' % FormatOutput(dataPrecedingStream, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(dataPrecedingStream, options.nocanonicalizedoutput)
    else:
        if options.debug or options.raw:
            print(' %s ' % FormatOutput(object.content, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(object.content, options.nocanonicalizedoutput)
    print(' ')
    oPDFParseDictionary.PrettyPrint(' ')
    print(' ')
    if options.filter and not options.dump:
        filtered = object.Stream()
        # Stream() returns an empty list when the object has no stream
        if filtered == []:
            print(' %s ' % FormatOutput(object.content, options.raw))
        else:
            print(' %s ' % FormatOutput(filtered, options.raw))
    if options.content:
        if object.ContainsStream():
            stream = object.Stream(False)
            if stream != []:
                print(' %s ' % FormatOutput(stream, options.raw))
        else:
            print(''.join([token[1] for token in object.content]))

    if options.dump:
        filtered = object.Stream(options.filter == True)
        if filtered == []:
            filtered = ''
        try:
            fDump = open(options.dump, 'wb')
            try:
                try:
                    fDump.write(C2BIP3(filtered))
                except Exception:
                    print(' Error writing file %s ' % options.dump)
            finally:
                # close the file whatever happened while writing
                fDump.close()
        except Exception:
            print(' Error writing file %s ' % options.dump)
    print(' ')
    return
def Canonicalize(sIn):
    """Return the PDF name *sIn* with its #xx hexadecimal escapes decoded.

    Text that does not start with '/' is returned unchanged.  A '#' that is
    not followed by two hexadecimal digits is kept as it is.
    """
    if sIn == '' or sIn[0] != '/' or '#' not in sIn:
        return sIn
    # Exactly two hex digits: int(..., 16) alone would also accept a sign or
    # a space as one of the two characters.
    return re.sub('#([0-9A-Fa-f]{2})', lambda oMatch: chr(int(oMatch.group(1), 16)), sIn)
def EqualCanonical(s1, s2):
    """Return True if *s1*, after canonicalization, equals *s2*.

    Only *s1* is canonicalized; *s2* is compared as given.
    """
    return Canonicalize(s1) == s2
def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
    """Return *sIn* canonicalized, or unchanged if *nocanonicalizedoutput* is true."""
    if nocanonicalizedoutput:
        return sIn
    return Canonicalize(sIn)
# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
def ASCII85Decode(data):
    """Decode the ASCII85 text *data* and return the decoded bytes as a string.

    The result holds one character per decoded byte.  Characters outside
    '!'..'u' other than 'z' and '~' are skipped; decoding stops at '~'.
    Raises ValueError for a 'z' inside a group and for a group whose value
    does not fit in 32 bits.
    """
    def Pack32(value):
        # Big-endian bytes of a 32-bit value as characters.  Built without
        # struct.pack, whose bytes result cannot be added to a str on Python 3.
        if value > 0xFFFFFFFF:
            raise ValueError('ASCII85 group out of range')
        return ''.join([chr((value >> shift) & 0xFF) for shift in (24, 16, 8, 0)])

    out = []
    n = b = 0
    for c in data:
        if '!' <= c and c <= 'u':
            n += 1
            b = b * 85 + (ord(c) - 33)
            if n == 5:
                out.append(Pack32(b))
                n = b = 0
        elif c == 'z':
            # 'z' stands for four zero bytes and is only valid between groups
            if n != 0:
                raise ValueError('ASCII85 z inside a group')
            out.append('\0\0\0\0')
        elif c == '~':
            if n:
                # pad the last, partial group with the highest digit
                for _ in range(5 - n):
                    b = b * 85 + 84
                out.append(Pack32(b)[:n - 1])
            break
    return ''.join(out)
def ASCIIHexDecode(data):
    """Decode the hexadecimal text *data* with binascii.unhexlify().

    Whitespace is ignored, '>' ends the data and a missing last digit is
    taken to be 0.  binascii raises an error for any other character.
    """
    hexDigits = ''.join([c for c in data.split('>', 1)[0] if c not in '\0\t\n\x0c\r '])
    if len(hexDigits) % 2 == 1:
        hexDigits += '0'
    return binascii.unhexlify(hexDigits)
def FlateDecode(data):
    """Return *data* decompressed with zlib.

    The stream data is a string with one character per byte; C2BIP3() turns
    it into the bytes object that zlib requires on Python 3.  Raises
    zlib.error if the data cannot be decompressed.
    """
    return zlib.decompress(C2BIP3(data))
def RunLengthDecode(data):
    """Decode the run-length encoded string *data*.

    A length byte from 0 to 127 is followed by length + 1 literal bytes, a
    length byte from 129 to 255 by one byte that is repeated 257 - length
    times, and 128 marks the end of the data.  Decoding also stops at the end
    of *data* if the end marker is missing.
    """
    parts = []
    pos = 0
    size = len(data)
    while pos < size:
        runLength = ord(data[pos])
        pos += 1
        if runLength == 128:
            break
        if runLength < 128:
            # includes 0, which stands for a single literal byte
            parts.append(data[pos:pos + runLength + 1])
            pos += runLength + 1
        else:
            parts.append(data[pos:pos + 1] * (257 - runLength))
            pos += 1
#    return sub(r'(\d+)(\D)', lambda m: m.group(2) * int(m.group(1)), data)
    return ''.join(parts)
#### LZW code sourced from pdfminer
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
class LZWDecoder(object):
    """Decoder for LZW data read from the file-like object *fp*.

    fp.read(1) has to return one character per byte, or an empty value at
    the end of the data.
    """

    def __init__(self, fp):
        self.fp = fp
        self.buff = 0
        # bpos 8 means that the buffer is used up: the first read fetches a byte
        self.bpos = 8
        self.nbits = 9
        # Start with the table of a clear code, so that data that does not
        # begin with a clear code (256) can be decoded too.
        self.table = [chr(c) for c in range(256)] + [None, None]
        self.prevbuf = ''
        return

    def readbits(self, bits):
        """Return the next *bits* bits as an integer; raise EOFError at the end."""
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r) - 1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Process one *code* and return the text it decodes to ('' for control codes)."""
        x = ''
        if code == 256:
            # clear code: start over with the initial table and code width
            self.table = [chr(c) for c in range(256)] # 0-255
            self.table.append(None) # 256
            self.table.append(None) # 257
            self.prevbuf = ''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf + x[0])
            else:
                # the code is the entry that is being added right now
                self.table.append(self.prevbuf + self.prevbuf[0])
                x = self.table[code]
            tableSize = len(self.table)
            # the codes get one bit wider just before the table outgrows the width
            if tableSize == 511:
                self.nbits = 10
            elif tableSize == 1023:
                self.nbits = 11
            elif tableSize == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Yield the decoded text of every code until the data is exhausted."""
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
        return
####
def LZWDecode(data):
    """Return the string *data* decoded with LZWDecoder."""
    return ''.join(LZWDecoder(StringIO(data)).run())
def Main():
    """pdf-parser, use it to parse a PDF document

    Parses the command line, reads the elements of the given document one by
    one and prints the selected ones, or the statistics with --stats.
    """

    oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__)
    oParser.add_option('-s', '--search', help=' string to search in indirect objects (except streams) ')
    oParser.add_option('-f', '--filter', action='store_true', default=False, help=' pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only) ')
    oParser.add_option('-o', '--object', help=' id of indirect object to select (version independent) ')
    oParser.add_option('-r', '--reference', help=' id of indirect object being referenced (version independent) ')
    oParser.add_option('-e', '--elements', help=' type of elements to select (cxtsi) ')
    oParser.add_option('-w', '--raw', action='store_true', default=False, help=' raw output for data and filters ')
    oParser.add_option('-a', '--stats', action='store_true', default=False, help=' display stats for pdf document ')
    oParser.add_option('-t', '--type', help=' type of indirect object to select ')
    oParser.add_option('-v', '--verbose', action='store_true', default=False, help=' display malformed PDF elements ')
    oParser.add_option('-x', '--extract', help=' filename to extract malformed content to ')
    oParser.add_option('-H', '--hash', action='store_true', default=False, help=' display hash of objects ')
    oParser.add_option('-n', '--nocanonicalizedoutput', action='store_true', default=False, help=' do not canonicalize the output ')
    oParser.add_option('-d', '--dump', help=' filename to dump stream content to ')
    oParser.add_option('-D', '--debug', action='store_true', default=False, help=' display debug info ')
    oParser.add_option('-c', '--content', action='store_true', default=False, help=' display the content for objects without streams or with streams without filters ')
    oParser.add_option('--searchstream', help=' string to search in streams ')
    oParser.add_option('--unfiltered', action='store_true', default=False, help=' search in unfiltered streams ')
    oParser.add_option('--casesensitive', action='store_true', default=False, help=' case sensitive search in streams ')
    oParser.add_option('--regex', action='store_true', default=False, help=' use regex to search in streams ')
    (options, args) = oParser.parse_args()

    if len(args) != 1:
        oParser.print_help()
        print(' ')
        print(' %s ' % __description__)
        print(' Source code put in the public domain by Didier Stevens, no Copyright ')
        print(' Use at your own risk ')
        print(' https://DidierStevens.com ')

    else:
        oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
        cntComment = 0
        cntXref = 0
        cntTrailer = 0
        cntStartXref = 0
        cntIndirectObject = 0
        dicObjectTypes = {}

        selectComment = False
        selectXref = False
        selectTrailer = False
        selectStartXref = False
        selectIndirectObject = False
        if options.elements:
            for c in options.elements:
                if c == 'c':
                    selectComment = True
                elif c == 'x':
                    selectXref = True
                elif c == 't':
                    selectTrailer = True
                elif c == 's':
                    selectStartXref = True
                elif c == 'i':
                    selectIndirectObject = True
                else:
                    print(' Error: unknown --elements value %s ' % c)
                    return
        else:
            selectIndirectObject = True
            # without a selection of indirect objects, all element kinds are shown
            if not options.search and not options.object and not options.reference and not options.type and not options.searchstream:
                selectComment = True
                selectXref = True
                selectTrailer = True
                selectStartXref = True

        # --type - selects the objects without a type
        if options.type == '-':
            optionsType = ''
        else:
            optionsType = options.type

        # The object id was passed through eval() for every object.  int() is
        # used once instead: it does not execute the argument and does not
        # read a leading zero as octal.
        selectedObjectId = None
        if options.object:
            try:
                selectedObjectId = int(options.object)
            except ValueError:
                print(' Error: --object value is not a number: %s ' % options.object)
                return

        while True:
            oElement = oPDFParser.GetObject()
            if oElement is not None:
                if options.stats:
                    if oElement.type == PDF_ELEMENT_COMMENT:
                        cntComment += 1
                    elif oElement.type == PDF_ELEMENT_XREF:
                        cntXref += 1
                    elif oElement.type == PDF_ELEMENT_TRAILER:
                        cntTrailer += 1
                    elif oElement.type == PDF_ELEMENT_STARTXREF:
                        cntStartXref += 1
                    elif oElement.type == PDF_ELEMENT_INDIRECT_OBJECT:
                        cntIndirectObject += 1
                        dicObjectTypes.setdefault(oElement.GetType(), []).append(oElement.id)
                else:
                    if oElement.type == PDF_ELEMENT_COMMENT and selectComment:
                        print(' PDF Comment %s ' % FormatOutput(oElement.comment, options.raw))
                        print(' ')
                    elif oElement.type == PDF_ELEMENT_XREF and selectXref:
                        if options.debug:
                            print(' xref %s ' % FormatOutput(oElement.content, options.raw))
                        else:
                            print(' xref ')
                        print(' ')
                    elif oElement.type == PDF_ELEMENT_TRAILER and selectTrailer:
                        oPDFParseDictionary = cPDFParseDictionary(oElement.content[1:], options.nocanonicalizedoutput)
                        # The instance itself is never None; parsed is None
                        # when the trailer content is not a dictionary.
                        if oPDFParseDictionary.parsed is None:
                            print(' trailer %s ' % FormatOutput(oElement.content, options.raw))
                        else:
                            print(' trailer ')
                            oPDFParseDictionary.PrettyPrint(' ')
                        print(' ')
                    elif oElement.type == PDF_ELEMENT_STARTXREF and selectStartXref:
                        print(' startxref %d ' % oElement.index)
                        print(' ')
                    elif oElement.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
                        if options.search:
                            if oElement.Contains(options.search):
                                PrintObject(oElement, options)
                        elif options.object:
                            if oElement.id == selectedObjectId:
                                PrintObject(oElement, options)
                        elif options.reference:
                            if oElement.References(options.reference):
                                PrintObject(oElement, options)
                        elif options.type:
                            if EqualCanonical(oElement.GetType(), optionsType):
                                PrintObject(oElement, options)
                        elif options.hash:
                            print(' obj %d %d ' % (oElement.id, oElement.version))
                            rawContent = FormatOutput(oElement.content, True)
                            # hashlib requires a bytes object on Python 3
                            print(' len: %d md5: %s ' % (len(rawContent), hashlib.md5(C2BIP3(rawContent)).hexdigest()))
                            print(' ')
                        elif options.searchstream:
                            if oElement.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex):
                                PrintObject(oElement, options)
                        else:
                            PrintObject(oElement, options)
                    elif oElement.type == PDF_ELEMENT_MALFORMED:
                        try:
                            fExtract = open(options.extract, 'wb')
                            try:
                                try:
                                    fExtract.write(C2BIP3(oElement.content))
                                except Exception:
                                    print(' Error writing file %s ' % options.extract)
                            finally:
                                # close the file whatever happened while writing
                                fExtract.close()
                        except Exception:
                            print(' Error writing file %s ' % options.extract)
            else:
                break

        if options.stats:
            print(' Comment: %s ' % cntComment)
            print(' XREF: %s ' % cntXref)
            print(' Trailer: %s ' % cntTrailer)
            print(' StartXref: %s ' % cntStartXref)
            print(' Indirect object: %s ' % cntIndirectObject)
            # sorted() also works on the key view that Python 3 returns
            for key in sorted(dicObjectTypes.keys()):
                print(' %s %d : %s ' % (key, len(dicObjectTypes[key]), ' , '.join([' %d ' % objectId for objectId in dicObjectTypes[key]])))
def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
    """Compare the running Python version with the supported version range.

    A version above __maximum_python_version__ or below
    __minimum_python_version__ leads to a warning, or to an error message and
    the end of the program when the matching enforce argument is true.
    """
    if sys.version_info[0:3] > __maximum_python_version__:
        if enforceMaximumVersion:
            print(' This program does not work with this version of Python ( %d . %d . %d ) ' % sys.version_info[0:3])
            print(' Please use Python version %d . %d . %d ' % __maximum_python_version__)
            sys.exit()
        else:
            print(' This program has not been tested with this version of Python ( %d . %d . %d ) ' % sys.version_info[0:3])
            print(' Should you encounter problems, please use Python version %d . %d . %d ' % __maximum_python_version__)
    if sys.version_info[0:3] < __minimum_python_version__:
        # an interpreter that is too old is pointed to the minimum version,
        # not to the maximum version
        if enforceMinimumVersion:
            print(' This program does not work with this version of Python ( %d . %d . %d ) ' % sys.version_info[0:3])
            print(' Please use Python version %d . %d . %d ' % __minimum_python_version__)
            sys.exit()
        else:
            print(' This program has not been tested with this version of Python ( %d . %d . %d ) ' % sys.version_info[0:3])
            print(' Should you encounter problems, please use Python version %d . %d . %d ' % __minimum_python_version__)
# Run the version check and the parser only when the file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    TestPythonVersion()
    Main()