2020-05-03 12:47:01 +00:00
#!/usr/bin/python
2015-03-15 12:25:47 +00:00
__description__ = ' pdf-parser, use it to parse a PDF document '
__author__ = ' Didier Stevens '
2020-05-03 12:47:01 +00:00
__version__ = ' 0.7.4 '
__date__ = ' 2019/11/05 '
2015-03-15 12:25:47 +00:00
__minimum_python_version__ = ( 2 , 5 , 1 )
2020-05-03 12:47:01 +00:00
__maximum_python_version__ = ( 3 , 7 , 5 )
2015-03-15 12:25:47 +00:00
"""
Source code put in public domain by Didier Stevens , no Copyright
https : / / DidierStevens . com
Use at your own risk
History :
2008 / 05 / 02 : continue
2008 / 05 / 03 : continue
2008 / 06 / 02 : streams
2008 / 10 / 19 : refactor , grep & extract functionality
2008 / 10 / 20 : reference
2008 / 10 / 21 : cleanup
2008 / 11 / 12 : V0 .3 dictionary parser
2008 / 11 / 13 : option elements
2008 / 11 / 14 : continue
2009 / 05 / 05 : added / ASCIIHexDecode support ( thanks Justin Prosco )
2009 / 05 / 11 : V0 .3 .1 updated usage , added - - verbose and - - extract
2009 / 07 / 16 : V0 .3 .2 Added Canonicalize ( thanks Justin Prosco )
2009 / 07 / 18 : bugfix EqualCanonical
2009 / 07 / 24 : V0 .3 .3 Added - - hash option
2009 / 07 / 25 : EqualCanonical for option - - type , added option - - nocanonicalizedoutput
2009 / 07 / 28 : V0 .3 .4 Added ASCII85Decode support
2009 / 08 / 01 : V0 .3 .5 Updated ASCIIHexDecode to support whitespace obfuscation
2009 / 08 / 30 : V0 .3 .6 TestPythonVersion
2010 / 01 / 08 : V0 .3 .7 Added RLE and LZW support ( thanks pARODY ) ; added dump option
2010 / 01 / 09 : Fixed parsing of incomplete startxref
2010 / 09 / 22 : V0 .3 .8 Changed dump option , updated PrettyPrint , added debug option
2011 / 12 / 17 : fixed bugs empty objects
2012 / 03 / 11 : V0 .3 .9 fixed bugs double nested [ ] in PrettyPrintSub ( thanks kurt )
2013 / 01 / 11 : V0 .3 .10 Extract and dump bug fixes by Priit ; added content option
2013 / 02 / 16 : Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas ; xrange replaced with range
2013 / 02 / 16 : V0 .4 .0 added http / https support ; added error handling for missing file or URL ; ; added support for ZIP file with password ' infected '
2013 / 03 / 13 : V0 .4 .1 fixes for Python 3
2013 / 04 / 11 : V0 .4 .2 modified PrettyPrintSub for strings with unprintable characters
2013 / 05 / 04 : Added options searchstream , unfiltered , casesensitive , regex
2013 / 09 / 18 : V0 .4 .3 fixed regression bug - w option
2020-05-03 12:47:01 +00:00
2014 / 09 / 25 : V0 .5 .0 added option - g
2014 / 09 / 29 : Added PrintGenerateObject and PrintOutputObject
2014 / 12 / 05 : V0 .6 .0 Added YARA support
2014 / 12 / 09 : cleanup , refactoring
2014 / 12 / 13 : Python 3 fixes
2015 / 01 / 11 : Added support for multiple YARA rule files ; added request to search in trailer
2015 / 01 / 31 : V0 .6 .1 Added optionyarastrings
2015 / 02 / 09 : Added decoders
2015 / 04 / 05 : V0 .6 .2 Added generateembedded
2015 / 04 / 06 : fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace ; fixed prettyprint bug
2015 / 04 / 24 : V0 .6 .3 when option dump ' s filename is -, content is dumped to stdout
2015 / 08 / 12 : V0 .6 .4 option hash now also calculates hashes of streams when selecting or searching objects ; and displays hexasciidump first line
2016 / 07 / 27 : V0 .6 .5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me
2016 / 11 / 20 : V0 .6 .6 added workaround zlib errors FlateDecode
2016 / 12 / 17 : V0 .6 .7 added option - k
2017 / 01 / 07 : V0 .6 .8 changed cPDFParseDictionary to handle strings ( ) with % character
2017 / 10 / 28 : fixed bug
2017 / 10 / 29 : added # support for option -y
2018 / 06 / 29 : V0 .6 .9 added option - - overridingfilters
2018 / 10 / 20 : added keywords to statistics
2019 / 02 / 22 : V0 .7 .0 added option - O - - objstm to parse the stream of / ObjStm objects , inspired by a contributor wishing anonymity
2019 / 03 / 01 : V0 .7 .1 added ContainsName for correct keyword statistics ( - a )
2019 / 04 / 12 : V0 .7 .2 Python 2.6 .6 compatibility fix
2019 / 07 / 30 : bug fixes ( including fixes Josef Hinteregger )
2019 / 09 / 26 : V0 .7 .3 added multiple id selection to option - o ; added man page ( - m ) ; added environment variable PDFPARSER_OPTIONS ; bug fixes
2019 / 11 / 05 : V0 .7 .4 fixed plugin path when compiled with pyinstaller , replaced eval with int
2015-03-15 12:25:47 +00:00
Todo :
- handle printf todo
- support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D . pdf . vir
"""
import re
import optparse
import zlib
import binascii
import hashlib
import sys
import zipfile
2020-05-03 12:47:01 +00:00
import time
import os
import textwrap
2015-03-15 12:25:47 +00:00
if sys . version_info [ 0 ] > = 3 :
from io import StringIO
import urllib . request
urllib23 = urllib . request
2020-05-03 12:47:01 +00:00
import configparser as ConfigParser
2015-03-15 12:25:47 +00:00
else :
from cStringIO import StringIO
import urllib2
urllib23 = urllib2
2020-05-03 12:47:01 +00:00
import ConfigParser
try :
import yara
except :
pass
2015-03-15 12:25:47 +00:00
# Character classes: returned by CharacterClass() and used as the first
# element of the (class, text) token tuples built by cPDFTokenizer.
CHAR_WHITESPACE = 1
CHAR_DELIMITER = 2
CHAR_REGULAR = 3
# Parsing contexts stored in cPDFParser.context while GetObject() runs.
CONTEXT_NONE = 1
CONTEXT_OBJ = 2
CONTEXT_XREF = 3
CONTEXT_TRAILER = 4
# Element kinds stored in the .type attribute of the cPDFElement* classes.
PDF_ELEMENT_COMMENT = 1
PDF_ELEMENT_INDIRECT_OBJECT = 2
PDF_ELEMENT_XREF = 3
PDF_ELEMENT_TRAILER = 4
PDF_ELEMENT_STARTXREF = 5
PDF_ELEMENT_MALFORMED = 6
2020-05-03 12:47:01 +00:00
# NOTE(review): presumably the number of bytes shown per line by the hex/ASCII
# dump helpers, which are not visible in this chunk - confirm.
dumplinelength = 16
# PrintManual: print the built-in manual text to stdout.
# The text is split on newlines and every line is re-wrapped with
# textwrap.fill() before being printed.
# NOTE(review): this copy of the source is whitespace-mangled; the spacing and
# blank lines inside the manual literal below are not the author's original
# layout - restore them from the pristine file before relying on the output.
def PrintManual ( ) :
manual = '''
Manual :
This manual is a work in progress .
There is a free PDF analysis book :
https : / / blog . didierstevens . com / 2010 / 09 / 26 / free - malicious - pdf - analysis - e - book /
Option - o is used to select objects by id . Provide a single id or multiple ids separated by a comma ( , ) .
When environment variable PDFPARSER_OPTIONS is defined , the options it defines are added implicitely to the command line arguments .
Use this to define options you want included with each use of pdf - parser . py .
Like option - O , to parse stream objects ( / ObjStm ) .
By defining PDFPARSER_OPTIONS = - O , pdf - parser will always parse stream objects ( when found ) .
PS : this feature is experimental .
'''
for line in manual . split ( ' \n ' ) :
print ( textwrap . fill ( line ) )
2015-03-15 12:25:47 +00:00
def C2BIP3(string):
    """Convert 2 Bytes If Python 3.

    On Python 2 the argument is returned untouched. On Python 3 a bytes
    object is returned as is; any other value is iterated and the code point
    of each character (via ord) becomes one byte of the result.
    """
    if sys.version_info[0] <= 2:
        return string
    if type(string) == bytes:
        return string
    return bytes(map(ord, string))
2020-05-03 12:47:01 +00:00
def C2SIP3(bytes):
    """Convert 2 String If Python 3.

    On Python 2 the argument is returned untouched. On Python 3 the argument
    is iterated and every item (an integer byte value) is mapped through chr
    to build a str.
    """
    if sys.version_info[0] <= 2:
        return bytes
    return ''.join(map(chr, bytes))
def CIC(expression):
    """Call If Callable: return expression() when callable, else expression."""
    return expression() if callable(expression) else expression
def IFF(expression, valueTrue, valueFalse):
    """IF Function: pick valueTrue or valueFalse depending on expression.

    Only the selected value is looked at; when it is callable it is called
    and its result is returned, which allows lazy evaluation of a branch.
    """
    selected = valueTrue if expression else valueFalse
    return selected() if callable(selected) else selected
def Timestamp(epoch=None):
    """Format a local time as YYYYMMDD-HHMMSS.

    epoch is a number of seconds since the epoch; None (the default) means
    the current time.
    """
    # time.localtime(None) is documented to use the current time, so a single
    # call covers both cases.
    fields = time.localtime(epoch)[0:6]
    return '%04d%02d%02d-%02d%02d%02d' % fields
2015-03-15 12:25:47 +00:00
def CopyWithoutWhiteSpace(content):
    """Return a new list with the tokens of content that are not whitespace.

    content is a sequence of (class, text) tokens; a token is dropped when
    its class is CHAR_WHITESPACE.
    """
    return [token for token in content if token[0] != CHAR_WHITESPACE]
def Obj2Str(content):
    """Concatenate the escaped text of all non-whitespace tokens of content.

    Every token text is passed through repr() and the surrounding quote
    characters are sliced off, so unprintable characters appear escaped.
    """
    significant = CopyWithoutWhiteSpace(content)
    return ''.join(repr(token[1])[1:-1] for token in significant)
class cPDFDocument:
    """Byte-at-a-time reader over a PDF source, with push-back.

    Attributes set by the constructor:
      file      the argument as given
      infile    the object bytes are read from
      ungetted  stack of bytes pushed back with unget()
      position  index of the last byte handed out by byte(), -1 initially
    """

    def __init__(self, file):
        """Open the source designated by file.

        file is either an object with read()/close() (anything that is not a
        str), an http:// or https:// URL, the name of a .zip file (its first
        member is opened with password 'infected'), or the name of a regular
        file. When the source cannot be opened, an error is printed and the
        program exits through sys.exit().
        """
        self.file = file
        if type(file) != str:
            self.infile = file
        elif file.lower().startswith('http://') or file.lower().startswith('https://'):
            try:
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except urllib23.URLError:
                # URLError is the base class of HTTPError: catching it also
                # reports DNS and connection failures instead of letting them
                # escape as a traceback.
                print('Error accessing URL %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        elif file.lower().endswith('.zip'):
            try:
                self.zipfile = zipfile.ZipFile(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except Exception:
                # Exception (not a bare except) so Ctrl-C and sys.exit() are
                # not swallowed.
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        else:
            try:
                self.infile = open(file, 'rb')
            except Exception:
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        self.ungetted = []
        self.position = -1

    def byte(self):
        """Return the next byte as an integer, or None at end of input.

        Pushed-back bytes are returned first (last pushed, first returned).
        When the underlying source is exhausted it is closed.
        """
        if self.ungetted:
            self.position += 1
            return self.ungetted.pop()
        inbyte = self.infile.read(1)
        if not inbyte:
            self.infile.close()
            return None
        self.position += 1
        return ord(inbyte)

    def unget(self, byte):
        """Push byte back so that the next call to byte() returns it."""
        self.position -= 1
        self.ungetted.append(byte)
def CharacterClass(byte):
    """Classify a byte value as whitespace, delimiter or regular character.

    Whitespace: NUL, TAB, LF, FF, CR, space.
    Delimiters: ( ) < > [ ] { } / %
    Everything else is CHAR_REGULAR.
    """
    if byte in (0, 9, 10, 12, 13, 32):
        return CHAR_WHITESPACE
    if byte in (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25):
        return CHAR_DELIMITER
    return CHAR_REGULAR
def IsNumeric(str):
    """Return a match object when str consists only of ASCII digits, else None.

    The pattern is anchored at both ends. With a start-only anchor, tokens
    such as '12abc' or '1.5' were reported as numeric and then made
    int(token, 10) raise ValueError in the parser.
    """
    return re.match(r'[0-9]+\Z', str)
class cPDFTokenizer:
    """Turn the bytes of a cPDFDocument into (class, text) token tuples."""

    def __init__(self, file):
        self.oPDF = cPDFDocument(file)
        self.ungetted = []

    def _PushBackLookahead(self):
        """Return the look-ahead byte to the document, or drop the document at EOF."""
        if self.byte != None:
            self.oPDF.unget(self.byte)
        else:
            self.oPDF = None

    def _ReadRun(self, charClass):
        """Collect consecutive bytes of class charClass, starting with self.byte."""
        characters = []
        while self.byte != None and CharacterClass(self.byte) == charClass:
            characters.append(chr(self.byte))
            self.byte = self.oPDF.byte()
        self._PushBackLookahead()
        self.token = ''.join(characters)
        return (charClass, self.token)

    def _ReadDoubled(self):
        """Handle '<' and '>': two identical bytes in a row form '<<' or '>>'."""
        first = self.byte
        self.byte = self.oPDF.byte()
        if self.byte == first:
            return (CHAR_DELIMITER, chr(first) * 2)
        self.oPDF.unget(self.byte)
        return (CHAR_DELIMITER, chr(first))

    def _ReadComment(self):
        """Collect a '%' comment up to and including its end-of-line byte.

        The byte following the end-of-line byte is added too when it is a
        line feed; any other byte is pushed back.
        """
        characters = []
        while self.byte != None:
            characters.append(chr(self.byte))
            atEndOfLine = self.byte == 10 or self.byte == 13
            self.byte = self.oPDF.byte()
            if atEndOfLine:
                break
        if self.byte == None:
            self.oPDF = None
        elif self.byte == 10:
            characters.append(chr(self.byte))
        else:
            self.oPDF.unget(self.byte)
        self.token = ''.join(characters)
        return (CHAR_DELIMITER, self.token)

    def Token(self):
        """Return the next token, or None when the input is exhausted.

        Tokens pushed back with unget() are returned first.
        """
        if self.ungetted:
            return self.ungetted.pop()
        if self.oPDF == None:
            return None
        self.byte = self.oPDF.byte()
        if self.byte == None:
            self.oPDF = None
            return None
        charClass = CharacterClass(self.byte)
        if charClass == CHAR_WHITESPACE or charClass == CHAR_REGULAR:
            return self._ReadRun(charClass)
        if self.byte == 0x3C or self.byte == 0x3E:
            return self._ReadDoubled()
        if self.byte == 0x25:
            return self._ReadComment()
        return (CHAR_DELIMITER, chr(self.byte))

    def TokenIgnoreWhiteSpace(self):
        """Return the next token that is not whitespace, or None at the end."""
        while True:
            token = self.Token()
            if token == None or token[0] != CHAR_WHITESPACE:
                return token

    def Tokens(self):
        """Return a list with all remaining tokens."""
        return list(iter(self.Token, None))

    def unget(self, byte):
        """Push a token back; despite its name, the argument is a token."""
        self.ungetted.append(byte)
class cPDFParser:
    """Group the tokens of a PDF into cPDFElement* objects.

    GetObject() returns one element per call and None when the input is
    exhausted. Compared with the previous revision, look-ahead tokens are
    checked for None before use, so a file that ends in the middle of
    "<id> <version> obj", after a '/', or after 'startxref' no longer raises
    TypeError, and None is never pushed back onto the tokenizer.
    """

    def __init__(self, file, verbose=False, extract=None, objstm=None):
        self.context = CONTEXT_NONE
        self.content = []
        self.oPDFTokenizer = cPDFTokenizer(file)
        self.verbose = verbose
        self.extract = extract
        self.objstm = objstm

    def _Todo(self, number):
        """In verbose mode, report the current token as not handled."""
        if self.verbose:
            print('todo %d: %d %s' % (number, self.token[0], repr(self.token[1])))

    def _UngetIfAny(self, token):
        """Push token back onto the tokenizer unless it is None (end of input)."""
        if token != None:
            self.oPDFTokenizer.unget(token)

    def _KeepOrReport(self, number):
        """Inside a context keep the current token, otherwise report it."""
        if self.context != CONTEXT_NONE:
            self.content.append(self.token)
        else:
            self._Todo(number)

    def _HandleDelimiter(self):
        """Process a delimiter token; return an element or None."""
        text = self.token[1]
        if text[0] == '%':
            if self.context == CONTEXT_OBJ:
                self.content.append(self.token)
                return None
            return cPDFElementComment(text)
        if text == '/':
            # A '/' directly followed by regular characters is a name: both
            # tokens are merged into a single delimiter token.
            self.token2 = self.oPDFTokenizer.Token()
            if self.token2 != None and self.token2[0] == CHAR_REGULAR:
                if self.context != CONTEXT_NONE:
                    self.content.append((CHAR_DELIMITER, text + self.token2[1]))
                elif self.verbose:
                    print('todo 1: %s' % (text + self.token2[1]))
                return None
            self._UngetIfAny(self.token2)
            self._KeepOrReport(2)
            return None
        self._KeepOrReport(3)
        return None

    def _CloseContext(self, element, ungetCurrent):
        """Leave the current context and hand back the finished element."""
        if ungetCurrent:
            # The keyword that ended the section starts the next element.
            self.oPDFTokenizer.unget(self.token)
        self.context = CONTEXT_NONE
        self.content = []
        return element

    def _HandleRegularInContext(self):
        """Process a regular token while inside obj, trailer or xref."""
        text = self.token[1]
        if self.context == CONTEXT_OBJ:
            if text == 'endobj':
                self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content, self.objstm)
                return self._CloseContext(self.oPDFElementIndirectObject, False)
        elif self.context == CONTEXT_TRAILER:
            if text == 'startxref' or text == 'xref':
                self.oPDFElementTrailer = cPDFElementTrailer(self.content)
                return self._CloseContext(self.oPDFElementTrailer, True)
        else:
            if text == 'trailer' or text == 'xref':
                self.oPDFElementXref = cPDFElementXref(self.content)
                return self._CloseContext(self.oPDFElementXref, True)
        self.content.append(self.token)
        return None

    def _TryObjectHeader(self):
        """Current token is numeric: look ahead for "<version> obj"."""
        self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
        if self.token2 == None or not IsNumeric(self.token2[1]):
            self._UngetIfAny(self.token2)
            self._Todo(7)
            return
        self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
        if self.token3 != None and self.token3[1] == 'obj':
            self.objectId = int(self.token[1], 10)
            self.objectVersion = int(self.token2[1], 10)
            self.context = CONTEXT_OBJ
            return
        self._UngetIfAny(self.token3)
        self.oPDFTokenizer.unget(self.token2)
        self._Todo(6)

    def _HandleRegularOutsideContext(self):
        """Process a regular token found outside any obj/trailer/xref."""
        text = self.token[1]
        if IsNumeric(text):
            self._TryObjectHeader()
        elif text == 'trailer':
            self.context = CONTEXT_TRAILER
            self.content = [self.token]
        elif text == 'xref':
            self.context = CONTEXT_XREF
            self.content = [self.token]
        elif text == 'startxref':
            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if self.token2 and IsNumeric(self.token2[1]):
                return cPDFElementStartxref(int(self.token2[1], 10))
            self._UngetIfAny(self.token2)
            self._Todo(9)
        elif self.extract:
            # Everything up to the end of the input becomes one malformed
            # element.
            pieces = []
            while self.token:
                pieces.append(self.token[1])
                self.token = self.oPDFTokenizer.Token()
            self.bytes = ''.join(pieces)
            return cPDFElementMalformed(self.bytes)
        else:
            self._Todo(10)
        return None

    def GetObject(self):
        """Return the next element of the document, or None at end of input.

        Whitespace tokens are only requested from the tokenizer inside an
        indirect object, where they can be part of stream data.
        """
        while True:
            if self.context == CONTEXT_OBJ:
                self.token = self.oPDFTokenizer.Token()
            else:
                self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if not self.token:
                return None
            element = None
            if self.token[0] == CHAR_DELIMITER:
                element = self._HandleDelimiter()
            elif self.token[0] == CHAR_WHITESPACE:
                self._KeepOrReport(4)
            elif self.context != CONTEXT_NONE:
                element = self._HandleRegularInContext()
            else:
                element = self._HandleRegularOutsideContext()
            if element is not None:
                return element
class cPDFElementComment:
    """A comment element; comment holds the text of the comment token."""

    def __init__(self, comment):
        self.comment = comment
        self.type = PDF_ELEMENT_COMMENT
class cPDFElementXref:
    """An xref element; content is the list of tokens collected for it."""

    def __init__(self, content):
        self.content = content
        self.type = PDF_ELEMENT_XREF
class cPDFElementTrailer:
    """A trailer element; content is the list of tokens collected for it."""

    def __init__(self, content):
        self.content = content
        self.type = PDF_ELEMENT_TRAILER

    def Contains(self, keyword):
        """Case-insensitive search for keyword in the canonicalized trailer text.

        The text of all tokens up to (not including) a token whose text is
        'stream' is canonicalized and concatenated before searching.
        """
        pieces = []
        for token in self.content:
            if token[1] == 'stream':
                break
            pieces.append(Canonicalize(token[1]))
        haystack = ''.join(pieces).upper()
        return keyword.upper() in haystack
2015-03-15 12:25:47 +00:00
def IIf(expr, truepart, falsepart):
    """Return truepart when expr is true, falsepart otherwise.

    Both alternatives are plain values and are evaluated by the caller.
    """
    return truepart if expr else falsepart
class cPDFElementIndirectObject:
    """An indirect object: id, version and the tokens between obj and endobj.

    objstm is stored as given (None by default).
    NOTE(review): objstm presumably identifies the /ObjStm object this object
    was extracted from - confirm against the callers.
    """

    def __init__(self, id, version, content, objstm=None):
        self.type = PDF_ELEMENT_INDIRECT_OBJECT
        self.id = id
        self.version = version
        self.content = content
        self.objstm = objstm
        if self.ContainsStream():
            self._SplitGluedEndstream()

    def _SplitGluedEndstream(self):
        """Fix stream for Ghostscript bug reported by Kurt.

        When the last non-whitespace token is a regular token that merely
        ends with 'endstream' (stream data not followed by whitespace), it is
        split into the data part and a separate 'endstream' token.
        """
        position = len(self.content) - 1
        # Bounds check first: the previous revision subscripted before testing
        # position >= 0 and could wrap around to negative indexes.
        while position >= 0 and self.content[position][0] == CHAR_WHITESPACE:
            position -= 1
        if position < 0:
            return
        tokenType, tokenText = self.content[position]
        if tokenType != CHAR_REGULAR:
            return
        if tokenText == 'endstream' or not tokenText.endswith('endstream'):
            return
        self.content = (self.content[0:position]
                        + [(tokenType, tokenText[:-len('endstream')]), (tokenType, 'endstream')]
                        + self.content[position + 1:])

    def GetType(self):
        """Return the token following /Type in the outermost dictionary, or ''."""
        content = CopyWithoutWhiteSpace(self.content)
        lastIndex = len(content) - 1
        depth = 0
        for index, token in enumerate(content):
            isDelimiter = token[0] == CHAR_DELIMITER
            if isDelimiter and token[1] == '<<':
                depth += 1
            if isDelimiter and token[1] == '>>':
                depth -= 1
            if depth == 1 and isDelimiter and index < lastIndex and EqualCanonical(token[1], '/Type'):
                return content[index + 1][1]
        return ''

    def GetReferences(self):
        """Return the (id, version, 'R') string triples of all references."""
        content = CopyWithoutWhiteSpace(self.content)
        references = []
        for index in range(2, len(content)):
            first, second, marker = content[index - 2], content[index - 1], content[index]
            if (marker[0] == CHAR_REGULAR and marker[1] == 'R'
                    and first[0] == CHAR_REGULAR and IsNumeric(first[1])
                    and second[0] == CHAR_REGULAR and IsNumeric(second[1])):
                references.append((first[1], second[1], marker[1]))
        return references

    def References(self, index):
        """Return True when the object references object id index (a string)."""
        for reference in self.GetReferences():
            if reference[0] == index:
                return True
        return False

    def ContainsStream(self):
        """Return the tokens preceding the 'stream' keyword, or False.

        Note that the returned list is empty, hence false, when 'stream' is
        the very first token.
        """
        for index, token in enumerate(self.content):
            if token[0] == CHAR_REGULAR and token[1] == 'stream':
                return self.content[0:index]
        return False

    def Contains(self, keyword):
        """Case-insensitive search for keyword in the text before the stream."""
        pieces = []
        for token in self.content:
            if token[1] == 'stream':
                break
            pieces.append(Canonicalize(token[1]))
        return keyword.upper() in ''.join(pieces).upper()

    def ContainsName(self, keyword):
        """Return True when a delimiter token before the stream equals keyword
        (compared with EqualCanonical)."""
        for token in self.content:
            if token[1] == 'stream':
                return False
            if token[0] == CHAR_DELIMITER and EqualCanonical(token[1], keyword):
                return True
        return False

    def StreamContains(self, keyword, filter, casesensitive, regex, overridingfilters):
        """Search the stream data for keyword.

        With filter set the decoded stream is searched; when the stream has
        no filters the raw data is searched instead. Returns False when the
        object has no (complete) stream; otherwise a match object or None
        for regex searches and a boolean for plain searches.
        """
        if not self.ContainsStream():
            return False
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)
        if isinstance(streamData, list):
            # Stream() hands back its list of filter names when no endstream
            # was found: there is no stream data to search.
            return False
        if regex:
            return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
        elif casesensitive:
            return keyword in streamData
        else:
            return keyword.lower() in streamData.lower()

    def Stream(self, filter=True, overridingfilters=''):
        """Return the stream data of the object.

        filter            when true, decode the data with Decompress()
        overridingfilters '' to use the filters named by /Filter, 'raw' to
                          return the data undecoded, or a space-separated
                          list of filter names to use instead

        Returns the (decoded) data, an error message string produced by
        Decompress(), the string 'Unexpected filter state', or - when no
        endstream keyword is reached - the list of filter names collected.
        """
        state = 'start'
        countDirectories = 0
        pieces = []
        filters = []
        for tokenType, tokenText in self.content:
            isDelimiter = tokenType == CHAR_DELIMITER
            isRegular = tokenType == CHAR_REGULAR
            if state == 'start':
                if isDelimiter and tokenText == '<<':
                    countDirectories += 1
                if isDelimiter and tokenText == '>>':
                    countDirectories -= 1
                if countDirectories == 1 and isDelimiter and EqualCanonical(tokenText, '/Filter'):
                    state = 'filter'
                elif countDirectories == 0 and isRegular and tokenText == 'stream':
                    state = 'stream-whitespace'
            elif state == 'filter':
                if isDelimiter and tokenText[0] == '/':
                    filters = [tokenText]
                    state = 'search-stream'
                elif isDelimiter and tokenText == '[':
                    state = 'filter-list'
            elif state == 'filter-list':
                if isDelimiter and tokenText[0] == '/':
                    filters.append(tokenText)
                elif isDelimiter and tokenText == ']':
                    state = 'search-stream'
            elif state == 'search-stream':
                if isRegular and tokenText == 'stream':
                    state = 'stream-whitespace'
            elif state == 'stream-whitespace':
                if tokenType == CHAR_WHITESPACE:
                    # Only the end-of-line right after the stream keyword is
                    # dropped; further whitespace belongs to the data.
                    if tokenText.startswith('\x0D\x0A') and len(tokenText) > 2:
                        pieces.append(tokenText[2:])
                    elif tokenText.startswith('\x0A') and len(tokenText) > 1:
                        pieces.append(tokenText[1:])
                else:
                    pieces.append(tokenText)
                state = 'stream-concat'
            elif state == 'stream-concat':
                if isRegular and tokenText == 'endstream':
                    data = ''.join(pieces)
                    if not filter:
                        return data
                    if overridingfilters == '':
                        return self.Decompress(data, filters)
                    if overridingfilters == 'raw':
                        return data
                    return self.Decompress(data, overridingfilters.split(' '))
                pieces.append(tokenText)
            else:
                return 'Unexpected filter state'
        return filters

    def Decompress(self, data, filters):
        """Apply the decoders named in filters, in order, to data.

        Returns the decoded data, 'No filters' when filters is empty, or a
        message string when a filter is unsupported or decoding fails.
        """
        for name in filters:
            if EqualCanonical(name, '/FlateDecode') or EqualCanonical(name, '/Fl'):
                try:
                    data = FlateDecode(data)
                except zlib.error as e:
                    message = 'FlateDecode decompress failed'
                    if len(data) > 0:
                        first = data[0]
                        if not isinstance(first, int):
                            first = ord(first)
                        if first & 0x0F != 8:
                            message += ', unexpected compression method: %02x' % first
                    # str(e) instead of e.message: exceptions have no message
                    # attribute on Python 3.
                    return message + '. zlib.error %s' % str(e)
            elif EqualCanonical(name, '/ASCIIHexDecode') or EqualCanonical(name, '/AHx'):
                try:
                    data = ASCIIHexDecode(data)
                except Exception:
                    return 'ASCIIHexDecode decompress failed'
            elif EqualCanonical(name, '/ASCII85Decode') or EqualCanonical(name, '/A85'):
                try:
                    data = ASCII85Decode(data.rstrip('>'))
                except Exception:
                    return 'ASCII85Decode decompress failed'
            elif EqualCanonical(name, '/LZWDecode') or EqualCanonical(name, '/LZW'):
                try:
                    data = LZWDecode(data)
                except Exception:
                    return 'LZWDecode decompress failed'
            elif EqualCanonical(name, '/RunLengthDecode') or EqualCanonical(name, '/R'):
                try:
                    data = RunLengthDecode(data)
                except Exception:
                    return 'RunLengthDecode decompress failed'
            else:
                # Not supported (for example CCITTFaxDecode, DCTDecode).
                return 'Unsupported filter: %s' % repr(filters)
        if len(filters) == 0:
            return 'No filters'
        return data

    def StreamYARAMatch(self, rules, decoders, decoderoptions, filter, overridingfilters):
        """Match YARA rules against the stream data and its decoded variants.

        Returns None when the object has no stream, otherwise a list of
        [decoder name, matches] pairs for every decoder output that matched.
        An exception raised while instantiating a decoder is reported and
        re-raised.
        """
        if not self.ContainsStream():
            return None
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)

        oDecoders = [cIdentity(streamData, None)]
        for cDecoder in decoders:
            try:
                oDecoders.append(cDecoder(streamData, decoderoptions))
            except Exception:
                print('Error instantiating decoder: %s' % cDecoder.name)
                raise
        results = []
        for oDecoder in oDecoders:
            while oDecoder.Available():
                yaraResults = rules.match(data=oDecoder.Decode())
                if yaraResults != []:
                    results.append([oDecoder.Name(), yaraResults])
        return results
2015-03-15 12:25:47 +00:00
class cPDFElementStartxref:
    """A startxref element; index is the number that followed the keyword."""

    def __init__(self, index):
        self.index = index
        self.type = PDF_ELEMENT_STARTXREF
class cPDFElementMalformed:
    """A malformed element; content is the text that could not be parsed."""

    def __init__(self, content):
        self.content = content
        self.type = PDF_ELEMENT_MALFORMED
def TrimLWhiteSpace(data):
    """Return data without its leading whitespace tokens.

    data itself is returned when there is nothing to trim.
    """
    start = 0
    while start < len(data) and data[start][0] == CHAR_WHITESPACE:
        start += 1
    if start == 0:
        return data
    return data[start:]
def TrimRWhiteSpace(data):
    """Return data without its trailing whitespace tokens.

    data itself is returned when there is nothing to trim.
    """
    end = len(data)
    while end > 0 and data[end - 1][0] == CHAR_WHITESPACE:
        end -= 1
    if end == len(data):
        return data
    return data[:end]
class cPDFParseDictionary:
    """Parse the tokens of a PDF dictionary into nested (key, value) lists.

    parsed is a list of (key, value) tuples, where value is either a list of
    strings or, for a nested dictionary, again a list of (key, value)
    tuples. parsed is None when content is not a dictionary or when the
    dictionary is not terminated.
    """

    def __init__(self, content, nocanonicalizedoutput):
        self.content = content
        self.nocanonicalizedoutput = nocanonicalizedoutput
        # ParseDictionary reads self.verbose; it was never initialised before.
        self.verbose = False
        self.parsed = None
        dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
        if dataTrimmed == []:
            return
        lastToken = dataTrimmed[-1]
        if self.isOpenDictionary(dataTrimmed[0]) and (self.isCloseDictionary(lastToken) or self.couldBeCloseDictionary(lastToken)):
            self.parsed = self.ParseDictionary(dataTrimmed)[0]

    def isOpenDictionary(self, token):
        return token[0] == CHAR_DELIMITER and token[1] == '<<'

    def isCloseDictionary(self, token):
        return token[0] == CHAR_DELIMITER and token[1] == '>>'

    def couldBeCloseDictionary(self, token):
        """True for a delimiter token whose text, right-stripped, ends with
        '>>' (for example a '%' token that swallowed the closing '>>')."""
        return token[0] == CHAR_DELIMITER and token[1].rstrip().endswith('>>')

    def ParseDictionary(self, tokens):
        """Parse one dictionary starting at tokens[0].

        Returns (dictionary, remaining tokens); the remaining tokens start
        with the '>>' that closed the dictionary. Returns (None, tokens)
        when tokens does not start with '<<', and (None, []) when the tokens
        run out before the dictionary is closed (the previous revision
        returned a bare None there, which made callers raise TypeError).
        """
        state = 0 # start
        dictionary = []
        while tokens != []:
            token = tokens[0]
            text = token[1]
            if state == 0:
                if self.isOpenDictionary(token):
                    state = 1
                else:
                    return None, tokens
            elif state == 1:
                # Expecting a key.
                if self.isOpenDictionary(token):
                    pass
                elif self.isCloseDictionary(token):
                    return dictionary, tokens
                elif token[0] != CHAR_WHITESPACE:
                    key = ConditionalCanonicalize(text, self.nocanonicalizedoutput)
                    value = []
                    state = 2
            elif state == 2:
                # Collecting the value of key.
                if self.isOpenDictionary(token):
                    value, tokens = self.ParseDictionary(tokens)
                    if value is None:
                        # Nested dictionary is not terminated.
                        return None, []
                    dictionary.append((key, value))
                    state = 1
                elif self.isCloseDictionary(token):
                    dictionary.append((key, value))
                    return dictionary, tokens
                elif value == [] and token[0] == CHAR_WHITESPACE:
                    pass
                elif value == [] and text == '[':
                    value.append(text)
                elif value != [] and value[0] == '[' and text != ']':
                    value.append(text)
                elif value != [] and value[0] == '[' and text == ']':
                    value.append(text)
                    dictionary.append((key, value))
                    value = []
                    state = 1
                elif value == [] and text == '(':
                    value.append(text)
                elif value != [] and value[0] == '(' and text != ')':
                    if text[0] == '%':
                        # Inside a string a '%' does not start a comment: the
                        # rest of the token is tokenized again and spliced in
                        # behind the current token.
                        tokens = [token] + cPDFTokenizer(StringIO(text[1:])).Tokens() + tokens[1:]
                        value.append('%')
                    else:
                        value.append(text)
                elif value != [] and value[0] == '(' and text == ')':
                    value.append(text)
                    balanced = 0
                    for item in value:
                        if item == '(':
                            balanced += 1
                        elif item == ')':
                            balanced -= 1
                    if balanced < 0 and self.verbose:
                        print('todo 11: ' + repr(value))
                    if balanced < 1:
                        dictionary.append((key, value))
                        value = []
                        state = 1
                elif value != [] and text[0] == '/':
                    dictionary.append((key, value))
                    key = ConditionalCanonicalize(text, self.nocanonicalizedoutput)
                    value = []
                    state = 2
                else:
                    value.append(ConditionalCanonicalize(text, self.nocanonicalizedoutput))
            tokens = tokens[1:]
        return None, []

    def Retrieve(self):
        return self.parsed

    # NOTE(review): the blanks inside the format strings, the joiner values and
    # the nested prefix below were reconstructed from a whitespace-mangled copy
    # of this file - confirm them against the pristine source.
    def PrettyPrintSubElement(self, prefix, e):
        """Print one (key, value) entry e, indented by prefix."""
        if e[1] == []:
            print('%s  %s' % (prefix, e[0]))
        elif type(e[1][0]) == type(''):
            if len(e[1]) == 3 and IsNumeric(e[1][0]) and e[1][1] == '0' and e[1][2] == 'R':
                joiner = ' '
            else:
                joiner = ''
            value = joiner.join(e[1]).strip()
            reprValue = repr(value)
            # Show the repr when the value contains characters repr escapes.
            if "'" + value + "'" != reprValue:
                value = reprValue
            print('%s  %s %s' % (prefix, e[0], value))
        else:
            print('%s  %s' % (prefix, e[0]))
            self.PrettyPrintSub(prefix + '    ', e[1])

    def PrettyPrintSub(self, prefix, dictionary):
        """Print dictionary between '<<' and '>>'; print nothing for None."""
        if dictionary != None:
            print('%s<<' % prefix)
            for e in dictionary:
                self.PrettyPrintSubElement(prefix, e)
            print('%s>>' % prefix)

    def PrettyPrint(self, prefix):
        self.PrettyPrintSub(prefix, self.parsed)

    def Get(self, select):
        """Return the value of top-level key select, or None when absent."""
        if self.parsed is None:
            return None
        for key, value in self.parsed:
            if key == select:
                return value
        return None

    def GetNestedSub(self, dictionary, select):
        """Print the first entry with key select found in dictionary or in a
        nested dictionary; returns what PrettyPrintSubElement returns."""
        for key, value in dictionary:
            if key == select:
                return self.PrettyPrintSubElement('', [select, value])
            if type(value) == type([]) and len(value) > 0 and type(value[0]) == type((None,)):
                result = self.GetNestedSub(value, select)
                if result != None:
                    return self.PrettyPrintSubElement('', [select, result])
        return None

    def GetNested(self, select):
        if self.parsed is None:
            return None
        return self.GetNestedSub(self.parsed, select)
2015-03-15 12:25:47 +00:00
def FormatOutput(data, raw):
    """Format data for printing.

    With raw set, a list of tokens is returned as the concatenation of the
    token texts and anything else is returned unchanged. Without raw, the
    escaped representation of data is returned (ascii() on Python 3, repr()
    on Python 2).
    """
    if not raw:
        if sys.version_info[0] > 2:
            return ascii(data)
        return repr(data)
    if type(data) == type([]):
        return ''.join(token[1] for token in data)
    return data
2020-05-03 12:47:01 +00:00
#Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
    """Write data to stdout.

    On Python 3 the data is written to the binary layer of stdout; it is
    first passed through C2BIP3 because that layer only accepts bytes, while
    stream data can arrive as str. On Python 2 the data is written in chunks
    of 10000 characters, flushing after each chunk; writing stops silently
    when a flush raises IOError.
    """
    if sys.version_info[0] > 2:
        sys.stdout.buffer.write(C2BIP3(data))
        return
    for offset in range(0, len(data), 10000):
        sys.stdout.write(data[offset:offset + 10000])
        try:
            sys.stdout.flush()
        except IOError:
            return
def IfWIN32SetBinary(io):
    """Switch the file object io to binary mode when running on Windows; do nothing elsewhere.

    io: object with a fileno() method (sys.stdout is what this file passes).
    """
    # sys.platform is exactly 'win32' on Windows; a padded literal can never match it
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(io.fileno(), os.O_BINARY)
def PrintOutputObject(object, options):
    """Print one indirect object in human-readable form, or dump its stream.

    object:  parsed indirect object (provides id, version, objstm, content, Stream(),
             ContainsStream(), GetType(), GetReferences()).
    options: parsed command-line options; dump, filter, overridingfilters, debug, raw, hash,
             content and nocanonicalizedoutput are read here.

    NOTE(review): this copy of the file has lost its indentation and its string literals are
    padded with spaces; the nesting below was rebuilt from the statement order and the literals
    are reproduced as found -- confirm both against the original file.
    """
    # dump-to-stdout mode: write the stream and nothing else
    if options.dump == ' - ':
        filtered = object.Stream(options.filter == True, options.overridingfilters)
        if filtered == []:
            filtered = ' '
        IfWIN32SetBinary(sys.stdout)
        StdoutWriteChunked(filtered)
        return

    # header lines: identity, containing object stream, type and references
    print(' obj %d %d ' % (object.id, object.version))
    if object.objstm != None:
        print(' Containing /ObjStm: %d %d ' % object.objstm)
    print(' Type: %s ' % ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput))
    print(' Referencing: %s ' % ' , '.join(map(lambda x: ' %s %s %s ' % x, object.GetReferences())))
    dataPrecedingStream = object.ContainsStream()
    oPDFParseDictionary = None
    if dataPrecedingStream:
        # object with a stream: the dictionary is parsed from the tokens preceding the stream
        print(' Contains stream ')
        if options.debug:
            print(' %s ' % FormatOutput(dataPrecedingStream, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(dataPrecedingStream, options.nocanonicalizedoutput)
        if options.hash:
            # length, MD5 and a short hex/ascii preview, first unfiltered then filtered
            streamContent = object.Stream(False, options.overridingfilters)
            print(' unfiltered ')
            print(' len: %6d md5: %s ' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
            print(' %s ' % HexAsciiDumpLine(streamContent))
            streamContent = object.Stream(True, options.overridingfilters)
            print(' filtered ')
            print(' len: %6d md5: %s ' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
            print(' %s ' % HexAsciiDumpLine(streamContent))
            streamContent = None
    else:
        # object without a stream: the dictionary is parsed from the whole content
        if options.debug or options.raw:
            print(' %s ' % FormatOutput(object.content, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(object.content, options.nocanonicalizedoutput)
    print(' ')
    oPDFParseDictionary.PrettyPrint(' ')
    print(' ')
    if options.filter and not options.dump:
        filtered = object.Stream(overridingfilters=options.overridingfilters)
        # an empty list from Stream() is treated as "no stream data": fall back to the content
        if filtered == []:
            print(' %s ' % FormatOutput(object.content, options.raw))
        else:
            print(' %s ' % FormatOutput(filtered, options.raw))
    if options.content:
        if object.ContainsStream():
            stream = object.Stream(False, options.overridingfilters)
            if stream != []:
                print(' %s ' % FormatOutput(stream, options.raw))
        else:
            print(' '.join([token[1] for token in object.content]))

    if options.dump:
        # dump-to-file mode: any failure is reported on stdout instead of being raised
        filtered = object.Stream(options.filter == True, options.overridingfilters)
        if filtered == []:
            filtered = ' '
        try:
            fDump = open(options.dump, ' wb ')
            try:
                fDump.write(C2BIP3(filtered))
            except:
                print(' Error writing file %s ' % options.dump)
            fDump.close()
        except:
            print(' Error writing file %s ' % options.dump)
    print(' ')
    return
def Canonicalize(sIn):
    """Return the PDF name sIn with its #xx hexadecimal escapes decoded.

    Only strings that start with '/' and contain '#' are touched; every other string
    (including the empty string) is returned unchanged. A '#' that is not followed by two
    characters that parse as a hexadecimal number is copied through literally.
    """
    # single-character comparisons: the operands must be the bare characters '/' and '#'
    if sIn == '' or sIn[0] != '/' or '#' not in sIn:
        return sIn
    iLen = len(sIn)
    parts = []
    i = 0
    while i < iLen:
        if sIn[i] == '#' and i < iLen - 2:
            try:
                parts.append(chr(int(sIn[i + 1:i + 3], 16)))
                i += 2  # skip the two hex digits that were just consumed
            except ValueError:
                # not a valid escape: keep the '#' and continue with the next character
                parts.append(sIn[i])
        else:
            parts.append(sIn[i])
        i += 1
    return ''.join(parts)
def EqualCanonical(s1, s2):
    """Return True when the canonical form of s1 equals s2; s2 itself is compared as given."""
    canonical = Canonicalize(s1)
    return canonical == s2
def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
    """Return sIn untouched when nocanonicalizedoutput is set, otherwise its canonical form."""
    return sIn if nocanonicalizedoutput else Canonicalize(sIn)
# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
def ASCII85Decode(data):
    """Decode ASCII85 (base-85) encoded data and return the decoded bytes.

    data: str or bytes. Characters outside '!'..'u', 'z' and '~' are ignored.
    'z' stands for four zero bytes and must not occur inside a group (AssertionError otherwise).
    '~' ends the data; an incomplete final group is padded with 'u' and the surplus bytes dropped.
    Raises struct.error when a group encodes a value that does not fit in 32 bits.
    """
    import struct
    n = b = 0
    chunks = []
    for c in data:
        if isinstance(c, int):
            # iterating over bytes on Python 3 yields integers
            c = chr(c)
        if '!' <= c <= 'u':
            n += 1
            b = b * 85 + (ord(c) - 33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 'z':
            assert n == 0
            chunks.append(b'\0\0\0\0')
        elif c == '~':
            if n:
                # pad the partial group with the highest digit, then keep only the n - 1 real bytes
                for _ in range(5 - n):
                    b = b * 85 + 84
                chunks.append(struct.pack('>L', b)[:n - 1])
            break
    # struct.pack produces bytes, so the pieces are joined as bytes (str on Python 2)
    return b''.join(chunks)
def ASCIIHexDecode(data):
    """Decode ASCIIHex encoded data and return the decoded bytes.

    data: str or bytes. Spaces, tabs, line feeds and carriage returns are removed (they are
    also used to obfuscate the data) and trailing '>' end-of-data markers are stripped.
    An odd number of digits is completed with a trailing 0, as the PDF specification
    prescribes for this filter. Raises the binascii error for non-hexadecimal characters.
    """
    if not isinstance(data, str):
        # bytes on Python 3: map every byte to the character with the same code point
        data = data.decode('latin-1')
    digits = ''.join([c for c in data if c not in ' \t\n\r']).rstrip('>')
    if len(digits) % 2:
        digits += '0'
    return binascii.unhexlify(digits)
2020-05-03 12:47:01 +00:00
# if inflating fails, we try to inflate byte per byte (sample 4da299d6e52bbb79c0ac00bad6a1d51d4d5fe42965a8d94e88a359e5277117e2)
def FlateDecode(data):
    """Inflate zlib-compressed data.

    The whole input is inflated in one call first. When that fails and the input is longer than
    10 items, the input is fed to a decompression object one byte at a time; the partial result
    is returned when at most 2 trailing bytes could not be processed. In every other case the
    original exception is re-raised.

    NOTE(review): C2BIP3 is defined elsewhere in this file; it is assumed to return the data as
    bytes, which zlib.decompress requires on Python 3 -- confirm.
    """
    try:
        return zlib.decompress(C2BIP3(data))
    except Exception:
        if len(data) <= 10:
            raise
        oDecompress = zlib.decompressobj()
        raw = C2BIP3(data)
        chunks = []
        count = 0
        for position in range(len(raw)):
            try:
                # slice, do not index: indexing bytes on Python 3 yields an int, which decompress() rejects
                chunks.append(oDecompress.decompress(raw[position:position + 1]))
                count += 1
            except zlib.error:
                break
        if len(data) - count <= 2:
            return b''.join(chunks)
        else:
            raise
2015-03-15 12:25:47 +00:00
def RunLengthDecode(data):
    """Decode run-length encoded data; the result has the same type (str or bytes) as data.

    Each length byte L is followed by:
      L in 0..127   -> L + 1 bytes that are copied literally
      L in 129..255 -> one byte that is repeated 257 - L times
      L == 128      -> end of data
    Decoding also stops at the end of the input when the end-of-data byte is missing.
    """
    chunks = []
    position = 0
    size = len(data)
    while position < size:
        # slicing keeps this working for str and for bytes (ord() accepts both)
        runLength = ord(data[position:position + 1])
        position += 1
        if runLength == 128:
            break
        if runLength < 128:
            # includes runLength == 0, which means "copy exactly one byte"
            chunks.append(data[position:position + runLength + 1])
            position += runLength + 1
        else:
            chunks.append(data[position:position + 1] * (257 - runLength))
            position += 1
    # join with an empty value of the input's own type
    return data[:0].join(chunks)
#### LZW code sourced from pdfminer
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
class LZWDecoder(object):
    """Incremental LZW decoder reading variable-width codes (9 to 12 bits) from a file-like object.

    fp must provide read(1) returning one character (or an empty value at end of input).
    Code 256 resets the string table, code 257 is ignored; run() yields the decoded text of
    every code in turn.
    """

    def __init__(self, fp):
        self.fp = fp
        self.buff = 0        # byte currently being consumed
        self.bpos = 8        # bits of buff already used; 8 forces a read on the first call
        self.nbits = 9       # current code width
        self.table = None    # string table, created when code 256 is seen
        self.prevbuf = None  # text decoded for the previous code
        return

    def readbits(self, bits):
        """Return the next `bits` bits of the input as an integer; raise EOFError at end of input."""
        v = 0
        while True:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # the request fits in what is left of the current byte:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            else:
                # take the r remaining bits, then continue with the next byte:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r) - 1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Process one code and return the text it decodes to ('' for the control codes)."""
        x = ''
        if code == 256:
            self.table = [chr(c) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            # must be empty (falsy) so that the next code is taken as the first one after a reset
            self.prevbuf = ''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf + x[0])
            else:
                # code not in the table yet: it denotes previous text + its own first character
                self.table.append(self.prevbuf + self.prevbuf[0])
                x = self.table[code]
            l = len(self.table)
            # widen the codes one entry before the table outgrows the current width
            if l == 511:
                self.nbits = 10
            elif l == 1023:
                self.nbits = 11
            elif l == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Generate the decoded text of every code until the input is exhausted."""
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
        return
####
def LZWDecode(data):
    """Decode LZW-compressed data by running LZWDecoder over it and concatenating its output."""
    # the pieces must be concatenated without any separator, otherwise the output is corrupted
    return ''.join(LZWDecoder(StringIO(data)).run())
2020-05-03 12:47:01 +00:00
def PrintGenerateObject(object, options, newId=None):
    """Print the line of generated Python code that recreates one indirect object.

    object:  parsed indirect object (id, version, content, ContainsStream(), Stream()).
    options: parsed command-line options; filter and overridingfilters are read here.
    newId:   object number to use in the generated code instead of object.id.

    Objects with a stream produce an oPDF.stream(...) or oPDF.stream2(...) call, objects
    without a stream an oPDF.indirectobject(...) call.

    NOTE(review): this copy of the file has lost its indentation and the whitespace inside its
    string literals and regular expressions; both are reproduced as found -- confirm against
    the original file.
    """
    if newId == None:
        objectId = object.id
    else:
        objectId = newId
    dataPrecedingStream = object.ContainsStream()
    if dataPrecedingStream:
        if options.filter:
            decompressed = object.Stream(True, options.overridingfilters)
            # Stream() reports filtering problems as text; in that case emit the unfiltered stream
            if decompressed == ' No filters ' or decompressed.startswith(' Unsupported filter: '):
                print(' oPDF.stream( %d , %d , %s , %s ) ' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub(' /Length \ s+ \ d+ ', ' /Length %d ', FormatOutput(dataPrecedingStream, True)).strip())))
            else:
                # emit the decoded stream: remove /Length, /Filter and the enclosing << >> from the dictionary
                dictionary = FormatOutput(dataPrecedingStream, True)
                dictionary = re.sub(r' /Length \ s+ \ d+ ', ' ', dictionary)
                dictionary = re.sub(r' /Filter \ s*/[a-zA-Z0-9]+ ', ' ', dictionary)
                dictionary = re.sub(r' /Filter \ s* \ [.+ \ ] ', ' ', dictionary)
                dictionary = re.sub(r' ^ \ s*<< ', ' ', dictionary)
                dictionary = re.sub(r' >> \ s*$ ', ' ', dictionary)
                dictionary = dictionary.strip()
                print(" oPDF.stream2( %d , %d , %s , %s , ' f ' ) " % (objectId, object.version, repr(decompressed.rstrip()), repr(dictionary)))
        else:
            # no filtering requested: emit the stream as stored, with /Length turned into a placeholder
            print(' oPDF.stream( %d , %d , %s , %s ) ' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub(' /Length \ s+ \ d+ ', ' /Length %d ', FormatOutput(dataPrecedingStream, True)).strip())))
    else:
        print(' oPDF.indirectobject( %d , %d , %s ) ' % (objectId, object.version, repr(FormatOutput(object.content, True).strip())))
def PrintObject(object, options):
    """Print object as generated Python code when options.generate is set, else as readable output."""
    printer = PrintGenerateObject if options.generate else PrintOutputObject
    printer(object, options)
def File2Strings(filename):
    """Return the lines of text file filename as a list, each without its trailing newline.

    Returns None when the file cannot be opened or read. Only the newline is removed;
    other leading and trailing whitespace of a line is kept.
    """
    try:
        with open(filename, 'r') as f:
            # a real list (not a lazy map), so callers can concatenate and re-iterate the result
            return [line.rstrip('\n') for line in f.readlines()]
    except (IOError, OSError, UnicodeError):
        return None
def ProcessAt(argument):
    """Expand an @file argument into the lines of that file.

    An argument that starts with '@' names a text file; its lines are returned as read by
    File2Strings. Any other argument is returned as a one-element list.
    Raises Exception when the file cannot be read.
    """
    # the marker is the single character '@' (argument[1:] below strips exactly one character)
    if not argument.startswith('@'):
        return [argument]
    strings = File2Strings(argument[1:])
    if strings is None:
        raise Exception(' Error reading %s ' % argument)
    return strings
def YARACompile(ruledata):
    """Compile YARA rules from an inline definition, a directory, a file or an @file list.

    ruledata starting with '#' is an inline rule:
      '#h#' + hexadecimal text  -> rule source is the decoded hex
      '#b#' + base64 text       -> rule source is the decoded base64
      '#s#' + text              -> a generated rule searching for that text (ascii, wide, nocase)
      '#q#' + text              -> rule source with every single quote replaced by a double quote
      '#'   + text              -> rule source is the text itself
    Otherwise ruledata is a directory (every file below it is compiled), a file name, or
    '@file' listing file names.
    """
    # the prefixes are 1 and 3 characters long, matching the [1:] and [3:] slices below
    if ruledata.startswith('#'):
        if ruledata.startswith('#h#'):
            rule = binascii.a2b_hex(ruledata[3:])
        elif ruledata.startswith('#b#'):
            rule = binascii.a2b_base64(ruledata[3:])
        elif ruledata.startswith('#s#'):
            # the quoted search text must be exactly what the user supplied, without added padding
            rule = ' rule string { strings: $a = "%s" ascii wide nocase condition: $a} ' % ruledata[3:]
        elif ruledata.startswith('#q#'):
            rule = ruledata[3:].replace("'", '"')
        else:
            rule = ruledata[1:]
        return yara.compile(source=rule)
    else:
        # every rule file is registered under its own path as namespace
        dFilepaths = {}
        if os.path.isdir(ruledata):
            for root, dirs, files in os.walk(ruledata):
                for name in files:
                    filename = os.path.join(root, name)
                    dFilepaths[filename] = filename
        else:
            for filename in ProcessAt(ruledata):
                dFilepaths[filename] = filename
        return yara.compile(filepaths=dFilepaths)
def AddDecoder(cClass):
    """Register decoder class cClass by appending it to the module-level decoders list."""
    global decoders
    registry = decoders
    registry.append(cClass)
class cDecoderParent():
    """Base class for stream decoders; it adds no attributes or methods of its own."""
def GetScriptPath():
    """Return the directory this program runs from.

    For a frozen executable (sys.frozen is set) that is the directory of sys.executable,
    otherwise the directory part of sys.argv[0].
    """
    # the attribute is named exactly 'frozen'; any other spelling always yields the default
    if getattr(sys, 'frozen', False):
        return os.path.dirname(sys.executable)
    return os.path.dirname(sys.argv[0])
def LoadDecoders(decoders, verbose):
    """Load and execute the decoder scripts named in decoders.

    decoders: comma-separated script names; an entry may be '@file' listing names. '.py' is
              appended when missing. A bare name that does not exist in the current directory
              is also looked up in the directory of this program.
    verbose:  when true, a failure is re-raised after the error line has been printed;
              otherwise loading continues with the next decoder.
    """
    if decoders == '':
        return
    scriptPath = GetScriptPath()
    # flatten explicitly so that any iterable returned by ProcessAt is accepted
    names = [name for entry in decoders.split(',') for name in ProcessAt(entry)]
    for decoder in names:
        try:
            if not decoder.lower().endswith('.py'):
                decoder += '.py'
            if os.path.dirname(decoder) == '':
                if not os.path.exists(decoder):
                    scriptDecoder = os.path.join(scriptPath, decoder)
                    if os.path.exists(scriptDecoder):
                        decoder = scriptDecoder
            with open(decoder, 'r') as fDecoder:
                source = fDecoder.read()
            # NOTE(review): this runs the file's code inside this module's namespace; only load
            # decoder scripts from trusted locations
            exec(source, globals(), globals())
        except Exception:
            print(' Error loading decoder: %s ' % decoder)
            if verbose:
                raise
class cIdentity(cDecoderParent):
    """Decoder that returns the stream it was given, unchanged."""
    name = ' Identity function decoder '

    def __init__(self, stream, options):
        self.stream, self.options = stream, options
        # becomes False once Decode() has been called
        self.available = True

    def Available(self):
        """Tell whether Decode() has not been called yet."""
        return self.available

    def Decode(self):
        """Return the stored stream and mark this decoder as used."""
        self.available = False
        return self.stream

    def Name(self):
        """Return the label of this decoder."""
        return ' '
def DecodeFunction(decoders, options, stream):
    """Decode stream with the first decoder class in decoders.

    Returns stream unchanged when decoders is an empty list; otherwise the first class is
    instantiated with (stream, options.decoderoptions) and the result of its Decode() returned.
    """
    if decoders != []:
        firstDecoder = decoders[0](stream, options.decoderoptions)
        return firstDecoder.Decode()
    return stream
class cDumpStream():
    """Collects dump lines into one text.

    NOTE(review): the padded literals below (' ', ' \n ') are reproduced as found in this copy
    of the file; they look like an artifact of how the copy was produced -- confirm.
    """

    def __init__(self):
        self.text = ' '

    def Addline(self, line):
        """Append line plus the line terminator, unless line equals the skip marker."""
        if line == ' ':
            return
        self.text = self.text + line + ' \n '

    def Content(self):
        """Return everything collected so far."""
        return self.text
def HexDump(data):
    """Return data as lines of two-digit uppercase hex values, dumplinelength values per line.

    NOTE(review): the string literals are reproduced as found in this copy of the file; their
    original spacing cannot be recovered from it -- confirm.
    """
    dump = cDumpStream()
    currentLine = ' '
    for position, char in enumerate(data):
        startsNewLine = position % dumplinelength == 0
        if startsNewLine and currentLine != ' ':
            # line is full: hand it over and start the next one
            dump.Addline(currentLine)
            currentLine = ' '
        separator = IFF(currentLine == ' ', ' ', ' ')
        currentLine += separator + ' %02X ' % ord(char)
    dump.Addline(currentLine)
    return dump.Content()
def CombineHexAscii(hexDump, asciiDump):
    """Join the hex part and the ascii part of one dump line.

    The ascii part is pushed right by three positions for every value missing from a full
    line of dumplinelength values. A hexDump equal to the empty-line marker yields that marker.
    """
    if hexDump == ' ':
        return ' '
    missing = dumplinelength - len(asciiDump)
    padding = ' ' * (3 * missing)
    return hexDump + ' ' + padding + asciiDump
def HexAsciiDump(data):
    """Return data as dump lines: offset, hex values and printable characters.

    Each line covers dumplinelength values; values below 32 are shown through the
    placeholder literal in the ascii part.

    NOTE(review): the string literals are reproduced as found in this copy of the file; their
    original spacing cannot be recovered from it -- confirm.
    """
    dump = cDumpStream()
    hexPart = ' '
    asciiPart = ' '
    for position, char in enumerate(data):
        if position % dumplinelength == 0:
            if hexPart != ' ':
                dump.Addline(CombineHexAscii(hexPart, asciiPart))
            # a new line starts with the offset of its first value
            hexPart = ' %08X : ' % position
            asciiPart = ' '
        hexPart += ' %02X ' % ord(char)
        asciiPart += IFF(ord(char) >= 32, char, ' . ')
    dump.Addline(CombineHexAscii(hexPart, asciiPart))
    return dump.Content()
def HexAsciiDumpLine(data):
    """Return the dump of the first 16 items of data, minus its first 10 characters and its last one."""
    firstLine = HexAsciiDump(data[0:16])
    return firstLine[10:-1]
def ParseINIFile():
    """Return the keys of section [keywords] of pdfid.ini, in file order and without duplicates.

    The file is looked up in the directory returned by GetScriptPath(). An empty list is
    returned when the file or the section does not exist. Key names keep their case.
    """
    oConfigParser = ConfigParser.ConfigParser(allow_no_value=True)
    # keep option names as written instead of lower-casing them
    oConfigParser.optionxform = str
    oConfigParser.read(os.path.join(GetScriptPath(), 'pdfid.ini'))
    keywords = []
    if oConfigParser.has_section('keywords'):
        for key, value in oConfigParser.items('keywords'):
            if key not in keywords:
                keywords.append(key)
    return keywords
def MatchObjectID(id, selection):
    """Return True when object number id is listed in selection.

    selection: comma-separated object numbers, e.g. '1,5,12'; whitespace around a number is
    ignored. The comparison is on whole numbers, so 1 does not match '12'.
    """
    return str(id) in [part.strip() for part in selection.split(',')]
def GetArguments():
    """Return the command-line arguments, preceded by the options from PDFPARSER_OPTIONS.

    The environment variable is split on single spaces; empty fragments (from an empty value
    or from repeated spaces) are dropped so they are not mistaken for arguments.
    """
    arguments = sys.argv[1:]
    envvar = os.getenv('PDFPARSER_OPTIONS')
    if envvar is None:
        return arguments
    return [option for option in envvar.split(' ') if option != ''] + arguments
2015-03-15 12:25:47 +00:00
def Main():
    """Command-line entry point: parse the options, walk the elements of the PDF and print the selection.

    Steps, in order:
      1. build the option parser and parse GetArguments();
      2. print the manual or the usage text when asked for or when no single file is given;
      3. load the decoders, open the document with cPDFParser and work out which element
         types are selected;
      4. loop over all elements (optionally descending into /ObjStm streams) and either
         count them (--stats) or print the ones that match the active selection option;
      5. print the statistics and/or the tail of the generated Python program.

    NOTE(review): this copy of the file has lost its indentation and its string literals are
    padded with spaces; the nesting below was rebuilt from the statement order and the
    literals are reproduced as found -- confirm both against the original file.
    """
    global decoders

    oParser = optparse.OptionParser(usage=' usage: % prog [options] pdf-file|zip-file|url \n ' + __description__, version=' % prog ' + __version__)
    oParser.add_option(' -m ', ' --man ', action=' store_true ', default=False, help=' Print manual ')
    oParser.add_option(' -s ', ' --search ', help=' string to search in indirect objects (except streams) ')
    oParser.add_option(' -f ', ' --filter ', action=' store_true ', default=False, help=' pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only) ')
    oParser.add_option(' -o ', ' --object ', help=' id(s) of indirect object(s) to select, use comma (,) to separate ids (version independent) ')
    oParser.add_option(' -r ', ' --reference ', help=' id of indirect object being referenced (version independent) ')
    oParser.add_option(' -e ', ' --elements ', help=' type of elements to select (cxtsi) ')
    oParser.add_option(' -w ', ' --raw ', action=' store_true ', default=False, help=' raw output for data and filters ')
    oParser.add_option(' -a ', ' --stats ', action=' store_true ', default=False, help=' display stats for pdf document ')
    oParser.add_option(' -t ', ' --type ', help=' type of indirect object to select ')
    oParser.add_option(' -O ', ' --objstm ', action=' store_true ', default=False, help=' parse stream of /ObjStm objects ')
    oParser.add_option(' -v ', ' --verbose ', action=' store_true ', default=False, help=' display malformed PDF elements ')
    oParser.add_option(' -x ', ' --extract ', help=' filename to extract malformed content to ')
    oParser.add_option(' -H ', ' --hash ', action=' store_true ', default=False, help=' display hash of objects ')
    oParser.add_option(' -n ', ' --nocanonicalizedoutput ', action=' store_true ', default=False, help=' do not canonicalize the output ')
    oParser.add_option(' -d ', ' --dump ', help=' filename to dump stream content to ')
    oParser.add_option(' -D ', ' --debug ', action=' store_true ', default=False, help=' display debug info ')
    oParser.add_option(' -c ', ' --content ', action=' store_true ', default=False, help=' display the content for objects without streams or with streams without filters ')
    oParser.add_option(' --searchstream ', help=' string to search in streams ')
    oParser.add_option(' --unfiltered ', action=' store_true ', default=False, help=' search in unfiltered streams ')
    oParser.add_option(' --casesensitive ', action=' store_true ', default=False, help=' case sensitive search in streams ')
    oParser.add_option(' --regex ', action=' store_true ', default=False, help=' use regex to search in streams ')
    oParser.add_option(' --overridingfilters ', type=str, default=' ', help=' override filters with given filters (use raw for the raw stream content) ')
    oParser.add_option(' -g ', ' --generate ', action=' store_true ', default=False, help=' generate a Python program that creates the parsed PDF file ')
    oParser.add_option(' --generateembedded ', type=int, default=0, help=' generate a Python program that embeds the selected indirect object as a file ')
    oParser.add_option(' -y ', ' --yara ', help=' YARA rule (or directory or @file) to check streams (can be used with option --unfiltered) ')
    oParser.add_option(' --yarastrings ', action=' store_true ', default=False, help=' Print YARA strings ')
    oParser.add_option(' --decoders ', type=str, default=' ', help=' decoders to load (separate decoders with a comma , ; @file supported) ')
    oParser.add_option(' --decoderoptions ', type=str, default=' ', help=' options for the decoder ')
    oParser.add_option(' -k ', ' --key ', help=' key to search in dictionaries ')
    (options, args) = oParser.parse_args(GetArguments())

    if options.man:
        oParser.print_help()
        PrintManual()
        return 0

    # exactly one input file is required; anything else prints the usage text
    if len(args) != 1:
        oParser.print_help()
        print(' ')
        print(' %s ' % __description__)
        print(' Source code put in the public domain by Didier Stevens, no Copyright ')
        print(' Use at your own risk ')
        print(' https://DidierStevens.com ')

    else:
        decoders = []
        LoadDecoders(options.decoders, True)

        oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
        # counters and per-type / per-keyword object lists for --stats
        cntComment = 0
        cntXref = 0
        cntTrailer = 0
        cntStartXref = 0
        cntIndirectObject = 0
        dicObjectTypes = {}

        # built-in keyword list, extended with the keywords from the INI file
        keywords = [' /JS ', ' /JavaScript ', ' /AA ', ' /OpenAction ', ' /AcroForm ', ' /RichMedia ', ' /Launch ', ' /EmbeddedFile ', ' /XFA ', ' /URI ']
        for extrakeyword in ParseINIFile():
            if not extrakeyword in keywords:
                keywords.append(extrakeyword)

        # dKeywords = {keyword: [] for keyword in keywords}
        # Done for compatibility with 2.6.6
        dKeywords = {}
        for keyword in keywords:
            dKeywords[keyword] = []

        # which element types are printed; set from --elements or derived from the other options
        selectComment = False
        selectXref = False
        selectTrailer = False
        selectStartXref = False
        selectIndirectObject = False
        if options.elements:
            for c in options.elements:
                if c == ' c ':
                    selectComment = True
                elif c == ' x ':
                    selectXref = True
                elif c == ' t ':
                    selectTrailer = True
                elif c == ' s ':
                    selectStartXref = True
                elif c == ' i ':
                    selectIndirectObject = True
                else:
                    print(' Error: unknown --elements value %s ' % c)
                    return
        else:
            selectIndirectObject = True
            # without any selection option every element type is shown
            if not options.search and not options.object and not options.reference and not options.type and not options.searchstream and not options.key:
                selectComment = True
                selectXref = True
                selectTrailer = True
                selectStartXref = True
            # these selections are also applied to the trailer
            if options.search or options.key or options.reference:
                selectTrailer = True

        if options.type == ' - ':
            optionsType = ' '
        else:
            optionsType = options.type

        # header of the generated Python program
        if options.generate or options.generateembedded != 0:
            savedRoot = [' 1 ', ' 0 ', ' R ']
            print(' #!/usr/bin/python ')
            print(' ')
            print(' " " " ')
            print(' ')
            print(' Program generated by pdf-parser.py by Didier Stevens ')
            print(' https://DidierStevens.com ')
            print(' Use at your own risk ')
            print(' ')
            print(' Input PDF file: %s ' % args[0])
            print(' This Python program was created on: %s ' % Timestamp())
            print(' ')
            print(' " " " ')
            print(' ')
            print(' import mPDF ')
            print(' import sys ')
            print(' ')
            print(' def Main(): ')
            print(' if len(sys.argv) != 2: ')
            print(" print( ' Usage: %s pdf-file ' % s ys.argv[0]) ")
            print(' return ')
            print(' oPDF = mPDF.cPDF(sys.argv[1]) ')

        # fixed skeleton of the document that embeds the selected object as object 8
        if options.generateembedded != 0:
            print(" oPDF.header( ' 1.1 ' ) ")
            print(r" oPDF.comment( ' \ xd0 \ xd0 \ xd0 \ xd0 ' ) ")
            print(r" oPDF.indirectobject(1, 0, ' << \ r \ n /Type /Catalog \ r \ n /Outlines 2 0 R \ r \ n /Pages 3 0 R \ r \ n /Names << /EmbeddedFiles << /Names [(test.bin) 7 0 R] >> >> \ r \ n>> ' ) ")
            print(r" oPDF.indirectobject(2, 0, ' << \ r \ n /Type /Outlines \ r \ n /Count 0 \ r \ n>> ' ) ")
            print(r" oPDF.indirectobject(3, 0, ' << \ r \ n /Type /Pages \ r \ n /Kids [4 0 R] \ r \ n /Count 1 \ r \ n>> ' ) ")
            print(r" oPDF.indirectobject(4, 0, ' << \ r \ n /Type /Page \ r \ n /Parent 3 0 R \ r \ n /MediaBox [0 0 612 792] \ r \ n /Contents 5 0 R \ r \ n /Resources << \ r \ n /ProcSet [/PDF /Text] \ r \ n /Font << /F1 6 0 R >> \ r \ n >> \ r \ n>> ' ) ")
            print(r" oPDF.stream(5, 0, ' BT /F1 12 Tf 70 700 Td 15 TL (This PDF document embeds file test.bin) Tj ET ' , ' << /Length %d >> ' ) ")
            print(r" oPDF.indirectobject(6, 0, ' << \ r \ n /Type /Font \ r \ n /Subtype /Type1 \ r \ n /Name /F1 \ r \ n /BaseFont /Helvetica \ r \ n /Encoding /MacRomanEncoding \ r \ n>> ' ) ")
            print(r" oPDF.indirectobject(7, 0, ' << \ r \ n /Type /Filespec \ r \ n /F (test.bin) \ r \ n /EF << /F 8 0 R >> \ r \ n>> ' ) ")

        if options.yara != None:
            if not ' yara ' in sys.modules:
                print(' Error: option yara requires the YARA Python module. ')
                return
            rules = YARACompile(options.yara)

        # second parser, active while the objects inside an /ObjStm stream are being read
        oPDFParserOBJSTM = None
        while True:
            if oPDFParserOBJSTM == None:
                object = oPDFParser.GetObject()
            else:
                object = oPDFParserOBJSTM.GetObject()
                if object == None:
                    # object stream exhausted: continue with the main document
                    oPDFParserOBJSTM = None
                    object = oPDFParser.GetObject()
            if options.objstm and hasattr(object, ' GetType ') and EqualCanonical(object.GetType(), ' /ObjStm ') and object.ContainsStream():
                # parsing objects inside an /ObjStm object by extracting & parsing the stream content to create a synthesized PDF document, that is then parsed by cPDFParser
                oPDFParseDictionary = cPDFParseDictionary(object.ContainsStream(), options.nocanonicalizedoutput)
                numberOfObjects = int(oPDFParseDictionary.Get(' /N ')[0])
                offsetFirstObject = int(oPDFParseDictionary.Get(' /First ')[0])
                # the index holds (object number, offset) pairs
                indexes = list(map(int, C2SIP3(object.Stream())[:offsetFirstObject].strip().split(' ')))
                if len(indexes) % 2 != 0 or len(indexes) / 2 != numberOfObjects:
                    raise Exception(' Error in index of /ObjStm stream ')
                streamObject = C2SIP3(object.Stream()[offsetFirstObject:])
                synthesizedPDF = ' '
                while len(indexes) > 0:
                    objectNumber = indexes[0]
                    offset = indexes[1]
                    indexes = indexes[2:]
                    # an object runs up to the offset of the next one, the last one up to the end
                    if len(indexes) >= 2:
                        offsetNextObject = indexes[1]
                    else:
                        offsetNextObject = len(streamObject)
                    synthesizedPDF += ' %d 0 obj \n %s \n endobj \n ' % (objectNumber, streamObject[offset:offsetNextObject])
                oPDFParserOBJSTM = cPDFParser(StringIO(synthesizedPDF), options.verbose, options.extract, (object.id, object.version))
            if object != None:
                if options.stats:
                    # statistics mode: only count, print nothing per element
                    if object.type == PDF_ELEMENT_COMMENT:
                        cntComment += 1
                    elif object.type == PDF_ELEMENT_XREF:
                        cntXref += 1
                    elif object.type == PDF_ELEMENT_TRAILER:
                        cntTrailer += 1
                    elif object.type == PDF_ELEMENT_STARTXREF:
                        cntStartXref += 1
                    elif object.type == PDF_ELEMENT_INDIRECT_OBJECT:
                        cntIndirectObject += 1
                        type1 = object.GetType()
                        if not type1 in dicObjectTypes:
                            dicObjectTypes[type1] = [object.id]
                        else:
                            dicObjectTypes[type1].append(object.id)
                        for keyword in dKeywords.keys():
                            if object.ContainsName(keyword):
                                dKeywords[keyword].append(object.id)
                else:
                    if object.type == PDF_ELEMENT_COMMENT and selectComment:
                        if options.generate:
                            comment = object.comment[1:].rstrip()
                            if re.match(' PDF- \ d \ . \ d ', comment):
                                print(" oPDF.header( ' %s ' ) " % comment[4:])
                            elif comment != ' %E OF ':
                                print(' oPDF.comment( %s ) ' % repr(comment))
                        elif options.yara == None and options.generateembedded == 0:
                            print(' PDF Comment %s ' % FormatOutput(object.comment, options.raw))
                            print(' ')
                    elif object.type == PDF_ELEMENT_XREF and selectXref:
                        if not options.generate and options.yara == None and options.generateembedded == 0:
                            if options.debug:
                                print(' xref %s ' % FormatOutput(object.content, options.raw))
                            else:
                                print(' xref ')
                            print(' ')
                    elif object.type == PDF_ELEMENT_TRAILER and selectTrailer:
                        oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput)
                        if options.generate:
                            # remember /Root for the xrefAndTrailer call of the generated program
                            result = oPDFParseDictionary.Get(' /Root ')
                            if result != None:
                                savedRoot = result
                        elif options.yara == None and options.generateembedded == 0:
                            if not options.search and not options.key and not options.reference or options.search and object.Contains(options.search):
                                if oPDFParseDictionary == None:
                                    print(' trailer %s ' % FormatOutput(object.content, options.raw))
                                else:
                                    print(' trailer ')
                                    oPDFParseDictionary.PrettyPrint(' ')
                                print(' ')
                            elif options.key:
                                if oPDFParseDictionary.parsed != None:
                                    result = oPDFParseDictionary.GetNested(options.key)
                                    if result != None:
                                        print(result)
                            elif options.reference:
                                for key, value in oPDFParseDictionary.Retrieve():
                                    if value == [str(options.reference), ' 0 ', ' R ']:
                                        print(' trailer ')
                                        oPDFParseDictionary.PrettyPrint(' ')
                    elif object.type == PDF_ELEMENT_STARTXREF and selectStartXref:
                        if not options.generate and options.yara == None and options.generateembedded == 0:
                            print(' startxref %d ' % object.index)
                            print(' ')
                    elif object.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
                        # the selection options are tried in this fixed order; only the first one set is applied
                        if options.search:
                            if object.Contains(options.search):
                                PrintObject(object, options)
                        elif options.key:
                            contentDictionary = object.ContainsStream()
                            if not contentDictionary:
                                contentDictionary = object.content[1:]
                            oPDFParseDictionary = cPDFParseDictionary(contentDictionary, options.nocanonicalizedoutput)
                            if oPDFParseDictionary.parsed != None:
                                result = oPDFParseDictionary.GetNested(options.key)
                                if result != None:
                                    print(result)
                        elif options.object:
                            if MatchObjectID(object.id, options.object):
                                PrintObject(object, options)
                        elif options.reference:
                            if object.References(options.reference):
                                PrintObject(object, options)
                        elif options.type:
                            if EqualCanonical(object.GetType(), optionsType):
                                PrintObject(object, options)
                        elif options.hash:
                            print(' obj %d %d ' % (object.id, object.version))
                            rawContent = FormatOutput(object.content, True)
                            print(' len: %d md5: %s ' % (len(rawContent), hashlib.md5(rawContent).hexdigest()))
                            print(' ')
                        elif options.searchstream:
                            if object.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex, options.overridingfilters):
                                PrintObject(object, options)
                        elif options.yara != None:
                            results = object.StreamYARAMatch(rules, decoders, options.decoderoptions, not options.unfiltered, options.overridingfilters)
                            if results != None and results != []:
                                for result in results:
                                    for yaraResult in result[1]:
                                        print(' YARA rule %s : %s ( %s ) ' % (IFF(result[0] == ' ', ' ', ' (stream decoder: %s ) ' % result[0]), yaraResult.rule, yaraResult.namespace))
                                        if options.yarastrings:
                                            for stringdata in yaraResult.strings:
                                                print(' %06x %s : ' % (stringdata[0], stringdata[1]))
                                                print(' %s ' % binascii.hexlify(C2BIP3(stringdata[2])))
                                                print(' %s ' % repr(stringdata[2]))
                                # NOTE(review): placed after the loop over the results (object printed
                                # once per matching object) -- confirm the nesting level
                                PrintObject(object, options)
                        elif options.generateembedded != 0:
                            if object.id == options.generateembedded:
                                PrintGenerateObject(object, options, 8)
                        else:
                            PrintObject(object, options)
                    elif object.type == PDF_ELEMENT_MALFORMED:
                        # write the malformed content to the --extract file; failures are only reported
                        try:
                            fExtract = open(options.extract, ' wb ')
                            try:
                                fExtract.write(C2BIP3(object.content))
                            except:
                                print(' Error writing file %s ' % options.extract)
                            fExtract.close()
                        except:
                            print(' Error writing file %s ' % options.extract)
            else:
                # no more elements
                break

        if options.stats:
            print(' Comment: %s ' % cntComment)
            print(' XREF: %s ' % cntXref)
            print(' Trailer: %s ' % cntTrailer)
            print(' StartXref: %s ' % cntStartXref)
            print(' Indirect object: %s ' % cntIndirectObject)
            for key in sorted(dicObjectTypes.keys()):
                print(' %s %d : %s ' % (key, len(dicObjectTypes[key]), ' , '.join(map(lambda x: ' %d ' % x, dicObjectTypes[key]))))
            if sum(map(len, dKeywords.values())) > 0:
                print(' Search keywords: ')
                for keyword in keywords:
                    if len(dKeywords[keyword]) > 0:
                        print(' %s %d : %s ' % (keyword, len(dKeywords[keyword]), ' , '.join(map(lambda x: ' %d ' % x, dKeywords[keyword]))))

        # tail of the generated Python program
        if options.generate or options.generateembedded != 0:
            print(" oPDF.xrefAndTrailer( ' %s ' ) " % ' '.join(savedRoot))
            print(' ')
            print(" if __name__ == ' __main__ ' : ")
            print(' Main() ')
2015-03-15 12:25:47 +00:00
def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
    """Compare the running Python version with the supported range and report a mismatch.

    A version above __maximum_python_version__ or below __minimum_python_version__ prints a
    two-line message. When the matching enforce flag is set the message says the program does
    not work and the program exits; otherwise it is only a warning.

    NOTE(review): both cases recommend __maximum_python_version__, also when the running
    version is below the minimum -- confirm this is intended.
    """
    current = sys.version_info[0:3]

    def report(enforce):
        # print the message pair for one out-of-range condition; exit when enforcement is on
        if enforce:
            print(' This program does not work with this version of Python ( %d . %d . %d ) ' % current)
            print(' Please use Python version %d . %d . %d ' % __maximum_python_version__)
            sys.exit()
        print(' This program has not been tested with this version of Python ( %d . %d . %d ) ' % current)
        print(' Should you encounter problems, please use Python version %d . %d . %d ' % __maximum_python_version__)

    if current > __maximum_python_version__:
        report(enforceMaximumVersion)
    if current < __minimum_python_version__:
        report(enforceMinimumVersion)
# Script entry point: __name__ is exactly '__main__' when the file is executed directly.
if __name__ == '__main__':
    TestPythonVersion()
    Main()