#!/usr/bin/python
__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
__version__ = '0.7.4'
__date__ = '2019/11/05'
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 7, 5)
"""
Source code put in public domain by Didier Stevens, no Copyright
https://DidierStevens.com
Use at your own risk
History:
2008/05/02: continue
2008/05/03: continue
2008/06/02: streams
2008/10/19: refactor, grep & extract functionality
2008/10/20: reference
2008/10/21: cleanup
2008/11/12: V0.3 dictionary parser
2008/11/13: option elements
2008/11/14: continue
2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco)
2009/05/11: V0.3.1 updated usage, added --verbose and --extract
2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco)
2009/07/18: bugfix EqualCanonical
2009/07/24: V0.3.3 Added --hash option
2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput
2009/07/28: V0.3.4 Added ASCII85Decode support
2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation
2009/08/30: V0.3.6 TestPythonVersion
2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option
2010/01/09: Fixed parsing of incomplete startxref
2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option
2011/12/17: fixed bugs empty objects
2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt)
2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option
2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range
2013/02/16: V0.4.0 added http/https support; added error handling for missing file or URL; added support for ZIP file with password 'infected'
2013/03/13: V0.4.1 fixes for Python 3
2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
2013/09/18: V0.4.3 fixed regression bug -w option
2014/09/25: V0.5.0 added option -g
2014/09/29: Added PrintGenerateObject and PrintOutputObject
2014/12/05: V0.6.0 Added YARA support
2014/12/09: cleanup, refactoring
2014/12/13: Python 3 fixes
2015/01/11: Added support for multiple YARA rule files; added request to search in trailer
2015/01/31: V0.6.1 Added option yarastrings
2015/02/09: Added decoders
2015/04/05: V0.6.2 Added generateembedded
2015/04/06: fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace; fixed prettyprint bug
2015/04/24: V0.6.3 when option dump's filename is -, content is dumped to stdout
2015/08/12: V0.6.4 option hash now also calculates hashes of streams when selecting or searching objects; and displays hexasciidump first line
2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me
2016/11/20: V0.6.6 added workaround zlib errors FlateDecode
2016/12/17: V0.6.7 added option -k
2017/01/07: V0.6.8 changed cPDFParseDictionary to handle strings () with % character
2017/10/28: fixed bug
2017/10/29: added # support for option -y
2018/06/29: V0.6.9 added option --overridingfilters
2018/10/20: added keywords to statistics
2019/02/22: V0.7.0 added option -O --objstm to parse the stream of /ObjStm objects, inspired by a contributor wishing anonymity
2019/03/01: V0.7.1 added ContainsName for correct keyword statistics (-a)
2019/04/12: V0.7.2 Python 2.6.6 compatibility fix
2019/07/30: bug fixes (including fixes Josef Hinteregger)
2019/09/26: V0.7.3 added multiple id selection to option -o; added man page (-m); added environment variable PDFPARSER_OPTIONS; bug fixes
2019/11/05: V0.7.4 fixed plugin path when compiled with pyinstaller, replaced eval with int
Todo:
- handle printf todo
- support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir
"""
import re
import optparse
import zlib
import binascii
import hashlib
import sys
import zipfile
import time
import os
import textwrap
if sys.version_info[0] >= 3:
from io import StringIO
import urllib.request
urllib23 = urllib.request
import configparser as ConfigParser
else:
from cStringIO import StringIO
import urllib2
urllib23 = urllib2
import ConfigParser
try:
import yara
except:
pass
CHAR_WHITESPACE = 1
CHAR_DELIMITER = 2
CHAR_REGULAR = 3
CONTEXT_NONE = 1
CONTEXT_OBJ = 2
CONTEXT_XREF = 3
CONTEXT_TRAILER = 4
PDF_ELEMENT_COMMENT = 1
PDF_ELEMENT_INDIRECT_OBJECT = 2
PDF_ELEMENT_XREF = 3
PDF_ELEMENT_TRAILER = 4
PDF_ELEMENT_STARTXREF = 5
PDF_ELEMENT_MALFORMED = 6
dumplinelength = 16
def PrintManual():
manual = '''
Manual:
This manual is a work in progress.
There is a free PDF analysis book:
https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/
Option -o is used to select objects by id. Provide a single id or multiple ids separated by a comma (,).
When environment variable PDFPARSER_OPTIONS is defined, the options it defines are added implicitly to the command line arguments.
Use this to define options you want included with each use of pdf-parser.py.
Like option -O, to parse stream objects (/ObjStm).
By defining PDFPARSER_OPTIONS=-O, pdf-parser will always parse stream objects (when found).
PS: this feature is experimental.
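Example invocations (sample.pdf is a placeholder filename):
pdf-parser.py -a sample.pdf: print statistics and keyword hits for the document
pdf-parser.py -o 5 -f -d dump.bin sample.pdf: select object 5, apply the stream filters and dump the stream content to dump.bin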
'''
for line in manual.split('\n'):
print(textwrap.fill(line))
#Convert 2 Bytes If Python 3
def C2BIP3(string):
if sys.version_info[0] > 2:
if type(string) == bytes:
return string
else:
return bytes([ord(x) for x in string])
else:
return string
#Convert 2 String If Python 3
def C2SIP3(bytes):
if sys.version_info[0] > 2:
return ''.join([chr(byte) for byte in bytes])
else:
return bytes
# CIC: Call If Callable
def CIC(expression):
if callable(expression):
return expression()
else:
return expression
# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
if expression:
return CIC(valueTrue)
else:
return CIC(valueFalse)
def Timestamp(epoch=None):
if epoch == None:
localTime = time.localtime()
else:
localTime = time.localtime(epoch)
return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]
def CopyWithoutWhiteSpace(content):
result = []
for token in content:
if token[0] != CHAR_WHITESPACE:
result.append(token)
return result
def Obj2Str(content):
return ''.join(map(lambda x: repr(x[1])[1:-1], CopyWithoutWhiteSpace(content)))
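# cPDFDocument: byte-oriented reader for the PDF source; accepts a filename, an http(s) URL,
# a ZIP file (first entry, password 'infected') or a file-like object, and supports ungetting bytes.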
class cPDFDocument:
def __init__(self, file):
self.file = file
if type(file) != str:
self.infile = file
elif file.lower().startswith('http://') or file.lower().startswith('https://'):
try:
if sys.hexversion >= 0x020601F0:
self.infile = urllib23.urlopen(file, timeout=5)
else:
self.infile = urllib23.urlopen(file)
except urllib23.HTTPError:
print('Error accessing URL %s' % file)
print(sys.exc_info()[1])
sys.exit()
elif file.lower().endswith('.zip'):
try:
self.zipfile = zipfile.ZipFile(file, 'r')
self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
except:
print('Error opening file %s' % file)
print(sys.exc_info()[1])
sys.exit()
else:
try:
self.infile = open(file, 'rb')
except:
print('Error opening file %s' % file)
print(sys.exc_info()[1])
sys.exit()
self.ungetted = []
self.position = -1
def byte(self):
if len(self.ungetted) != 0:
self.position += 1
return self.ungetted.pop()
inbyte = self.infile.read(1)
if not inbyte or inbyte == '':
self.infile.close()
return None
self.position += 1
return ord(inbyte)
def unget(self, byte):
self.position -= 1
self.ungetted.append(byte)
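# Classify a byte according to the PDF specification: NUL, TAB, LF, FF, CR and space are whitespace;
# ( ) < > [ ] { } / % are delimiters; everything else is a regular character.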
def CharacterClass(byte):
if byte == 0 or byte == 9 or byte == 10 or byte == 12 or byte == 13 or byte == 32:
return CHAR_WHITESPACE
if byte == 0x28 or byte == 0x29 or byte == 0x3C or byte == 0x3E or byte == 0x5B or byte == 0x5D or byte == 0x7B or byte == 0x7D or byte == 0x2F or byte == 0x25:
return CHAR_DELIMITER
return CHAR_REGULAR
def IsNumeric(str):
return re.match('^[0-9]+', str)
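# cPDFTokenizer: groups bytes into (character class, string) tokens: runs of whitespace, runs of
# regular characters, << and >>, comments starting with % (up to end of line), and single delimiters.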
class cPDFTokenizer:
def __init__(self, file):
self.oPDF = cPDFDocument(file)
self.ungetted = []
def Token(self):
if len(self.ungetted) != 0:
return self.ungetted.pop()
if self.oPDF == None:
return None
self.byte = self.oPDF.byte()
if self.byte == None:
self.oPDF = None
return None
elif CharacterClass(self.byte) == CHAR_WHITESPACE:
file_str = StringIO()
while self.byte != None and CharacterClass(self.byte) == CHAR_WHITESPACE:
file_str.write(chr(self.byte))
self.byte = self.oPDF.byte()
if self.byte != None:
self.oPDF.unget(self.byte)
else:
self.oPDF = None
self.token = file_str.getvalue()
return (CHAR_WHITESPACE, self.token)
elif CharacterClass(self.byte) == CHAR_REGULAR:
file_str = StringIO()
while self.byte != None and CharacterClass(self.byte) == CHAR_REGULAR:
file_str.write(chr(self.byte))
self.byte = self.oPDF.byte()
if self.byte != None:
self.oPDF.unget(self.byte)
else:
self.oPDF = None
self.token = file_str.getvalue()
return (CHAR_REGULAR, self.token)
else:
if self.byte == 0x3C:
self.byte = self.oPDF.byte()
if self.byte == 0x3C:
return (CHAR_DELIMITER, '<<')
else:
self.oPDF.unget(self.byte)
return (CHAR_DELIMITER, '<')
elif self.byte == 0x3E:
self.byte = self.oPDF.byte()
if self.byte == 0x3E:
return (CHAR_DELIMITER, '>>')
else:
self.oPDF.unget(self.byte)
return (CHAR_DELIMITER, '>')
elif self.byte == 0x25:
file_str = StringIO()
while self.byte != None:
file_str.write(chr(self.byte))
if self.byte == 10 or self.byte == 13:
self.byte = self.oPDF.byte()
break
self.byte = self.oPDF.byte()
if self.byte != None:
if self.byte == 10:
file_str.write(chr(self.byte))
else:
self.oPDF.unget(self.byte)
else:
self.oPDF = None
self.token = file_str.getvalue()
return (CHAR_DELIMITER, self.token)
return (CHAR_DELIMITER, chr(self.byte))
def TokenIgnoreWhiteSpace(self):
token = self.Token()
while token != None and token[0] == CHAR_WHITESPACE:
token = self.Token()
return token
def Tokens(self):
tokens = []
token = self.Token()
while token != None:
tokens.append(token)
token = self.Token()
return tokens
def unget(self, byte):
self.ungetted.append(byte)
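# cPDFParser: state machine that assembles tokens into PDF elements: comments, indirect objects
# (n m obj ... endobj), xref tables, trailers and startxref; with option extract, unrecognized
# content is returned as a malformed element.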
class cPDFParser:
def __init__(self, file, verbose=False, extract=None, objstm=None):
self.context = CONTEXT_NONE
self.content = []
self.oPDFTokenizer = cPDFTokenizer(file)
self.verbose = verbose
self.extract = extract
self.objstm = objstm
def GetObject(self):
while True:
if self.context == CONTEXT_OBJ:
self.token = self.oPDFTokenizer.Token()
else:
self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
if self.token:
if self.token[0] == CHAR_DELIMITER:
if self.token[1][0] == '%':
if self.context == CONTEXT_OBJ:
self.content.append(self.token)
else:
return cPDFElementComment(self.token[1])
elif self.token[1] == '/':
self.token2 = self.oPDFTokenizer.Token()
if self.token2[0] == CHAR_REGULAR:
if self.context != CONTEXT_NONE:
self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
elif self.verbose:
print('todo 1: %s' % (self.token[1] + self.token2[1]))
else:
self.oPDFTokenizer.unget(self.token2)
if self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 2: %d %s' % (self.token[0], repr(self.token[1])))
elif self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 3: %d %s' % (self.token[0], repr(self.token[1])))
elif self.token[0] == CHAR_WHITESPACE:
if self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 4: %d %s' % (self.token[0], repr(self.token[1])))
else:
if self.context == CONTEXT_OBJ:
if self.token[1] == 'endobj':
self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content, self.objstm)
self.context = CONTEXT_NONE
self.content = []
return self.oPDFElementIndirectObject
else:
self.content.append(self.token)
elif self.context == CONTEXT_TRAILER:
if self.token[1] == 'startxref' or self.token[1] == 'xref':
self.oPDFElementTrailer = cPDFElementTrailer(self.content)
self.oPDFTokenizer.unget(self.token)
self.context = CONTEXT_NONE
self.content = []
return self.oPDFElementTrailer
else:
self.content.append(self.token)
elif self.context == CONTEXT_XREF:
if self.token[1] == 'trailer' or self.token[1] == 'xref':
self.oPDFElementXref = cPDFElementXref(self.content)
self.oPDFTokenizer.unget(self.token)
self.context = CONTEXT_NONE
self.content = []
return self.oPDFElementXref
else:
self.content.append(self.token)
else:
if IsNumeric(self.token[1]):
self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
if IsNumeric(self.token2[1]):
self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
if self.token3[1] == 'obj':
self.objectId = int(self.token[1], 10)
self.objectVersion = int(self.token2[1], 10)
self.context = CONTEXT_OBJ
else:
self.oPDFTokenizer.unget(self.token3)
self.oPDFTokenizer.unget(self.token2)
if self.verbose:
print('todo 6: %d %s' % (self.token[0], repr(self.token[1])))
else:
self.oPDFTokenizer.unget(self.token2)
if self.verbose:
print('todo 7: %d %s' % (self.token[0], repr(self.token[1])))
elif self.token[1] == 'trailer':
self.context = CONTEXT_TRAILER
self.content = [self.token]
elif self.token[1] == 'xref':
self.context = CONTEXT_XREF
self.content = [self.token]
elif self.token[1] == 'startxref':
self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
if self.token2 and IsNumeric(self.token2[1]):
return cPDFElementStartxref(int(self.token2[1], 10))
else:
self.oPDFTokenizer.unget(self.token2)
if self.verbose:
print('todo 9: %d %s' % (self.token[0], repr(self.token[1])))
elif self.extract:
self.bytes = ''
while self.token:
self.bytes += self.token[1]
self.token = self.oPDFTokenizer.Token()
return cPDFElementMalformed(self.bytes)
elif self.verbose:
print('todo 10: %d %s' % (self.token[0], repr(self.token[1])))
else:
break
class cPDFElementComment:
def __init__(self, comment):
self.type = PDF_ELEMENT_COMMENT
self.comment = comment
# if re.match('^%PDF-[0-9]\.[0-9]', self.token[1]):
# print(repr(self.token[1]))
# elif re.match('^%%EOF', self.token[1]):
# print(repr(self.token[1]))
class cPDFElementXref:
def __init__(self, content):
self.type = PDF_ELEMENT_XREF
self.content = content
class cPDFElementTrailer:
def __init__(self, content):
self.type = PDF_ELEMENT_TRAILER
self.content = content
def Contains(self, keyword):
data = ''
for i in range(0, len(self.content)):
if self.content[i][1] == 'stream':
break
else:
data += Canonicalize(self.content[i][1])
return data.upper().find(keyword.upper()) != -1
def IIf(expr, truepart, falsepart):
if expr:
return truepart
else:
return falsepart
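# cPDFElementIndirectObject: represents an indirect object (id, version, token list); objstm, when set,
# identifies the /ObjStm object (id, version) this object was extracted from.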
class cPDFElementIndirectObject:
def __init__(self, id, version, content, objstm=None):
self.type = PDF_ELEMENT_INDIRECT_OBJECT
self.id = id
self.version = version
self.content = content
self.objstm = objstm
#fix stream for Ghostscript bug reported by Kurt
if self.ContainsStream():
position = len(self.content) - 1
if position < 0:
return
            while position >= 0 and self.content[position][0] == CHAR_WHITESPACE: # check the bound first to avoid negative indexing
position -= 1
if position < 0:
return
if self.content[position][0] != CHAR_REGULAR:
return
if self.content[position][1] == 'endstream':
return
if not self.content[position][1].endswith('endstream'):
return
self.content = self.content[0:position] + [(self.content[position][0], self.content[position][1][:-len('endstream')])] + [(self.content[position][0], 'endstream')] + self.content[position+1:]
def GetType(self):
content = CopyWithoutWhiteSpace(self.content)
dictionary = 0
for i in range(0, len(content)):
if content[i][0] == CHAR_DELIMITER and content[i][1] == '<<':
dictionary += 1
if content[i][0] == CHAR_DELIMITER and content[i][1] == '>>':
dictionary -= 1
if dictionary == 1 and content[i][0] == CHAR_DELIMITER and EqualCanonical(content[i][1], '/Type') and i < len(content) - 1:
return content[i+1][1]
return ''
def GetReferences(self):
content = CopyWithoutWhiteSpace(self.content)
references = []
for i in range(0, len(content)):
if i > 1 and content[i][0] == CHAR_REGULAR and content[i][1] == 'R' and content[i-2][0] == CHAR_REGULAR and IsNumeric(content[i-2][1]) and content[i-1][0] == CHAR_REGULAR and IsNumeric(content[i-1][1]):
references.append((content[i-2][1], content[i-1][1], content[i][1]))
return references
def References(self, index):
for ref in self.GetReferences():
if ref[0] == index:
return True
return False
def ContainsStream(self):
for i in range(0, len(self.content)):
if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
return self.content[0:i]
return False
def Contains(self, keyword):
data = ''
for i in range(0, len(self.content)):
if self.content[i][1] == 'stream':
break
else:
data += Canonicalize(self.content[i][1])
return data.upper().find(keyword.upper()) != -1
def ContainsName(self, keyword):
for token in self.content:
if token[1] == 'stream':
return False
if token[0] == CHAR_DELIMITER and EqualCanonical(token[1], keyword):
return True
return False
def StreamContains(self, keyword, filter, casesensitive, regex, overridingfilters):
if not self.ContainsStream():
return False
streamData = self.Stream(filter, overridingfilters)
if filter and streamData == 'No filters':
streamData = self.Stream(False, overridingfilters)
if regex:
return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
elif casesensitive:
return keyword in streamData
else:
return keyword.lower() in streamData.lower()
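    # Stream: walk the object's tokens with a small state machine, record the /Filter name(s) and
    # collect the raw data between the stream and endstream keywords; when filter is True the data
    # is passed through Decompress (or through overridingfilters).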
def Stream(self, filter=True, overridingfilters=''):
state = 'start'
countDirectories = 0
data = ''
filters = []
for i in range(0, len(self.content)):
if state == 'start':
if self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '<<':
countDirectories += 1
if self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '>>':
countDirectories -= 1
if countDirectories == 1 and self.content[i][0] == CHAR_DELIMITER and EqualCanonical(self.content[i][1], '/Filter'):
state = 'filter'
elif countDirectories == 0 and self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
state = 'stream-whitespace'
elif state == 'filter':
if self.content[i][0] == CHAR_DELIMITER and self.content[i][1][0] == '/':
filters = [self.content[i][1]]
state = 'search-stream'
elif self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '[':
state = 'filter-list'
elif state == 'filter-list':
if self.content[i][0] == CHAR_DELIMITER and self.content[i][1][0] == '/':
filters.append(self.content[i][1])
elif self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == ']':
state = 'search-stream'
elif state == 'search-stream':
if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
state = 'stream-whitespace'
elif state == 'stream-whitespace':
if self.content[i][0] == CHAR_WHITESPACE:
whitespace = self.content[i][1]
if whitespace.startswith('\x0D\x0A') and len(whitespace) > 2:
data += whitespace[2:]
elif whitespace.startswith('\x0A') and len(whitespace) > 1:
data += whitespace[1:]
else:
data += self.content[i][1]
state = 'stream-concat'
elif state == 'stream-concat':
if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'endstream':
if filter:
if overridingfilters == '':
return self.Decompress(data, filters)
elif overridingfilters == 'raw':
return data
else:
return self.Decompress(data, overridingfilters.split(' '))
else:
return data
else:
data += self.content[i][1]
else:
return 'Unexpected filter state'
return filters
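    # Decompress: apply the listed filters in order; only FlateDecode, ASCIIHexDecode, ASCII85Decode,
    # LZWDecode and RunLengthDecode (and their abbreviations) are supported, other filters return an error string.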
def Decompress(self, data, filters):
for filter in filters:
if EqualCanonical(filter, '/FlateDecode') or EqualCanonical(filter, '/Fl'):
try:
data = FlateDecode(data)
except zlib.error as e:
message = 'FlateDecode decompress failed'
if len(data) > 0 and ord(data[0]) & 0x0F != 8:
message += ', unexpected compression method: %02x' % ord(data[0])
                    return message + '. zlib.error %s' % e # zlib.error has no message attribute on Python 3
elif EqualCanonical(filter, '/ASCIIHexDecode') or EqualCanonical(filter, '/AHx'):
try:
data = ASCIIHexDecode(data)
except:
return 'ASCIIHexDecode decompress failed'
elif EqualCanonical(filter, '/ASCII85Decode') or EqualCanonical(filter, '/A85'):
try:
data = ASCII85Decode(data.rstrip('>'))
except:
return 'ASCII85Decode decompress failed'
elif EqualCanonical(filter, '/LZWDecode') or EqualCanonical(filter, '/LZW'):
try:
data = LZWDecode(data)
except:
return 'LZWDecode decompress failed'
elif EqualCanonical(filter, '/RunLengthDecode') or EqualCanonical(filter, '/R'):
try:
data = RunLengthDecode(data)
except:
return 'RunLengthDecode decompress failed'
# elif i.startswith('/CC') # CCITTFaxDecode
# elif i.startswith('/DCT') # DCTDecode
else:
return 'Unsupported filter: %s' % repr(filters)
if len(filters) == 0:
return 'No filters'
else:
return data
def StreamYARAMatch(self, rules, decoders, decoderoptions, filter, overridingfilters):
if not self.ContainsStream():
return None
streamData = self.Stream(filter, overridingfilters)
if filter and streamData == 'No filters':
streamData = self.Stream(False, overridingfilters)
oDecoders = [cIdentity(streamData, None)]
for cDecoder in decoders:
try:
oDecoder = cDecoder(streamData, decoderoptions)
oDecoders.append(oDecoder)
except Exception as e:
print('Error instantiating decoder: %s' % cDecoder.name)
raise e
results = []
for oDecoder in oDecoders:
while oDecoder.Available():
yaraResults = rules.match(data=oDecoder.Decode())
if yaraResults != []:
results.append([oDecoder.Name(), yaraResults])
return results
class cPDFElementStartxref:
def __init__(self, index):
self.type = PDF_ELEMENT_STARTXREF
self.index = index
class cPDFElementMalformed:
def __init__(self, content):
self.type = PDF_ELEMENT_MALFORMED
self.content = content
def TrimLWhiteSpace(data):
while data != [] and data[0][0] == CHAR_WHITESPACE:
data = data[1:]
return data
def TrimRWhiteSpace(data):
while data != [] and data[-1][0] == CHAR_WHITESPACE:
data = data[:-1]
return data
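# cPDFParseDictionary: recursive parser for PDF dictionaries << /Key value ... >>; the result is a list
# of (key, value) tuples where nested dictionaries are parsed recursively and arrays/strings are kept as token lists.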
class cPDFParseDictionary:
def __init__(self, content, nocanonicalizedoutput):
self.content = content
self.nocanonicalizedoutput = nocanonicalizedoutput
dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
if dataTrimmed == []:
self.parsed = None
elif self.isOpenDictionary(dataTrimmed[0]) and (self.isCloseDictionary(dataTrimmed[-1]) or self.couldBeCloseDictionary(dataTrimmed[-1])):
self.parsed = self.ParseDictionary(dataTrimmed)[0]
else:
self.parsed = None
def isOpenDictionary(self, token):
return token[0] == CHAR_DELIMITER and token[1] == '<<'
def isCloseDictionary(self, token):
return token[0] == CHAR_DELIMITER and token[1] == '>>'
def couldBeCloseDictionary(self, token):
return token[0] == CHAR_DELIMITER and token[1].rstrip().endswith('>>')
def ParseDictionary(self, tokens):
state = 0 # start
dictionary = []
while tokens != []:
if state == 0:
if self.isOpenDictionary(tokens[0]):
state = 1
else:
return None, tokens
elif state == 1:
if self.isOpenDictionary(tokens[0]):
pass
elif self.isCloseDictionary(tokens[0]):
return dictionary, tokens
elif tokens[0][0] != CHAR_WHITESPACE:
key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
value = []
state = 2
elif state == 2:
if self.isOpenDictionary(tokens[0]):
value, tokens = self.ParseDictionary(tokens)
dictionary.append((key, value))
state = 1
elif self.isCloseDictionary(tokens[0]):
dictionary.append((key, value))
return dictionary, tokens
elif value == [] and tokens[0][0] == CHAR_WHITESPACE:
pass
elif value == [] and tokens[0][1] == '[':
value.append(tokens[0][1])
elif value != [] and value[0] == '[' and tokens[0][1] != ']':
value.append(tokens[0][1])
elif value != [] and value[0] == '[' and tokens[0][1] == ']':
value.append(tokens[0][1])
dictionary.append((key, value))
value = []
state = 1
elif value == [] and tokens[0][1] == '(':
value.append(tokens[0][1])
elif value != [] and value[0] == '(' and tokens[0][1] != ')':
if tokens[0][1][0] == '%':
tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:]
value.append('%')
else:
value.append(tokens[0][1])
elif value != [] and value[0] == '(' and tokens[0][1] == ')':
value.append(tokens[0][1])
balanced = 0
for item in value:
if item == '(':
balanced += 1
elif item == ')':
balanced -= 1
                    if balanced < 0 and getattr(self, 'verbose', False): # cPDFParseDictionary has no verbose attribute; guard against AttributeError
print('todo 11: ' + repr(value))
if balanced < 1:
dictionary.append((key, value))
value = []
state = 1
elif value != [] and tokens[0][1][0] == '/':
dictionary.append((key, value))
key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
value = []
state = 2
else:
value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
tokens = tokens[1:]
def Retrieve(self):
return self.parsed
def PrettyPrintSubElement(self, prefix, e):
if e[1] == []:
print('%s %s' % (prefix, e[0]))
elif type(e[1][0]) == type(''):
if len(e[1]) == 3 and IsNumeric(e[1][0]) and e[1][1] == '0' and e[1][2] == 'R':
joiner = ' '
else:
joiner = ''
value = joiner.join(e[1]).strip()
reprValue = repr(value)
if "'" + value + "'" != reprValue:
value = reprValue
print('%s %s %s' % (prefix, e[0], value))
else:
print('%s %s' % (prefix, e[0]))
self.PrettyPrintSub(prefix + ' ', e[1])
def PrettyPrintSub(self, prefix, dictionary):
if dictionary != None:
print('%s<<' % prefix)
for e in dictionary:
self.PrettyPrintSubElement(prefix, e)
print('%s>>' % prefix)
def PrettyPrint(self, prefix):
self.PrettyPrintSub(prefix, self.parsed)
def Get(self, select):
for key, value in self.parsed:
if key == select:
return value
return None
def GetNestedSub(self, dictionary, select):
for key, value in dictionary:
if key == select:
return self.PrettyPrintSubElement('', [select, value])
if type(value) == type([]) and len(value) > 0 and type(value[0]) == type((None,)):
result = self.GetNestedSub(value, select)
                if result != None:
return self.PrettyPrintSubElement('', [select, result])
return None
def GetNested(self, select):
return self.GetNestedSub(self.parsed, select)
def FormatOutput(data, raw):
if raw:
if type(data) == type([]):
return ''.join(map(lambda x: x[1], data))
else:
return data
elif sys.version_info[0] > 2:
return ascii(data)
else:
return repr(data)
#Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
if sys.version_info[0] > 2:
sys.stdout.buffer.write(data)
else:
while data != '':
sys.stdout.write(data[0:10000])
try:
sys.stdout.flush()
except IOError:
return
data = data[10000:]
def IfWIN32SetBinary(io):
if sys.platform == 'win32':
import msvcrt
msvcrt.setmode(io.fileno(), os.O_BINARY)
def PrintOutputObject(object, options):
if options.dump == '-':
filtered = object.Stream(options.filter == True, options.overridingfilters)
if filtered == []:
filtered = ''
IfWIN32SetBinary(sys.stdout)
StdoutWriteChunked(filtered)
return
print('obj %d %d' % (object.id, object.version))
if object.objstm != None:
print(' Containing /ObjStm: %d %d' % object.objstm)
print(' Type: %s' % ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput))
print(' Referencing: %s' % ', '.join(map(lambda x: '%s %s %s' % x, object.GetReferences())))
dataPrecedingStream = object.ContainsStream()
oPDFParseDictionary = None
if dataPrecedingStream:
print(' Contains stream')
if options.debug:
print(' %s' % FormatOutput(dataPrecedingStream, options.raw))
oPDFParseDictionary = cPDFParseDictionary(dataPrecedingStream, options.nocanonicalizedoutput)
if options.hash:
streamContent = object.Stream(False, options.overridingfilters)
print(' unfiltered')
print(' len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
print(' %s' % HexAsciiDumpLine(streamContent))
streamContent = object.Stream(True, options.overridingfilters)
print(' filtered')
print(' len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
print(' %s' % HexAsciiDumpLine(streamContent))
streamContent = None
else:
if options.debug or options.raw:
print(' %s' % FormatOutput(object.content, options.raw))
oPDFParseDictionary = cPDFParseDictionary(object.content, options.nocanonicalizedoutput)
print('')
oPDFParseDictionary.PrettyPrint(' ')
print('')
if options.filter and not options.dump:
filtered = object.Stream(overridingfilters=options.overridingfilters)
if filtered == []:
print(' %s' % FormatOutput(object.content, options.raw))
else:
print(' %s' % FormatOutput(filtered, options.raw))
if options.content:
if object.ContainsStream():
stream = object.Stream(False, options.overridingfilters)
if stream != []:
print(' %s' % FormatOutput(stream, options.raw))
else:
print(''.join([token[1] for token in object.content]))
if options.dump:
filtered = object.Stream(options.filter == True, options.overridingfilters)
if filtered == []:
filtered = ''
try:
fDump = open(options.dump, 'wb')
try:
fDump.write(C2BIP3(filtered))
except:
print('Error writing file %s' % options.dump)
fDump.close()
except:
print('Error writing file %s' % options.dump)
print('')
return
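# Canonicalize: resolve #xx hexadecimal escapes in PDF names, e.g. /J#61vaScript becomes /JavaScript;
# used to defeat name obfuscation when matching keywords.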
def Canonicalize(sIn):
if sIn == '':
return sIn
elif sIn[0] != '/':
return sIn
elif sIn.find('#') == -1:
return sIn
else:
i = 0
iLen = len(sIn)
sCanonical = ''
while i < iLen:
if sIn[i] == '#' and i < iLen - 2:
try:
sCanonical += chr(int(sIn[i+1:i+3], 16))
i += 2
except:
sCanonical += sIn[i]
else:
sCanonical += sIn[i]
i += 1
return sCanonical
def EqualCanonical(s1, s2):
return Canonicalize(s1) == s2
def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
if nocanonicalizedoutput:
return sIn
else:
return Canonicalize(sIn)
# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
def ASCII85Decode(data):
import struct
n = b = 0
out = ''
for c in data:
if '!' <= c and c <= 'u':
n += 1
b = b*85+(ord(c)-33)
if n == 5:
out += struct.pack('>L',b)
n = b = 0
elif c == 'z':
assert n == 0
out += '\0\0\0\0'
elif c == '~':
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack('>L',b)[:n-1]
break
return out
def ASCIIHexDecode(data):
return binascii.unhexlify(''.join([c for c in data if c not in ' \t\n\r']).rstrip('>'))
# if inflating fails, we try to inflate byte per byte (sample 4da299d6e52bbb79c0ac00bad6a1d51d4d5fe42965a8d94e88a359e5277117e2)
def FlateDecode(data):
try:
return zlib.decompress(C2BIP3(data))
except:
if len(data) <= 10:
raise
oDecompress = zlib.decompressobj()
oStringIO = StringIO()
count = 0
for byte in C2BIP3(data):
try:
oStringIO.write(oDecompress.decompress(byte))
count += 1
except:
break
if len(data) - count <= 2:
return oStringIO.getvalue()
else:
raise
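# RunLengthDecode: PDF run-length scheme; a length byte below 128 copies the next length+1 bytes literally,
# a length byte above 128 repeats the next byte 257-length times, 128 marks end of data.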
def RunLengthDecode(data):
f = StringIO(data)
decompressed = ''
runLength = ord(f.read(1))
while runLength:
if runLength < 128:
decompressed += f.read(runLength + 1)
if runLength > 128:
decompressed += f.read(1) * (257 - runLength)
if runLength == 128:
break
runLength = ord(f.read(1))
# return sub(r'(\d+)(\D)', lambda m: m.group(2) * int(m.group(1)), data)
return decompressed
#### LZW code sourced from pdfminer
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
class LZWDecoder(object):
def __init__(self, fp):
self.fp = fp
self.buff = 0
self.bpos = 8
self.nbits = 9
self.table = None
self.prevbuf = None
return
def readbits(self, bits):
v = 0
while 1:
# the number of remaining bits we can get from the current buffer.
r = 8-self.bpos
if bits <= r:
# |-----8-bits-----|
# |-bpos-|-bits-| |
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
self.bpos += bits
break
else:
# |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in range(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
pass
elif not self.prevbuf:
x = self.prevbuf = self.table[code]
else:
if code < len(self.table):
x = self.table[code]
self.table.append(self.prevbuf+x[0])
else:
self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code]
l = len(self.table)
if l == 511:
self.nbits = 10
elif l == 1023:
self.nbits = 11
elif l == 2047:
self.nbits = 12
self.prevbuf = x
return x
def run(self):
while 1:
try:
code = self.readbits(self.nbits)
except EOFError:
break
x = self.feed(code)
yield x
return
####
def LZWDecode(data):
return ''.join(LZWDecoder(StringIO(data)).run())
def PrintGenerateObject(object, options, newId=None):
if newId == None:
objectId = object.id
else:
objectId = newId
dataPrecedingStream = object.ContainsStream()
if dataPrecedingStream:
if options.filter:
decompressed = object.Stream(True, options.overridingfilters)
if decompressed == 'No filters' or decompressed.startswith('Unsupported filter: '):
print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
else:
dictionary = FormatOutput(dataPrecedingStream, True)
dictionary = re.sub(r'/Length\s+\d+', '', dictionary)
dictionary = re.sub(r'/Filter\s*/[a-zA-Z0-9]+', '', dictionary)
dictionary = re.sub(r'/Filter\s*\[.+\]', '', dictionary)
dictionary = re.sub(r'^\s*<<', '', dictionary)
dictionary = re.sub(r'>>\s*$', '', dictionary)
dictionary = dictionary.strip()
print(" oPDF.stream2(%d, %d, %s, %s, 'f')" % (objectId, object.version, repr(decompressed.rstrip()), repr(dictionary)))
else:
print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
else:
print(' oPDF.indirectobject(%d, %d, %s)' % (objectId, object.version, repr(FormatOutput(object.content, True).strip())))
def PrintObject(object, options):
if options.generate:
PrintGenerateObject(object, options)
else:
PrintOutputObject(object, options)
def File2Strings(filename):
try:
f = open(filename, 'r')
except:
return None
try:
        return list(map(lambda line: line.rstrip('\n'), f.readlines())) # list() so the result can be concatenated on Python 3
except:
return None
finally:
f.close()
def ProcessAt(argument):
if argument.startswith('@'):
strings = File2Strings(argument[1:])
if strings == None:
raise Exception('Error reading %s' % argument)
else:
return strings
else:
return [argument]
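# YARACompile: when ruledata starts with #, it is an inline rule: #h# hex-encoded rule, #b# base64-encoded rule,
# #s# a string turned into a rule, #q# a rule with single quotes translated to double quotes, otherwise the text
# after # is the rule itself; without the # prefix, ruledata is a rule file, a directory, or an @file listing rule files.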
def YARACompile(ruledata):
if ruledata.startswith('#'):
if ruledata.startswith('#h#'):
rule = binascii.a2b_hex(ruledata[3:])
elif ruledata.startswith('#b#'):
rule = binascii.a2b_base64(ruledata[3:])
elif ruledata.startswith('#s#'):
rule = 'rule string {strings: $a = "%s" ascii wide nocase condition: $a}' % ruledata[3:]
elif ruledata.startswith('#q#'):
rule = ruledata[3:].replace("'", '"')
else:
rule = ruledata[1:]
return yara.compile(source=rule)
else:
dFilepaths = {}
if os.path.isdir(ruledata):
for root, dirs, files in os.walk(ruledata):
for file in files:
filename = os.path.join(root, file)
dFilepaths[filename] = filename
else:
for filename in ProcessAt(ruledata):
dFilepaths[filename] = filename
return yara.compile(filepaths=dFilepaths)
def AddDecoder(cClass):
global decoders
decoders.append(cClass)
class cDecoderParent():
pass
def GetScriptPath():
if getattr(sys, 'frozen', False):
return os.path.dirname(sys.executable)
else:
return os.path.dirname(sys.argv[0])
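# LoadDecoders: load decoder plugins from a comma-separated list (@file supported); a .py extension is appended
# when missing and, if the file is not found in the current directory, it is looked up next to the script.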
def LoadDecoders(decoders, verbose):
if decoders == '':
return
scriptPath = GetScriptPath()
for decoder in sum(map(ProcessAt, decoders.split(',')), []):
try:
if not decoder.lower().endswith('.py'):
decoder += '.py'
if os.path.dirname(decoder) == '':
if not os.path.exists(decoder):
scriptDecoder = os.path.join(scriptPath, decoder)
if os.path.exists(scriptDecoder):
decoder = scriptDecoder
exec(open(decoder, 'r').read(), globals(), globals())
except Exception as e:
print('Error loading decoder: %s' % decoder)
if verbose:
raise e
class cIdentity(cDecoderParent):
name = 'Identity function decoder'
def __init__(self, stream, options):
self.stream = stream
self.options = options
self.available = True
def Available(self):
return self.available
def Decode(self):
self.available = False
return self.stream
def Name(self):
return ''
def DecodeFunction(decoders, options, stream):
if decoders == []:
return stream
return decoders[0](stream, options.decoderoptions).Decode()
class cDumpStream():
def __init__(self):
self.text = ''
def Addline(self, line):
if line != '':
self.text += line + '\n'
def Content(self):
return self.text
def HexDump(data):
oDumpStream = cDumpStream()
hexDump = ''
for i, b in enumerate(data):
if i % dumplinelength == 0 and hexDump != '':
oDumpStream.Addline(hexDump)
hexDump = ''
hexDump += IFF(hexDump == '', '', ' ') + '%02X' % ord(b)
oDumpStream.Addline(hexDump)
return oDumpStream.Content()
def CombineHexAscii(hexDump, asciiDump):
if hexDump == '':
return ''
return hexDump + ' ' + (' ' * (3 * (dumplinelength - len(asciiDump)))) + asciiDump
def HexAsciiDump(data):
oDumpStream = cDumpStream()
hexDump = ''
asciiDump = ''
for i, b in enumerate(data):
if i % dumplinelength == 0:
if hexDump != '':
oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
hexDump = '%08X:' % i
asciiDump = ''
hexDump+= ' %02X' % ord(b)
asciiDump += IFF(ord(b) >= 32, b, '.')
oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
return oDumpStream.Content()
def HexAsciiDumpLine(data):
return HexAsciiDump(data[0:16])[10:-1]
def ParseINIFile():
oConfigParser = ConfigParser.ConfigParser(allow_no_value=True)
oConfigParser.optionxform = str
oConfigParser.read(os.path.join(GetScriptPath(), 'pdfid.ini'))
keywords = []
if oConfigParser.has_section('keywords'):
for key, value in oConfigParser.items('keywords'):
if not key in keywords:
keywords.append(key)
return keywords
def MatchObjectID(id, selection):
return str(id) in selection.split(',')
def GetArguments():
arguments = sys.argv[1:]
envvar = os.getenv('PDFPARSER_OPTIONS')
if envvar == None:
return arguments
return envvar.split(' ') + arguments
def Main():
"""pdf-parser, use it to parse a PDF document
"""
global decoders
oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__)
oParser.add_option('-m', '--man', action='store_true', default=False, help='Print manual')
oParser.add_option('-s', '--search', help='string to search in indirect objects (except streams)')
oParser.add_option('-f', '--filter', action='store_true', default=False, help='pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only)')
oParser.add_option('-o', '--object', help='id(s) of indirect object(s) to select, use comma (,) to separate ids (version independent)')
oParser.add_option('-r', '--reference', help='id of indirect object being referenced (version independent)')
oParser.add_option('-e', '--elements', help='type of elements to select (cxtsi)')
oParser.add_option('-w', '--raw', action='store_true', default=False, help='raw output for data and filters')
oParser.add_option('-a', '--stats', action='store_true', default=False, help='display stats for pdf document')
oParser.add_option('-t', '--type', help='type of indirect object to select')
oParser.add_option('-O', '--objstm', action='store_true', default=False, help='parse stream of /ObjStm objects')
oParser.add_option('-v', '--verbose', action='store_true', default=False, help='display malformed PDF elements')
oParser.add_option('-x', '--extract', help='filename to extract malformed content to')
oParser.add_option('-H', '--hash', action='store_true', default=False, help='display hash of objects')
oParser.add_option('-n', '--nocanonicalizedoutput', action='store_true', default=False, help='do not canonicalize the output')
oParser.add_option('-d', '--dump', help='filename to dump stream content to')
oParser.add_option('-D', '--debug', action='store_true', default=False, help='display debug info')
oParser.add_option('-c', '--content', action='store_true', default=False, help='display the content for objects without streams or with streams without filters')
oParser.add_option('--searchstream', help='string to search in streams')
oParser.add_option('--unfiltered', action='store_true', default=False, help='search in unfiltered streams')
oParser.add_option('--casesensitive', action='store_true', default=False, help='case sensitive search in streams')
oParser.add_option('--regex', action='store_true', default=False, help='use regex to search in streams')
oParser.add_option('--overridingfilters', type=str, default='', help='override filters with given filters (use raw for the raw stream content)')
oParser.add_option('-g', '--generate', action='store_true', default=False, help='generate a Python program that creates the parsed PDF file')
oParser.add_option('--generateembedded', type=int, default=0, help='generate a Python program that embeds the selected indirect object as a file')
oParser.add_option('-y', '--yara', help='YARA rule (or directory or @file) to check streams (can be used with option --unfiltered)')
oParser.add_option('--yarastrings', action='store_true', default=False, help='Print YARA strings')
oParser.add_option('--decoders', type=str, default='', help='decoders to load (separate decoders with a comma , ; @file supported)')
oParser.add_option('--decoderoptions', type=str, default='', help='options for the decoder')
oParser.add_option('-k', '--key', help='key to search in dictionaries')
(options, args) = oParser.parse_args(GetArguments())
if options.man:
oParser.print_help()
PrintManual()
return 0
if len(args) != 1:
oParser.print_help()
print('')
print(' %s' % __description__)
print(' Source code put in the public domain by Didier Stevens, no Copyright')
print(' Use at your own risk')
print(' https://DidierStevens.com')
else:
decoders = []
LoadDecoders(options.decoders, True)
oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
cntComment = 0
cntXref = 0
cntTrailer = 0
cntStartXref = 0
cntIndirectObject = 0
dicObjectTypes = {}
keywords = ['/JS', '/JavaScript', '/AA', '/OpenAction', '/AcroForm', '/RichMedia', '/Launch', '/EmbeddedFile', '/XFA', '/URI']
for extrakeyword in ParseINIFile():
if not extrakeyword in keywords:
keywords.append(extrakeyword)
# dKeywords = {keyword: [] for keyword in keywords}
# Done for compatibility with 2.6.6
dKeywords = {}
for keyword in keywords:
dKeywords[keyword] = []
selectComment = False
selectXref = False
selectTrailer = False
selectStartXref = False
selectIndirectObject = False
if options.elements:
for c in options.elements:
if c == 'c':
selectComment = True
elif c == 'x':
selectXref = True
elif c == 't':
selectTrailer = True
elif c == 's':
selectStartXref = True
elif c == 'i':
selectIndirectObject = True
else:
print('Error: unknown --elements value %s' % c)
return
else:
selectIndirectObject = True
if not options.search and not options.object and not options.reference and not options.type and not options.searchstream and not options.key:
selectComment = True
selectXref = True
selectTrailer = True
selectStartXref = True
if options.search or options.key or options.reference:
selectTrailer = True
if options.type == '-':
optionsType = ''
else:
optionsType = options.type
if options.generate or options.generateembedded != 0:
savedRoot = ['1', '0', 'R']
print('#!/usr/bin/python')
print('')
print('"""')
print('')
print('Program generated by pdf-parser.py by Didier Stevens')
print('https://DidierStevens.com')
print('Use at your own risk')
print('')
print('Input PDF file: %s' % args[0])
print('This Python program was created on: %s' % Timestamp())
print('')
print('"""')
print('')
print('import mPDF')
print('import sys')
print('')
print('def Main():')
print(' if len(sys.argv) != 2:')
print(" print('Usage: %s pdf-file' % sys.argv[0])")
print(' return')
print(' oPDF = mPDF.cPDF(sys.argv[1])')
if options.generateembedded != 0:
print(" oPDF.header('1.1')")
print(r" oPDF.comment('\xd0\xd0\xd0\xd0')")
print(r" oPDF.indirectobject(1, 0, '<<\r\n /Type /Catalog\r\n /Outlines 2 0 R\r\n /Pages 3 0 R\r\n /Names << /EmbeddedFiles << /Names [(test.bin) 7 0 R] >> >>\r\n>>')")
print(r" oPDF.indirectobject(2, 0, '<<\r\n /Type /Outlines\r\n /Count 0\r\n>>')")
print(r" oPDF.indirectobject(3, 0, '<<\r\n /Type /Pages\r\n /Kids [4 0 R]\r\n /Count 1\r\n>>')")
print(r" oPDF.indirectobject(4, 0, '<<\r\n /Type /Page\r\n /Parent 3 0 R\r\n /MediaBox [0 0 612 792]\r\n /Contents 5 0 R\r\n /Resources <<\r\n /ProcSet [/PDF /Text]\r\n /Font << /F1 6 0 R >>\r\n >>\r\n>>')")
print(r" oPDF.stream(5, 0, 'BT /F1 12 Tf 70 700 Td 15 TL (This PDF document embeds file test.bin) Tj ET', '<< /Length %d >>')")
print(r" oPDF.indirectobject(6, 0, '<<\r\n /Type /Font\r\n /Subtype /Type1\r\n /Name /F1\r\n /BaseFont /Helvetica\r\n /Encoding /MacRomanEncoding\r\n>>')")
print(r" oPDF.indirectobject(7, 0, '<<\r\n /Type /Filespec\r\n /F (test.bin)\r\n /EF << /F 8 0 R >>\r\n>>')")
if options.yara != None:
if not 'yara' in sys.modules:
print('Error: option yara requires the YARA Python module.')
return
rules = YARACompile(options.yara)
oPDFParserOBJSTM = None
while True:
if oPDFParserOBJSTM == None:
object = oPDFParser.GetObject()
else:
object = oPDFParserOBJSTM.GetObject()
if object == None:
oPDFParserOBJSTM = None
object = oPDFParser.GetObject()
if options.objstm and hasattr(object, 'GetType') and EqualCanonical(object.GetType(), '/ObjStm') and object.ContainsStream():
# parsing objects inside an /ObjStm object by extracting & parsing the stream content to create a synthesized PDF document, that is then parsed by cPDFParser
oPDFParseDictionary = cPDFParseDictionary(object.ContainsStream(), options.nocanonicalizedoutput)
numberOfObjects = int(oPDFParseDictionary.Get('/N')[0])
offsetFirstObject = int(oPDFParseDictionary.Get('/First')[0])
indexes = list(map(int, C2SIP3(object.Stream())[:offsetFirstObject].strip().split(' ')))
if len(indexes) % 2 != 0 or len(indexes) / 2 != numberOfObjects:
raise Exception('Error in index of /ObjStm stream')
streamObject = C2SIP3(object.Stream()[offsetFirstObject:])
synthesizedPDF = ''
while len(indexes) > 0:
objectNumber = indexes[0]
offset = indexes[1]
indexes = indexes[2:]
if len(indexes) >= 2:
offsetNextObject = indexes[1]
else:
offsetNextObject = len(streamObject)
synthesizedPDF += '%d 0 obj\n%s\nendobj\n' % (objectNumber, streamObject[offset:offsetNextObject])
oPDFParserOBJSTM = cPDFParser(StringIO(synthesizedPDF), options.verbose, options.extract, (object.id, object.version))
if object != None:
if options.stats:
if object.type == PDF_ELEMENT_COMMENT:
cntComment += 1
elif object.type == PDF_ELEMENT_XREF:
cntXref += 1
elif object.type == PDF_ELEMENT_TRAILER:
cntTrailer += 1
elif object.type == PDF_ELEMENT_STARTXREF:
cntStartXref += 1
elif object.type == PDF_ELEMENT_INDIRECT_OBJECT:
cntIndirectObject += 1
type1 = object.GetType()
if not type1 in dicObjectTypes:
dicObjectTypes[type1] = [object.id]
else:
dicObjectTypes[type1].append(object.id)
for keyword in dKeywords.keys():
if object.ContainsName(keyword):
dKeywords[keyword].append(object.id)
else:
if object.type == PDF_ELEMENT_COMMENT and selectComment:
if options.generate:
comment = object.comment[1:].rstrip()
if re.match('PDF-\d\.\d', comment):
print(" oPDF.header('%s')" % comment[4:])
elif comment != '%EOF':
print(' oPDF.comment(%s)' % repr(comment))
elif options.yara == None and options.generateembedded == 0:
print('PDF Comment %s' % FormatOutput(object.comment, options.raw))
print('')
elif object.type == PDF_ELEMENT_XREF and selectXref:
if not options.generate and options.yara == None and options.generateembedded == 0:
if options.debug:
print('xref %s' % FormatOutput(object.content, options.raw))
else:
print('xref')
print('')
elif object.type == PDF_ELEMENT_TRAILER and selectTrailer:
oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput)
if options.generate:
result = oPDFParseDictionary.Get('/Root')
if result != None:
savedRoot = result
elif options.yara == None and options.generateembedded == 0:
if not options.search and not options.key and not options.reference or options.search and object.Contains(options.search):
if oPDFParseDictionary == None:
print('trailer %s' % FormatOutput(object.content, options.raw))
else:
print('trailer')
oPDFParseDictionary.PrettyPrint(' ')
print('')
elif options.key:
if oPDFParseDictionary.parsed != None:
result = oPDFParseDictionary.GetNested(options.key)
if result != None:
print(result)
elif options.reference:
for key, value in oPDFParseDictionary.Retrieve():
if value == [str(options.reference), '0', 'R']:
print('trailer')
oPDFParseDictionary.PrettyPrint(' ')
elif object.type == PDF_ELEMENT_STARTXREF and selectStartXref:
if not options.generate and options.yara == None and options.generateembedded == 0:
print('startxref %d' % object.index)
print('')
elif object.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
if options.search:
if object.Contains(options.search):
PrintObject(object, options)
elif options.key:
contentDictionary = object.ContainsStream()
if not contentDictionary:
contentDictionary = object.content[1:]
oPDFParseDictionary = cPDFParseDictionary(contentDictionary, options.nocanonicalizedoutput)
if oPDFParseDictionary.parsed != None:
result = oPDFParseDictionary.GetNested(options.key)
if result != None:
print(result)
elif options.object:
if MatchObjectID(object.id, options.object):
PrintObject(object, options)
elif options.reference:
if object.References(options.reference):
PrintObject(object, options)
elif options.type:
if EqualCanonical(object.GetType(), optionsType):
PrintObject(object, options)
elif options.hash:
print('obj %d %d' % (object.id, object.version))
rawContent = FormatOutput(object.content, True)
print(' len: %d md5: %s' % (len(rawContent), hashlib.md5(rawContent).hexdigest()))
print('')
elif options.searchstream:
if object.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex, options.overridingfilters):
PrintObject(object, options)
elif options.yara != None:
results = object.StreamYARAMatch(rules, decoders, options.decoderoptions, not options.unfiltered, options.overridingfilters)
if results != None and results != []:
for result in results:
for yaraResult in result[1]:
print('YARA rule%s: %s (%s)' % (IFF(result[0] == '', '', ' (stream decoder: %s)' % result[0]), yaraResult.rule, yaraResult.namespace))
if options.yarastrings:
for stringdata in yaraResult.strings:
print('%06x %s:' % (stringdata[0], stringdata[1]))
print(' %s' % binascii.hexlify(C2BIP3(stringdata[2])))
print(' %s' % repr(stringdata[2]))
PrintObject(object, options)
elif options.generateembedded != 0:
if object.id == options.generateembedded:
PrintGenerateObject(object, options, 8)
else:
PrintObject(object, options)
elif object.type == PDF_ELEMENT_MALFORMED:
try:
fExtract = open(options.extract, 'wb')
try:
fExtract.write(C2BIP3(object.content))
except:
print('Error writing file %s' % options.extract)
fExtract.close()
except:
print('Error writing file %s' % options.extract)
else:
break
if options.stats:
print('Comment: %s' % cntComment)
print('XREF: %s' % cntXref)
print('Trailer: %s' % cntTrailer)
print('StartXref: %s' % cntStartXref)
print('Indirect object: %s' % cntIndirectObject)
for key in sorted(dicObjectTypes.keys()):
print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(map(lambda x: '%d' % x, dicObjectTypes[key]))))
if sum(map(len, dKeywords.values())) > 0:
print('Search keywords:')
for keyword in keywords:
if len(dKeywords[keyword]) > 0:
print(' %s %d: %s' % (keyword, len(dKeywords[keyword]), ', '.join(map(lambda x: '%d' % x, dKeywords[keyword]))))
if options.generate or options.generateembedded != 0:
print(" oPDF.xrefAndTrailer('%s')" % ' '.join(savedRoot))
print('')
print("if __name__ == '__main__':")
print(' Main()')
def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
if sys.version_info[0:3] > __maximum_python_version__:
if enforceMaximumVersion:
print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
print('Please use Python version %d.%d.%d' % __maximum_python_version__)
sys.exit()
else:
print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__)
if sys.version_info[0:3] < __minimum_python_version__:
if enforceMinimumVersion:
print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
print('Please use Python version %d.%d.%d' % __maximum_python_version__)
sys.exit()
else:
print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__)
if __name__ == '__main__':
TestPythonVersion()
Main()