forked from dchad/malware-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_pdf_tokens.py
160 lines (109 loc) · 4.36 KB
/
generate_pdf_tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# generate_pdf_tokens.py
#
# Parse a bunch of PDF files generated by objdump and
# extract keywords such as names and streams.
#
# Inputs : list of PDF files.
# Temp file name for token counts.
# File name for token counts.
#
# Outputs: pdf-tokens.txt
# pdf-token-counts.csv
# row format = [token_name, count]
#
# Author: Derek Chadwick
# Date : 02/01/2017
import os
import re
from csv import writer
def save_token_counts(token_counter_map, out_file_name):
    """Write the PDF token counts to a CSV file, sorted by token name.

    Parameters:
        token_counter_map : dict mapping token name -> occurrence count.
        out_file_name     : base file name; output goes to
                            data/<pid>-<out_file_name> so concurrent worker
                            processes do not clobber each other's files.

    Output row format: header ["token_name", "count"] then one
    [token, count] row per token.
    """
    pid = os.getpid()
    out_file = "data/" + str(pid) + "-" + out_file_name
    # Fix: in Python 3, dict.keys() returns a view object with no .sort()
    # method, so the original keys()/.sort() pair raised AttributeError.
    sorted_keys = sorted(token_counter_map)
    # newline='' is the documented way to open files for the csv module,
    # and the with-statement guarantees the file is closed on any error.
    with open(out_file, 'w', newline='') as fop:
        csv_wouter = writer(fop)
        csv_wouter.writerow(["token_name", "count"])
        outlines = []
        for counter, key in enumerate(sorted_keys, 1):
            outlines.append([key, token_counter_map[key]])
            if counter % 100 == 0:  # flush a batch of rows periodically
                csv_wouter.writerows(outlines)
                outlines = []
                print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))
        # Finish off any rows left in the final partial batch.
        if outlines:
            csv_wouter.writerows(outlines)
    print("Completed writing PDF {:d} tokens.".format(len(sorted_keys)))
    return
def is_ascii(s):
    """Return True if every character of *s* is 7-bit ASCII (code < 128)."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True
def is_printable_ascii(s):
    """Return True if every character of *s* is printable ASCII (0x20-0x7E)."""
    return not any(ord(ch) <= 31 or ord(ch) >= 127 for ch in s)
def generate_pdf_tokens(mp_params):
    """Parse a batch of PDF dump files and count name/stream tokens.

    Parameters:
        mp_params : Multi_Params with
            .file_list  - list of file paths to parse,
            .count_file - base name for the token-count CSV output,
            .token_file - base name for the token list output
                          (NOTE(review): currently unused by this function).

    Tokens of interest are whitespace-separated words that start with "/"
    (PDF names) or "end" (e.g. endobj/endstream), 4-32 characters long.
    Counts are written out via save_token_counts().
    """
    file_list = mp_params.file_list
    out_count_file = mp_params.count_file
    out_token_file = mp_params.token_file  # kept for interface compatibility
    token_counter_map = {}
    pid = os.getpid()
    # Map every separator/noise character to a space in a single C-level
    # pass, replacing the original chain of fifteen str.replace() calls.
    separators = ",\t\\-;:<>()[]\"'\n"
    to_spaces = str.maketrans(separators, " " * len(separators))
    for counter, fname in enumerate(file_list, 1):
        # with-statement guarantees the file is closed even on a parse error.
        with open(fname, 'r') as fip:
            for line in fip:
                line = line.rstrip().translate(to_spaces)
                if len(line) < 4:  # ignore all the chaff and crap
                    continue
                if not is_printable_ascii(line):  # skip weird encodings
                    continue
                # Only lines introducing names or end-markers carry the
                # objects we are interested in.
                if line.startswith(("/", "end")):
                    for token_val in line.split(" "):
                        if len(token_val) < 4 or len(token_val) > 32:
                            continue
                        if token_val.startswith(("/", "end")):
                            token_counter_map[token_val] = token_counter_map.get(token_val, 0) + 1
        if counter % 10 == 0:
            print("{:d} Processed {:d} PDF files.".format(pid, counter))
    save_token_counts(token_counter_map, out_count_file)
    return
class Multi_Params(object):
    """Parameter bundle passed to a worker process.

    Attributes:
        token_file : base name of the token list output file.
        count_file : base name of the token-count CSV output file.
        file_list  : list of input file paths to process.
    """
    def __init__(self, tokenfile="", countfile="", filelist=None):
        # Fix: the original used a mutable default argument (filelist=[]),
        # which is shared across every instance constructed without an
        # explicit list. Use None as the sentinel instead.
        self.token_file = tokenfile
        self.count_file = countfile
        self.file_list = [] if filelist is None else filelist
# Start of Program.
#token_file = 'pe-header-tokens-vs264.txt'
#count_file = 'pe-header-token-counts-vs264.csv'
#ext_drive = '/opt/vs/train4hdr/'

# Configuration: where the PDF dump files live and what the outputs are called.
token_file = "pdf-tokens-non-malicious-set.csv"
count_file = "pdf-token-counts-non-malicious-set.csv"
ext_drive = "/opt/vs/pdfset/"

# Collect the full path of every .pdf file in the target directory.
file_list = os.listdir(ext_drive)
tfiles = [ext_drive + fname for fname in file_list if fname.endswith(".pdf")]
print("Got {:d} PDF files.".format(len(tfiles)))

mp1 = Multi_Params(token_file, count_file, tfiles)
generate_pdf_tokens(mp1)
print("Completed processing {:d} PDF files.".format(len(tfiles)))
# End of Program.