forked from dchad/malware-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_extraction_pe_function_counts.py
254 lines (181 loc) · 8.65 KB
/
feature_extraction_pe_function_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# generate_function_counts.py
#
# Read a bunch of call graph files in GraphViz format and generate function count
# feature sets in CSV format.
#
# Input : Call graph files in GraphViz format.
# GraphViz directed graph format:
#
# digraph graph_name {
# node_name1 -> { node_name2 ; node_name3 ; .... }
# .
# .
# .
# node_nameX -> { node_nameY ; node_nameZ ; .... }
# }
#
#
# Output: all-reduced-function-counts.csv
# row format = [file_name, [list of function names]...]
#
# all-reduced-function-column-names-singleline.txt
# all-reduced-function-column-names-multiline.txt
# (lists of function names extracted from the call graphs to be used as column names for the feature sets)
#
# Author: Derek Chadwick
# Date : 14/11/2016
#
# TODO: optimise and many many things
import numpy as np
import pandas as pd
import graph as gra # http://www.python-course.eu/graphs_python.php
import os
from csv import writer
from multiprocessing import Pool
import re
def reduce_column_names(column_names_file):
# Reduce the number of column names, there are just too many.
colf = open(column_names_file, 'r')
all_column_names = []
column_lines = colf.readlines()
for line in column_lines:
# lets try to reduce the vast number of functions.
func = line.rstrip()
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5]
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:5]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:5]
elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
func = func[:5]
elif func.startswith('_'):
func = func[:5]
if len(func) > 16:
func = func[:16]
if len(func) < 1:
continue
if func not in all_column_names:
all_column_names.append(func)
col_names_len = len(all_column_names)
colf.close()
print("Column Names Length: {:d}".format(col_names_len))
return all_column_names
def generate_function_counts(call_graph_file_list, column_names_file, feature_file_name):
# Generate function counts from graph files of the ASM malware samples.
counter = 0
error_count = 0
graph_name = 'none'
graph_counter = 0
all_column_names = reduce_column_names(column_names_file)
col_names_len = len(all_column_names)
feature_column_names = all_column_names[1:] # Skip the "filename" column.
pid = os.getpid()
print('Process id: {:d}'.format(pid))
print('Call graph function counts file: {:s}'.format(feature_file_name))
feature_file = open(feature_file_name, 'w')
fw = writer(feature_file)
fw.writerow(all_column_names)
call_graph_function_features = []
function_counts = [0] * (col_names_len - 1) # NOTE: do not include the sample name in the function counts.
for call_graph_file in call_graph_file_list:
print("Processing call graph file: {:s}".format(call_graph_file))
with open(call_graph_file, 'r') as cfg:
for line in cfg:
if line.startswith(' }'): # End of current graph, append the function counts to the feature list.
call_graph_function_features.append([graph_name] + function_counts)
continue
if line.startswith('digraph'):
function_counts = [0] * (col_names_len - 1) # NOTE: do not include the sample name in the function counts.
tokens = line.split(' ')
graph_name = tokens[1]
graph_counter += 1
continue
line.rstrip() # get rid of newlines they are annoying.
# get rid of all these things they are annoying.
line = line.replace(';',' ').replace('{',' ').replace('}',' ').replace('->',' ').replace('\'',' ').replace('\"',' ')
parts = line.split() # tokenize graph line
function_dict = {}
# now generate the function counts for this call graph
for func in parts:
# lets try to reduce the vast number of functions.
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5]
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:5]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:5]
elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
func = func[:5]
elif func.startswith('_'):
func = func[:5]
if len(func) > 16:
func = func[:16]
if (func in function_dict):
function_dict[func] += 1
else:
function_dict[func] = 1
# now generate the output row for this call graph
for func in function_dict:
for idx, cname in enumerate(feature_column_names): # NOTE: Do not include the "filename" column.
if func == cname:
function_counts[idx] = function_dict[func]
break
# Print progress and write out rows
counter += 1
if ((counter + 1) % 10000) == 0:
print("{:d} GraphViz File: {:s} Graph: {:s} Graph Count: {:d} Line Count: {:d}".format(pid, call_graph_file, graph_name, graph_counter, counter))
fw.writerows(call_graph_function_features)
call_graph_function_features = []
# Write remaining files
if len(call_graph_function_features) > 0:
fw.writerows(call_graph_function_features)
call_graph_function_features = []
# We are done.
feature_file.close()
validate_feature_set(feature_file_name, col_names_len)
print("Completed processing {:d} graph lines.".format(counter))
return
def validate_feature_set(feature_set_file_name, feature_set_len):
fip = open(feature_set_file_name, 'r')
fop = open("data/temp-pe-function-count-errors.txt", 'w')
feature_count = 0
error_count = 0
for line in fip:
line = line.rstrip()
tokens = line.split(',')
sample_name = tokens[0]
feature_count = len(tokens)
if feature_count != feature_set_len:
print("ERROR: Feature counts inconsistent: {:d} {:d}".format(feature_count, feature_set_len))
error_count += 1
fop.write(str(feature_count) + " : " + line[:128] + '\n')
if sample_name.endswith('.elf'):
print("ERROR: ELF sample in PE feature set: {:s}".format(sample_name))
error_count += 1
fop.write(str(feature_count) + " : " + line[:128] + '\n')
fip.close()
fop.close()
print("Feature set validation: {:d} errors.".format(error_count))
return
# Start of script.
if __name__ == "__main__":
#TODO: parse command line options for input/output file names.
ext_drive = '/opt/vs/'
sample_set_id = 'vs263'
#sample_set_id = 'apt'
feature_file_name = 'data/call-graph-reduced-function_counts-' + sample_set_id + '.csv'
# DEPRECATED: column_name_file = 'data/all-column-names-single-line-' + sample_set_id + '.txt'
column_name_file = 'data/all-column-names-multi-line-' + sample_set_id + '.txt'
pecallgraphs = re.compile('\d{3,5}-pe-call-graphs-' + sample_set_id +'.gv')
file_list = os.listdir(ext_drive)
call_graph_files = []
counter = 0
for file_name in file_list:
if pecallgraphs.match(file_name):
call_graph_files.append('/opt/vs/' + file_name)
counter += 1
print("Found call graph file: {:s}".format(file_name))
generate_function_counts(call_graph_files, column_name_file, feature_file_name)
print("Completed function counts for sample set id: {:s}".format(sample_set_id))
# End of Script