forked from dchad/malware-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_function_column_names.py
222 lines (167 loc) · 7.66 KB
/
generate_function_column_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# generate_function_column_names.py
#
# Read a bunch of call graph files in GraphViz format and generate column names for function count
# feature sets in txt format.
#
# Input : Call graph files in GraphViz format.
# GraphViz directed graph format:
#
# digraph graph_name {
# node_name1 -> { node_name2 ; node_name3 ; .... }
# .
# .
# .
# node_nameX -> { node_nameY ; node_nameZ ; .... }
# }
#
#
# Output:
# all-reduced-function-column-names-singleline.txt
# all-reduced-function-column-names-multiline.txt
# (lists of function names extracted from the call graphs to be used as column names for the feature sets)
#
# Author: Derek Chadwick
# Date : 14/11/2016
#
# TODO: optimise and many many things
import numpy as np
import pandas as pd
import graph as gra # http://www.python-course.eu/graphs_python.php
import os
from csv import writer
from multiprocessing import Pool
import re
# Determine if a string is all ascii characters.
def is_ascii(s):
return all(ord(c) < 128 for c in s)
def is_printable_ascii(s):
return all(ord(c) > 31 and ord(c) < 127 for c in s)
def generate_column_names(call_graph_file, column_file):
counter = 0
column_names = ['filename']
graph_names = []
graph_name = "none"
graph_functions = {}
#fapi = open("data/APIs.txt")
#defined_apis = fapi.readlines()
#defined_apis = defined_apis[0].split(',')
#fapi.close()
pid = os.getpid()
print('Process id: {:d}'.format(pid))
#column_names_file = 'data/' + str(pid) + '-' + column_file
print('Column names file: {:s}'.format(column_file))
#graph_names_file = 'data/' + str(pid) + '-graph-names.csv'
#print('Graph names file: {:s}'.format(graph_names_file))
with open(call_graph_file, 'r') as cfg:
print("Starting graph file: {:s}".format(call_graph_file))
for line in cfg:
if line.startswith('digraph'):
tokens = line.split()
graph_name = tokens[1]
graph_names.append(graph_name)
continue
line = line.rstrip('\r\n') # get rid of newlines they are annoying.
# get rid of all these things they are annoying.
line = line.replace(';',' ').replace('{',' ').replace('}',' ').replace('->',' ').replace('\'',' ').replace('\"',' ')
parts = line.split() # tokenize call graph line
#graph_name = parts[0] # this is for single line call graphs.
#parts = parts[1:]
#graph_names.append(graph_name)
#graph_functions = {}
for func in parts:
# lets try to reduce the vast number of functions.
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5]
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:5]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:5]
elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
func = func[:5]
elif func.startswith('_'):
func = func[:5]
if len(func) > 16:
func = func[:16]
if func not in column_names:
column_names.append(func)
counter += 1
# Print progress
if ((counter + 1) % 10000) == 0:
print("Call Graph File: {:s} Processed line number {:d} Graph name {:s} Total column names {:d}".format(call_graph_file, counter, graph_name, len(column_names)))
with open(column_file, 'w') as fop:
fw = writer(fop)
fw.writerow(column_names)
print("Completed writing {:d} column names.".format(len(column_names)))
#with open(graph_names_file, 'w') as gras:
# fw = writer(gras)
# fw.writerow(graph_names)
#print("Completed writing {:d} graph names.".format(len(graph_names)))
return
def merge_column_names_single_line(column_name_files, combined_column_name_file):
# Generate the merged column names file single line.
counter = 0
column_names = []
for cnamefile in column_name_files:
with open(cnamefile, 'r') as cras:
print("Singleline Starting file: {:s}".format(cnamefile))
colstr = cras.readline()
colnames = colstr.split(',')
for cname in colnames:
if cname not in column_names:
column_names.append(cname)
counter += 1
# Print progress
if ((counter + 1) % 1000) == 0:
print("{:s} Processed column names {:d}".format(cnamefile, counter))
with open(combined_column_name_file, 'w') as cols:
fw = writer(cols)
fw.writerow(column_names)
print("Singleline - completed writing column names total = {:d}".format(len(column_names)))
return
def merge_column_names_multi_line(column_name_files, combined_column_name_file):
#Generate the merged column names file multiline.
counter = 0
column_names = []
for cnamefile in column_name_files:
with open(cnamefile, 'r') as cras:
print("Multiline Starting file: {:s}".format(cnamefile))
colstr = cras.readline()
colnames = colstr.split(',')
for cname in colnames:
if cname not in column_names:
column_names.append(cname)
counter += 1
# Print progress
if ((counter + 1) % 1000) == 0:
print("{:s} Processed column names {:d}".format(cnamefile, counter))
with open(combined_column_name_file, 'w') as cols:
for cname in column_names:
outline = cname + "\n"
cols.write(outline)
print("Multiline - completed writing column names total = {:d}".format(len(column_names)))
return
# Start of script.
if __name__ == "__main__":
#TODO: parse command line options for input/output file names/sample set id etc...
ext_drive = '/opt/vs/'
sample_set_id = 'vs254'
pecallgraphs = re.compile('\d{3,5}-pe-call-graphs-' + sample_set_id +'.gv') # This is the PID prefix for each file.
file_list = os.listdir(ext_drive)
call_graph_files = []
column_name_files = []
counter = 0
for file_name in file_list:
if pecallgraphs.match(file_name):
call_graph_files.append(file_name)
counter += 1
column_name_file = 'data/reduced-column-names-' + sample_set_id + '-' + str(counter) + '.txt'
column_name_files.append(column_name_file)
print("Found call graph file: {:s}".format(file_name))
print("Column name file: {:s}".format(column_name_file))
for idx, call_graph_file_name in enumerate(call_graph_files):
print("Doing column names: {:s} - {:s}".format(call_graph_file_name, column_name_files[idx]))
generate_column_names('/opt/vs/' + call_graph_file_name, column_name_files[idx])
# We are going to generate single line and multi line files for convenience.
merge_column_names_single_line(column_name_files, 'data/all-column-names-single-line-' + sample_set_id + '.txt')
merge_column_names_multi_line(column_name_files, 'data/all-column-names-multi-line-' + sample_set_id + '.txt')
# End of Script