-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopcode_seq.py
More file actions
97 lines (88 loc) · 2.7 KB
/
opcode_seq.py
File metadata and controls
97 lines (88 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import glob
import re
from collections import *
import os
# import pandas as pd
def getOpcodeSequence_(filename):
    """Extract the opcode words from one assembly listing file.

    Every line of *filename* is stripped and searched for a whitespace
    character, one or more two-digit hex values (each followed by
    whitespace) and then a lowercase word.  That word from the first such
    match on the line is taken as the line's opcode.  Lines without a
    match contribute nothing, and the word ``align`` is skipped.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list of opcode strings in file order; empty when no line matches.

    Raises:
        OSError: If *filename* cannot be opened.
    """
    # whitespace, "XX " hex pairs (one or more), optional padding, lowercase word
    pattern = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')
    opcode_seq = []
    # errors="replace": a listing may contain bytes that are not valid in the
    # default text encoding; substitute them instead of aborting the whole
    # file with UnicodeDecodeError.  The substituted character is neither a
    # hex digit, a lowercase ASCII letter nor whitespace, so it can never
    # become part of a match.
    with open(filename, errors="replace") as f:
        for line in f:
            # Only the first match on a line is used, so search() is enough.
            match = pattern.search(line.strip())
            if match is None:
                continue
            opcode = match.group(2)
            if opcode != "align":
                opcode_seq.append(opcode)
    return opcode_seq
#
# def train_opcode_lm(ops, order=4):
# lm = defaultdict(Counter)
# prefix = ["~"] * order
# prefix.extend(ops)
# data = prefix
# for i in xrange(len(data)-order):
# history, char = tuple(data[i:i+order]), data[i+order]
# lm[history][char]+=1
# def normalize(counter):
# s = float(sum(counter.values()))
# return [(c,cnt/s) for c,cnt in counter.iteritems()]
# outlm = {hist:chars for hist, chars in lm.iteritems()}
# return outlm
#
# def getOpcodeNgram(ops, n=3):
# opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n)]
# opngram = Counter(opngramlist)
# return opngram
#
# basepath = "/home/moon/subtrain/"
# map3gram = defaultdict(Counter)
# subtrain = pd.read_csv('subtrainLabels.csv')
# count = 1
# for sid in subtrain.Id:
# print "counting the 3-gram of the {0} file...".format(str(count))
# count += 1
# filename = basepath + sid + ".asm"
# ops = getOpcodeSequence(filename)
# op3gram = getOpcodeNgram(ops)
# map3gram[sid] = op3gram
#
# cc = Counter([])
# for d in map3gram.values():
# cc += d
# selectedfeatures = {}
# tc = 0
# for k,v in cc.iteritems():
# if v >= 500:
# selectedfeatures[k] = v
# print k,v
# tc += 1
# dataframelist = []
# for fid,op3gram in map3gram.iteritems():
# standard = {}
# standard["Id"] = fid
# for feature in selectedfeatures:
# if feature in op3gram:
# standard[feature] = op3gram[feature]
# else:
# standard[feature] = 0
# dataframelist.append(standard)
# df = pd.DataFrame(dataframelist)
# df.to_csv("3gramfeature.csv",index=False)
def getOpcodeSequence(directorypath):
    """Write an opcode-sequence text file for every ``*.asm`` file in a directory.

    For each ``NAME.asm`` directly inside *directorypath*, the list returned
    by ``getOpcodeSequence_`` is joined with single spaces and written to
    ``NAME.txt`` in that same directory, overwriting any existing file.

    The working directory is changed to *directorypath* only for the
    duration of the call and is restored afterwards, so later calls that
    build paths from ``os.getcwd()`` still resolve against the original
    directory.

    Args:
        directorypath: Directory that contains the ``.asm`` files.

    Raises:
        OSError: If *directorypath* cannot be entered or a file cannot be
            read or written.
    """
    original_cwd = os.getcwd()
    os.chdir(directorypath)
    try:
        for asm_name in glob.glob("*.asm"):
            opcodes = getOpcodeSequence_(asm_name)
            # splitext removes only the final extension, so "a.b.asm" maps to
            # "a.b.txt" rather than being truncated at the first dot.
            txt_name = os.path.splitext(asm_name)[0] + '.txt'
            with open(txt_name, 'w') as wf:
                wf.write(' '.join(opcodes))
    finally:
        # Undo the chdir even if a file fails, so the caller's cwd is intact.
        os.chdir(original_cwd)
if __name__ == '__main__':
    # Script entry point: process the .asm files under <cwd>/data/benign.
    getOpcodeSequence(os.getcwd() + '/data/benign')
    # Alternative input directory (disabled):
    # directorypath = os.getcwd()+'/data/malware'
    # getOpcodeSequence(directorypath)