By collecting bash operation logs from Linux servers, we can train a model to learn a specific user's command habits and then flag operations that deviate from those habits as anomalous.
We use the SEA dataset, which covers the behavior logs of more than 70 UNIX users; the data comes from the commands recorded by the UNIX acct accounting mechanism. Each user in the SEA dataset contributed 15,000 commands. Fifty users were randomly selected as normal users, and commands from the remaining users were randomly inserted into the normal users' command blocks to simulate insider masquerader attacks. Each user's 15,000 commands are divided into 150 blocks of 100 commands each; the first 80 blocks serve as the training set and the remaining 70 blocks as the test set. A sample of the data (commands shown space-separated; names such as MediaMai and launchef appear truncated in the raw data):
cpp sh xrdb cpp sh xrdb mkpts test stty hostname date echo [ find chmod tty echo env echo sh user
env wait4wm xhost xsetroot reaper xmodmap sh [ cat stty hostname date echo [ find chmod tty echo
sh more sh more sh more sh more sh more sh more sh more sh more sh more sh more sh more sh
launchef launchef sh 9term sh launchef sh launchef hostname [ cat stty hostname date echo [ find
chmod tty echo sh more sh more sh ex sendmail sendmail sh MediaMai sendmail sh rm MediaMai sh rm
MediaMai launchef launchef sh sh more sh sh rm MediaMai netstat netscape netscape netscape
netscape netscape netscape netscape netscape netscape netscape netscape netscape netscape
netscape netscape netscape netscape netscape netscape sh netscape more sh rm sh MediaMai =
telnet tput netscape netscape netscape netscape netscape
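The 150 labels per user come from two places: the first 50 blocks of every user are known to be clean, and label.txt in the MasqueradeDat directory supplies the labels for blocks 51-150 (1 = injected masquerade block, 0 = normal), one column per user. Below is a minimal sketch of assembling the labels for User7, assuming that layout; the helper name block_labels_for_user is hypothetical, and the full script that follows does the same thing with its get_label function.

# A minimal sketch, assuming the MasqueradeDat layout described above.
# block_labels_for_user is a hypothetical helper, not part of the dataset.
def block_labels_for_user(label_path, user_index):
    with open(label_path) as f:
        # one row per block 51-150; column user_index belongs to this user
        tail = [int(line.split()[user_index]) for line in f if line.strip()]
    return [0] * 50 + tail  # the first 50 blocks are always normal

y = block_labels_for_user("../data/MasqueradeDat/label.txt", 6)  # User7
print(len(y))  # 150 labels, one per block of 100 commands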
# -*- coding:utf-8 -*-
import numpy as np
from nltk.probability import FreqDist
from sklearn.neighbors import KNeighborsClassifier
from tflearn.data_utils import to_categorical, pad_sequences
import tflearn

# Split point: the first N command blocks are used for training,
# the remaining 150 - N = 70 blocks for testing.
N = 80


def load_user_cmd_new(filename):
    """Read a user's command log as blocks of 100 commands and return
    the blocks together with the command vocabulary."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                x = []
                i = 0
    # FreqDist.keys() lists the distinct commands (sorted by descending
    # frequency in older nltk releases); wrap in list() so it is indexable.
    fdist = list(FreqDist(dist).keys())
    return cmd_list, fdist


def load_user_cmd(filename):
    """Variant used by the KNN baseline: also returns the 50 most and
    50 least frequent commands across the whole log."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                x = []
                i = 0
    fdist = list(FreqDist(dist).keys())
    dist_max = set(fdist[0:50])
    dist_min = set(fdist[-50:])
    return cmd_list, dist_max, dist_min


def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):
    """Hand-crafted features per block: number of distinct commands, plus
    the overlap of the block's 10 most/least frequent commands with the
    user's overall most/least frequent commands."""
    user_cmd_feature = []
    for cmd_block in user_cmd_list:
        f1 = len(set(cmd_block))
        fdist = list(FreqDist(cmd_block).keys())
        f2 = len(set(fdist[0:10]) & set(dist_max))
        f3 = len(set(fdist[-10:]) & set(dist_min))
        user_cmd_feature.append([f1, f2, f3])
    return user_cmd_feature


def get_user_cmd_feature_new(user_cmd_list, dist):
    """One-hot encode every command, so each block becomes a sequence of
    100 vectors of length len(dist)."""
    user_cmd_feature = []
    for cmd_list in user_cmd_list:
        x = []
        for cmd in cmd_list:
            v = [0] * len(dist)
            for i in range(len(dist)):
                if cmd == dist[i]:
                    v[i] = 1
            x.append(v)
        user_cmd_feature.append(x)
    return user_cmd_feature


def get_label(filename, index=0):
    """Read one label column from label.txt (one row per command block)."""
    x = []
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            x.append(int(line.split()[index]))
    return x


def do_knn(x_train, y_train, x_test, y_test):
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(x_train, y_train)
    y_predict = neigh.predict(x_test)
    score = np.mean(y_test == y_predict) * 100
    print(score)


def do_rnn(x_train, x_test, y_train, y_test):
    global n_words
    print("GET n_words embedding %d" % n_words)
    # Each block is already a fixed-length sequence of 100 one-hot
    # vectors, so no sequence padding is needed:
    #x_train = pad_sequences(x_train, maxlen=100, value=0.)
    #x_test = pad_sequences(x_test, maxlen=100, value=0.)
    # Converting labels to binary class vectors
    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)
    # Network building: two stacked 10-unit LSTM layers over sequences
    # of length 100, followed by a 2-way softmax
    net = tflearn.input_data(shape=[None, 100, n_words])
    net = tflearn.lstm(net, 10, return_seq=True)
    net = tflearn.lstm(net, 10)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             name="output", loss='categorical_crossentropy')
    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(x_train, y_train, validation_set=(x_test, y_test),
              show_metric=True, batch_size=32, run_id="maidou")


if __name__ == '__main__':
    user_cmd_list, dist = load_user_cmd_new("../data/MasqueradeDat/User7")
    #print("Dist:(%s)" % dist)
    n_words = len(dist)
    user_cmd_feature = get_user_cmd_feature_new(user_cmd_list, dist)
    # Column index 6 of label.txt holds User7's labels for blocks 51-150;
    # the first 50 blocks are known to be normal.
    labels = get_label("../data/MasqueradeDat/label.txt", 6)
    y = [0] * 50 + labels
    x_train = user_cmd_feature[0:N]
    y_train = y[0:N]
    x_test = user_cmd_feature[N:150]
    y_test = y[N:150]
    #print(x_train)
    do_rnn(x_train, x_test, y_train, y_test)
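For completeness, here is a hedged sketch of how the trained model could be used to flag individual command blocks. It assumes do_rnn() is modified to end with return model (the original script discards it); the argmax thresholding is illustrative, not part of the source.

# A minimal usage sketch, assuming do_rnn() ends with `return model`.
# tflearn's DNN.predict() returns one softmax distribution per input block.
import numpy as np

model = do_rnn(x_train, x_test, y_train, y_test)
probs = np.array(model.predict(x_test))  # shape: (70, 2)
pred = probs.argmax(axis=1)              # 1 = predicted masquerade block
for i, (p, t) in enumerate(zip(pred, y_test)):
    if p == 1:
        print("block %d flagged as masquerade (true label: %d)" % (N + i, t))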
Results:
Training Step: 30 | total loss: 0.10088 | time: 1.185s
| Adam | epoch: 010 | loss: 0.10088 - acc: 0.9591 | val_loss: 0.18730 - val_acc: 0.9571 -- iter: 80/80