Skip to content

Instantly share code, notes, and snippets.

@hgascon
Created May 18, 2016 16:17
Show Gist options
  • Save hgascon/b5982733ff7db0282beececa82d74fd0 to your computer and use it in GitHub Desktop.
Save hgascon/b5982733ff7db0282beececa82d74fd0 to your computer and use it in GitHub Desktop.
Read json files extracted from pcaps, embed the features and output a libsvm file
#!/usr/bin/python
""" Read json files extracted from pcaps,
embed the features and output a libsvm file.
"""
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import dump_svmlight_file
def tokenize(s):
    """Split a joined feature string back into its individual feature tokens.

    main() builds one string per record by joining its ``key__value``
    features with the ``=====`` separator; this is the matching tokenizer
    handed to CountVectorizer so that each feature stays a single token.

    Empty pieces are dropped: ``"".split("=====")`` yields ``[""]``, which
    would otherwise register a meaningless empty-string feature for any
    record that has no usable features.

    :param s: feature string with tokens separated by ``=====``.
    :returns: list of non-empty tokens, in their original order.
    """
    return [token for token in s.split("=====") if token]
def _iter_json_objects(text):
    """Yield every top-level JSON value stored back to back in *text*."""
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while True:
        # raw_decode() does not accept leading whitespace, so step over the
        # newlines/blanks that separate consecutive objects ourselves.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            return
        obj, pos = decoder.raw_decode(text, pos)
        yield obj


def main(json_file="../data/tmp/out.json", out_file="out.libsvm"):
    """Embed the JSON records of *json_file* and dump them in libsvm format.

    The input is expected to hold a sequence of JSON objects written one
    after another (e.g. one per line). Each object becomes one sample; every
    ``key: value`` pair except the ignored keys becomes the binary feature
    ``key__value``.

    :param json_file: path of the file with the concatenated JSON objects.
    :param out_file: path of the libsvm file to write.
    :raises ValueError: if the file contains no JSON object or holds
        malformed JSON.
    """
    # The context manager guarantees the handle is closed even if reading
    # or decoding fails.
    with open(json_file, 'r') as fd:
        # Decode object by object instead of splitting the text on '}\n':
        # the split silently dropped the last record when the file lacked a
        # trailing newline and broke on nested/pretty-printed objects.
        requests = list(_iter_json_objects(fd.read()))

    if not requests:
        raise ValueError("no JSON objects found in %s" % json_file)

    # Keys that must not be turned into features.
    ignore_feats = frozenset(["_pcap"])

    # One document per record: its "key__value" features joined with the
    # separator that tokenize() splits on.
    corpus = [
        "=====".join(
            "__".join([k, str(v)])
            for k, v in r.items()
            if k not in ignore_feats
        )
        for r in requests
    ]

    # binary=True records presence/absence of a feature rather than counts.
    vec = CountVectorizer(tokenizer=tokenize,
                          binary=True,
                          lowercase=True)
    X = vec.fit_transform(corpus)

    # NOTE: labels are random placeholders in [0, 10); they carry no
    # information about the samples and differ between runs.
    y = np.random.randint(0, 10, X.shape[0])
    dump_svmlight_file(X, y, out_file, zero_based=False)
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse tokenize) does not read and write files as a side
# effect.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment