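"""Experiment in classifying Treeherder failure lines.

Gist by @jgraham, created 2016-11-28. Trains a linear classifier to
predict the bug number associated with each failure line, using simple
token-count features plus context from neighbouring lines. Written
against Python 2 and the scikit-learn API of the time (note
``SGDClassifier(n_iter=...)``).
"""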
import cPickle
import os
import re
from collections import defaultdict

import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier

from treeherder.model.models import Job


def feature_dict(error_lines):
    """Yield (line, features) pairs for the given error lines.

    Each feature dict combines fields parsed from the line itself with the
    features of up to five preceding lines, suffixed with their distance
    (e.g. "status" from the immediately preceding line becomes "status_0").
    """
    prev = []
    for line in error_lines:
        print "line %s" % line.line
        features = {"test": None,
                    "subtest": None,
                    "action": None,
                    "status": None,
                    "expected": None,
                    "level": None}
        if line.failure_line:
            # Structured failure line: copy its fields directly and
            # tokenize the signature (or, failing that, the message).
            failure_line = line.failure_line
            for key in features.iterkeys():
                features[key] = getattr(failure_line, key)
            if failure_line.signature:
                features.update(count(tokenize_message(failure_line.signature)))
            elif failure_line.message:
                features.update(count(tokenize_message(failure_line.message)))
        else:
            # Unstructured line: recover what we can with regexps.
            level, status, test, message = split_unstructured(line.line)
            features["level"] = level
            features["status"] = status
            features["test"] = test
            features.update(count(tokenize_message(message)))
        # Drop unset features rather than feeding None to the vectorizer.
        for key in features.keys():
            if features[key] is None:
                del features[key]
        # Store a copy, so the context keys added below don't leak into
        # the context features of later lines.
        prev.append(dict(features))
        rev_prev = reversed(prev)
        try:
            # Skip the entry for the current line itself.
            rev_prev.next()
        except StopIteration:
            pass
        for i in xrange(5):
            try:
                prev_features = rev_prev.next()
            except StopIteration:
                break
            for key, value in prev_features.iteritems():
                if key.startswith("token:"):
                    new_key = "token_%i:%s" % (i, key.split(":", 1)[1])
                else:
                    new_key = "%s_%i" % (key.split("_", 1)[0], i)
                features[new_key] = value
        print features
        yield line, features
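
# A resulting feature dict might look like (hypothetical values):
#   {"status": "FAIL", "test": "dom/test_foo.html", "token:<hex>": 1,
#    "status_0": "FAIL", "token_0:<hex>": 1, ...}
# where the "_0" entries come from the immediately preceding line.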


# Tokenization helpers: collapse IP addresses, hex literals and runs of
# digits into placeholder tokens so they don't explode the vocabulary.
re_split = re.compile(r"\s+")
re_non_word = re.compile(r"^\W+$")
re_ip = re.compile(r"\W*\d{1,3}(?:\.\d{1,3}){3}\W*")
re_hex = re.compile(r"\W*0x[0-9a-fA-F]+\W*")
re_digits = re.compile(r"\W*\d+\W*")


def tokenize_message(message):
    """Split the first line of message into tokens, replacing volatile
    values (IP addresses, hex literals, digit runs) with placeholders."""
    lines = message.splitlines()
    if not lines:
        return []
    tokens = re_split.split(lines[0])
    rv = []
    for token in tokens:
        if re_non_word.match(token):
            continue
        for regexp, replacement in [(re_ip, "<ip_address>"),
                                    (re_hex, "<hex>"),
                                    (re_digits, "<digits>")]:
            if regexp.match(token):
                token = regexp.sub(replacement, token)
                break
        rv.append(token)
    return rv


def count(tokens):
    """Return a "token:<token>" -> occurrence-count mapping."""
    rv = defaultdict(int)
    for item in tokens:
        rv["token:%s" % item] += 1
    return rv
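
# For example (hypothetical input):
#   count(tokenize_message("Assertion failure at 0xdeadbeef"))
#   -> {"token:Assertion": 1, "token:failure": 1,
#       "token:at": 1, "token:<hex>": 1}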


status_re = re.compile(r".*TEST-UNEXPECTED-(\w+)")
log_re = re.compile(r".*(DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL)")
# Strip an optional leading HH:MM:SS timestamp from unstructured lines.
re_unstructured = re.compile(r"\s*(?:(?:\d\d:?){3})?\s*(.*)")


def split_unstructured(line):
    """Best-effort parse of an unstructured log line into
    (level, status, test, message)."""
    level = None
    status = None
    test = None
    parts = line.split(" | ", 3)
    if len(parts) == 3:
        # Looks like "<status or log level> | <test> | <message>".
        if "TEST-UNEXPECTED" in parts[0]:
            m = status_re.match(parts[0])
            if m:
                status = m.group(1)
        elif "CRASH" in parts[0]:
            status = "CRASH"
        else:
            m = log_re.match(parts[0])
            if m:
                level = m.group(1)
        test = parts[1]
        message = parts[2]
    else:
        message = re_unstructured.match(line).group(1)
    return level, status, test, message
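
# For example (hypothetical input):
#   split_unstructured("TEST-UNEXPECTED-FAIL | test_foo.html | assertion failed")
#   -> (None, "FAIL", "test_foo.html", "assertion failed")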


def get_data(jobs, targets=False):
    """Collect feature dicts (and, optionally, target bug numbers) for
    every error line in the given jobs."""
    out_features = []
    if targets:
        out_targets = []
    for job in jobs:
        for step in job.steps.all():
            errors = step.errors.all()
            if not errors:
                continue
            for line, features in feature_dict(errors):
                # This probably isn't the right way to do this
                if line.best_classification and line.best_classification.bug_number is None:
                    continue
                out_features.append(features)
                if targets:
                    # Use 0 as the target for lines with no classification.
                    out_targets.append(line.best_classification.bug_number
                                       if line.best_classification
                                       else 0)
    if targets:
        return out_features, numpy.array(out_targets, dtype=numpy.int64)
    else:
        return out_features


def train(jobs):
    """Fit a DictVectorizer and a linear SVM (hinge-loss SGD) on the
    training jobs."""
    vectorizer = DictVectorizer()
    features, targets = get_data(jobs, True)
    print "Num samples %s" % len(features)
    X_train = vectorizer.fit_transform(features)
    clf = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, n_iter=5, random_state=42).fit(X_train, targets)
    return vectorizer, clf


def test(vectorizer, clf, jobs):
    """Predict a bug number for each line in the test jobs, recording the
    expected value alongside the prediction. Tokens never seen during
    training are silently dropped by the vectorizer."""
    rv = []
    features, expected = get_data(jobs, True)
    for test_features, expected_bug in zip(features, expected):
        X = vectorizer.transform([test_features])
        res = clf.predict(X)[0]
        rv.append({"features": test_features,
                   "expected": expected_bug,
                   "actual": res})
    return rv


def score(data):
    total = len(data)
    correct = sum(1 for x in data if x["actual"] == x["expected"])
    incorrect = total - correct
    ratio = float(correct) / total
    return {"total": total,
            "correct": correct,
            "incorrect": incorrect,
            "ratio": ratio}


def get_input():
    """Return (train, test) jobs, splitting the verified jobs in half
    by id."""
    all_jobs = (Job.objects
                .filter(steps__errors__best_is_verified=True)
                .prefetch_related("steps",
                                  "steps__errors",
                                  "steps__errors__best_classification")
                .order_by('id'))
    midpoint = len(all_jobs) / 2
    train_data = all_jobs[:midpoint]
    test_data = all_jobs[midpoint:]
    return train_data, test_data


def main():
    # Cache the (slow) database query results on disk between runs.
    if not os.path.exists("_ml_cache"):
        jobs_train, jobs_test = get_input()
        with open("_ml_cache", "wb") as f:
            cPickle.dump((jobs_train, jobs_test), f)
    else:
        with open("_ml_cache", "rb") as f:
            jobs_train, jobs_test = cPickle.load(f)
    vectorizer, clf = train(jobs_train)
    results = test(vectorizer, clf, jobs_test)
    print score(results)


if __name__ == "__main__":
    main()
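
# Usage sketch (hypothetical; not part of the original gist): run inside a
# configured Treeherder environment, e.g. from ``./manage.py shell``, so
# that the Django ORM models imported above are usable.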