Last active
June 2, 2017 11:01
-
-
Save chrisburr/ba59fa2eeef03f34fceae46a83bdd8b8 to your computer and use it in GitHub Desktop.
strace parser based on fabricate - https://github.com/SimonAlfie/fabricate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import re | |
# Public API of this module: only the log parser is exported.
__all__ = ['StraceLog']
# Regular expressions for parsing of strace log.
# Each pattern starts with the pid that prefixes the log line and, where the
# call refers to a file, captures its path as ``name``.

# open("name", flags...) and the equivalent openat(AT_FDCWD, "name", flags...)
# (AT_FDCWD means "relative to the current working directory", exactly like
# open()).  openat() calls relative to another directory fd are not matched.
_open_re = re.compile(r'(?P<pid>\d+)\s+(?:open\(|openat\(AT_FDCWD, )"(?P<name>[^"]*)", (?P<mode>[^,)]*)')
_stat_re = re.compile(r'(?P<pid>\d+)\s+l?stat(?:64)?\("(?P<name>[^"]*)", .*')  # stat,lstat,stat64,lstat64
_execve_re = re.compile(r'(?P<pid>\d+)\s+execve\("(?P<name>[^"]*)", .*')
_creat_re = re.compile(r'(?P<pid>\d+)\s+creat\("(?P<name>[^"]*)", .*')
# ``result`` is the return value of mkdir ("0" on success)
_mkdir_re = re.compile(r'(?P<pid>\d+)\s+mkdir\("(?P<name>[^"]*)", .*\)\s*=\s(?P<result>-?[0-9]*).*')
# For rename and symlink ``name`` is the second argument (the path created)
_rename_re = re.compile(r'(?P<pid>\d+)\s+rename\("[^"]*", "(?P<name>[^"]*)"\)')
_symlink_re = re.compile(r'(?P<pid>\d+)\s+symlink\("[^"]*", "(?P<name>[^"]*)"\)')
# strace reports a fatal signal as "PID +++ killed by SIGNAL +++"; the "+++ "
# marker is optional so that a bare "PID killed by ..." line still matches.
_kill_re = re.compile(r'(?P<pid>\d+)\s+(?:\+\+\+ )?killed by.*')
_chdir_re = re.compile(r'(?P<pid>\d+)\s+chdir\("(?P<cwd>[^"]*)"\)')
_exit_group_re = re.compile(r'(?P<pid>\d+)\s+exit_group\((?P<status>.*)\).*')
# ``pid_clone`` is the calling (parent) process, ``pid`` the returned child
# pid; ``pid`` is empty when the call did not return a pid (e.g. "= -1 ...").
_clone_re = re.compile(r'(?P<pid_clone>\d+)\s+(clone|fork|vfork)\(.*\)\s*=\s*(?P<pid>\d*)')

# Regular expressions for detecting interrupted lines in strace log
# 3618 clone( <unfinished ...>
# 3618 <... clone resumed> child_stack=0, flags=CLONE, child_tidptr=0x7f83deffa780) = 3622
# The dots are escaped so that only the literal "<unfinished ...>" marker matches.
_unfinished_start_re = re.compile(r'(?P<pid>\d+)(?P<body>.*)<unfinished \.\.\.>$')
_unfinished_end_re = re.compile(r'(?P<pid>\d+)\s+\<\.\.\..*\>(?P<body>.*)')
class StraceProcess(object):
    """State collected for one traced process (one pid of the strace log).

    Attributes:
        cwd: working directory used to resolve the paths the process touches.
        deps: set of files recorded as dependencies of the process.
        outputs: set of files recorded as outputs of the process.
        delayed: True while the lines of this process are being cached
            instead of being processed.
        delayed_lines: the cached log lines, in the order they were added.
    """

    def __init__(self, cwd='.', delayed=False):
        self.cwd = cwd
        self.delayed = delayed
        self.delayed_lines = []
        self.deps, self.outputs = set(), set()

    def add_dep(self, dep):
        """Record ``dep`` as a dependency (duplicates are collapsed)."""
        self.deps.add(dep)

    def add_output(self, output):
        """Record ``output`` as an output (duplicates are collapsed)."""
        self.outputs.add(output)

    def add_delayed_line(self, line):
        """Append ``line`` to the lines waiting to be processed later."""
        self.delayed_lines.append(line)

    def __str__(self):
        fields = (self.cwd, self.deps, self.outputs)
        return '<StraceProcess cwd=%s deps=%s outputs=%s>' % fields
class StraceLog(object):
    """Extract the files read and written by a command from its strace log.

    The log lines are matched against the module-level regular expressions,
    all of which expect each line to start with the pid of the process that
    made the call.
    """

    def __init__(self, base_dir, included_dirs=None, max_dir_depth=100,
                 ignore_patterns=None, ignored_prefixes=None):
        """Configure which files are considered relevant.

        Args:
            base_dir: absolute paths located under this directory are stored
                relative to it.
            included_dirs: directories in which files are searched for;
                defaults to ``[base_dir]``.
            max_dir_depth: files whose path below an included directory
                contains more than this many path separators are skipped.
            ignore_patterns: objects with a ``search(filename)`` method
                (e.g. compiled regular expressions); a file matching any of
                them is skipped.  Defaults to no patterns.
            ignored_prefixes: files located in a directory whose name starts
                with one of these prefixes are skipped.  Defaults to
                ``['.']`` (hidden directories).
        """
        if included_dirs is None:
            included_dirs = [base_dir]
        if ignore_patterns is None:
            ignore_patterns = []
        if ignored_prefixes is None:
            # Built per instance so that instances never share one list
            ignored_prefixes = ['.']
        self.base_dir = base_dir
        self.included_dirs = included_dirs
        self.ignore_patterns = ignore_patterns
        self.max_dir_depth = max_dir_depth
        self.ignored_prefixes = ignored_prefixes

    @staticmethod
    def _is_under(path, directory):
        """Return True if ``path`` is ``directory`` or lies below it.

        Unlike a plain ``startswith`` the match has to end on a path
        separator, so ``/repo/tmp2/x`` is not reported as being under
        ``/repo/tmp``.
        """
        if path == directory:
            return True
        if not directory.endswith(os.sep):
            directory += os.sep
        return path.startswith(directory)

    def parse_trace(self, trace_filename):
        """Parse the strace log ``trace_filename``.

        Sets ``self.status`` to the status of the last ``exit_group`` call
        found in the log (0 if there is none) and prints a summary.

        Returns:
            A ``(deps, outputs)`` tuple of sets of file names.  Files that
            are both read and written are only reported as outputs.

        Raises:
            RuntimeError: if the log is empty and no exception is currently
                being handled; when called while handling an exception, that
                exception is re-raised instead.
        """
        # if strace failed to run, re-throw the exception
        # we can tell this happened if the file is empty
        if os.stat(trace_filename).st_size == 0:
            if sys.exc_info()[1] is not None:
                raise  # previous exception
            raise RuntimeError('strace log %r is empty, strace probably '
                               'failed to run' % (trace_filename,))
        self.status = 0
        processes = {}   # dictionary of processes (key = pid)
        unfinished = {}  # interrupted entries in strace log (key = pid)
        with open(trace_filename, 'rt') as fp:
            for line in fp:
                self._match_line(line, processes, unfinished)

        # collect outputs and dependencies from all processes
        deps = set()
        outputs = set()
        for process in processes.values():
            deps |= process.deps
            outputs |= process.outputs
        # If a file is in inputs and outputs remove it from inputs
        # TODO Is this a bad idea?
        deps = deps - outputs

        print('*'*100)
        print('Status = ', self.status)
        print('*'*43, 'Dependencies', '*'*43)
        print(list(deps))
        print('*'*45, 'Outputs', '*'*46)
        print(list(outputs))
        print('*'*100)
        return deps, outputs

    def _is_relevant(self, filename):
        """ Return True if file is in the dependency search directories. """
        # NOTE(review): a relative filename is resolved against the current
        # working directory of this Python process, not against base_dir.
        absolute_filename = os.path.abspath(filename)

        # Ignore directories
        if os.path.isdir(absolute_filename):
            return False

        # Check if the file is in one of the included directories
        is_in_search_directory = False
        for path in self.included_dirs:
            path = os.path.abspath(path)
            if not self._is_under(absolute_filename, path):
                continue
            rest = absolute_filename[len(path):]
            # Skip files in directories starting with an ignored prefix
            if any(os.sep+ignore_prefix in os.sep+os.path.dirname(rest)
                   for ignore_prefix in self.ignored_prefixes):
                continue
            # Skip files deeper than max_dir_depth
            if rest.count(os.sep) > self.max_dir_depth:
                continue
            is_in_search_directory = True
            break
        if not is_in_search_directory:
            return False

        # Ensure the file doesn't match any of the ignore patterns
        if any(i.search(filename) for i in self.ignore_patterns):
            return False

        # The file is only relevant if it still exists
        return os.path.lexists(filename)

    def _match_line(self, line, processes, unfinished):
        """Process one log line, updating ``processes`` and ``unfinished``.

        Args:
            line: the raw line of the strace log.
            processes: dict mapping pid to its StraceProcess; updated in place.
            unfinished: dict mapping pid to the first half of a call that was
                interrupted ("<unfinished ...>"); updated in place.
        """
        # look for split lines
        unfinished_start_match = _unfinished_start_re.match(line)
        unfinished_end_match = _unfinished_end_re.match(line)
        if unfinished_start_match:
            # Remember the first half until the "resumed" line shows up
            pid = unfinished_start_match.group('pid')
            body = unfinished_start_match.group('body')
            unfinished[pid] = pid + ' ' + body
            return
        elif unfinished_end_match:
            pid = unfinished_end_match.group('pid')
            body = unfinished_end_match.group('body')
            if pid not in unfinished:
                # Looks like we need to handle an strace bug here
                # I think it is safe to ignore as I have only seen futex calls
                # which strace should not output
                print('Warning: Resume without unfinished in strace output '
                      '(strace bug?)', line.strip())
                return
            # Glue both halves together and handle them as a regular line
            line = unfinished[pid] + body
            del unfinished[pid]

        is_output = False
        open_match = _open_re.match(line)
        stat_match = _stat_re.match(line)
        execve_match = _execve_re.match(line)
        creat_match = _creat_re.match(line)
        mkdir_match = _mkdir_re.match(line)
        symlink_match = _symlink_re.match(line)
        rename_match = _rename_re.match(line)
        clone_match = _clone_re.match(line)

        kill_match = _kill_re.match(line)
        if kill_match:
            return

        match = None
        if execve_match:
            pid = execve_match.group('pid')
            match = execve_match  # Executables can be dependencies
            if pid not in processes and len(processes) == 0:
                # This is the first process so create dict entry
                processes[pid] = StraceProcess()
        elif clone_match:
            pid = clone_match.group('pid')
            pid_clone = clone_match.group('pid_clone')
            # pid is empty when the call did not return a child pid (e.g.
            # "= -1 EAGAIN"); there is no child process to register then
            if pid:
                # The parent may be unknown if none of its lines were seen
                # yet; fall back to the default cwd instead of failing
                parent = processes.get(pid_clone)
                parent_cwd = parent.cwd if parent is not None else '.'
                if pid not in processes:
                    # Simple case where there are no delayed lines
                    processes[pid] = StraceProcess(parent_cwd)
                else:
                    # Some line processing was delayed due to an interrupted
                    # clone_match
                    child = processes[pid]
                    child.cwd = parent_cwd   # Set the correct cwd
                    child.delayed = False    # Matching is no longer delayed
                    for delayed_line in child.delayed_lines:
                        # Process all the delayed lines
                        self._match_line(delayed_line, processes, unfinished)
                    child.delayed_lines = []  # Clear the lines
        elif open_match:
            match = open_match
            mode = match.group('mode')
            if 'O_WRONLY' in mode or 'O_RDWR' in mode:
                # it's an output file if opened for writing
                is_output = True
        elif stat_match:
            match = stat_match
        elif creat_match:
            match = creat_match
            # a created file is an output file
            is_output = True
        elif mkdir_match:
            match = mkdir_match
            if match.group('result') == '0':
                # a created directory is an output file
                is_output = True
        elif symlink_match:
            match = symlink_match
            # the created symlink is an output file
            is_output = True
        elif rename_match:
            match = rename_match
            # the destination of a rename is an output file
            is_output = True

        if match:
            name = match.group('name')
            pid = match.group('pid')
            if not self._matching_is_delayed(processes, pid, line):
                cwd = processes[pid].cwd
                if cwd != '.':
                    name = os.path.join(cwd, name)

                # Normalise path name to ensure files are only listed once
                name = os.path.normpath(name)

                # if it's an absolute path name under the build directory,
                # make it relative to base_dir before saving to .deps file
                base = self.base_dir.rstrip(os.path.sep) or os.path.sep
                if os.path.isabs(name) and self._is_under(name, base):
                    name = name[len(base):].lstrip(os.path.sep)

                if self._is_relevant(name):
                    if is_output:
                        processes[pid].add_output(name)
                    else:
                        processes[pid].add_dep(name)

        match = _chdir_re.match(line)
        if match:
            pid = match.group('pid')
            if not self._matching_is_delayed(processes, pid, line):
                # os.path.join keeps an absolute target as is and appends a
                # relative one to the previous cwd
                processes[pid].cwd = os.path.join(processes[pid].cwd,
                                                  match.group('cwd'))

        match = _exit_group_re.match(line)
        if match:
            self.status = int(match.group('status'))

    def _matching_is_delayed(self, processes, pid, line):
        """Return True (after caching ``line``) if ``pid`` is delayed.

        A pid that is not in ``processes`` yet is registered as a delayed
        process; its lines are cached until the clone line that returns this
        pid is processed.
        """
        if pid not in processes:
            processes[pid] = StraceProcess(delayed=True)

        process = processes[pid]
        if process.delayed:
            process.add_delayed_line(line)
            return True
        return False
if __name__ == '__main__':
    # Command line use: parse the strace log given as first argument and
    # print the summary produced by StraceLog.parse_trace
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback
        sys.exit('usage: %s TRACE_LOG' % sys.argv[0])
    StraceLog('/repo/tmp').parse_trace(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example workflow used to produce an strace log for the parser.
# Only a.txt is declared as output, although the rule writes b.txt and c.txt
# as well (presumably so that undeclared outputs show up in the trace).
rule all:
    output:
        'a.txt'
    run:
        for filename, text in (('a.txt', 'AAA'),
                               ('b.txt', 'BBB'),
                               ('c.txt', 'CCC')):
            with open(filename, 'wt') as fp:
                fp.write(text)

# Run using:
# strace -fo test.tracelog -e \
# trace='open,stat,stat64,lstat,lstat64,execve,exit_group,chdir,mkdir,rename,clone,vfork,fork,symlink,creat' \
# snakemake --force
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment