Skip to content

Instantly share code, notes, and snippets.

@chrisburr
Last active June 2, 2017 11:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisburr/ba59fa2eeef03f34fceae46a83bdd8b8 to your computer and use it in GitHub Desktop.
Save chrisburr/ba59fa2eeef03f34fceae46a83bdd8b8 to your computer and use it in GitHub Desktop.
strace parser based on fabricate - https://github.com/SimonAlfie/fabricate
#!/usr/bin/env python
import sys
import os
import re
__all__ = [
'StraceLog'
]
# Regular expressions for parsing of strace log.
# Every pattern is anchored (via ``.match``) on the pid column at the start of
# the line and exposes it as the ``pid`` group; file-related patterns expose
# the path argument as ``name``.
_open_re = re.compile(r'(?P<pid>\d+)\s+open\("(?P<name>[^"]*)", (?P<mode>[^,)]*)')
_stat_re = re.compile(r'(?P<pid>\d+)\s+l?stat(?:64)?\("(?P<name>[^"]*)", .*')  # stat,lstat,stat64,lstat64
_execve_re = re.compile(r'(?P<pid>\d+)\s+execve\("(?P<name>[^"]*)", .*')
_creat_re = re.compile(r'(?P<pid>\d+)\s+creat\("(?P<name>[^"]*)", .*')
# ``result`` is the syscall return value, e.g. '0' on success or '-1' on error
_mkdir_re = re.compile(r'(?P<pid>\d+)\s+mkdir\("(?P<name>[^"]*)", .*\)\s*=\s(?P<result>-?[0-9]*).*')
# For rename/symlink only the second argument (the path being created) is captured
_rename_re = re.compile(r'(?P<pid>\d+)\s+rename\("[^"]*", "(?P<name>[^"]*)"\)')
_symlink_re = re.compile(r'(?P<pid>\d+)\s+symlink\("[^"]*", "(?P<name>[^"]*)"\)')
# strace reports a signal death as "<pid> +++ killed by SIGKILL +++"; the
# "+++ " marker is optional so the bare "<pid> killed by ..." form still matches
_kill_re = re.compile(r'(?P<pid>\d+)\s+(?:\+\+\+ )?killed by.*')
_chdir_re = re.compile(r'(?P<pid>\d+)\s+chdir\("(?P<cwd>[^"]*)"\)')
_exit_group_re = re.compile(r'(?P<pid>\d+)\s+exit_group\((?P<status>.*)\).*')
# ``pid_clone`` is the calling (parent) process, ``pid`` the new child.
# Requiring at least one digit means a failed call ("= -1 EAGAIN ...") does not
# match at all instead of yielding an empty child pid.
_clone_re = re.compile(r'(?P<pid_clone>\d+)\s+(clone|fork|vfork)\(.*\)\s*=\s*(?P<pid>\d+)')
# Regular expressions for detecting interrupted lines in strace log
# 3618 clone( <unfinished ...>
# 3618 <... clone resumed> child_stack=0, flags=CLONE, child_tidptr=0x7f83deffa780) = 3622
# The dots are escaped so that only the literal "<unfinished ...>" marker matches
_unfinished_start_re = re.compile(r'(?P<pid>\d+)(?P<body>.*)<unfinished \.\.\.>$')
_unfinished_end_re = re.compile(r'(?P<pid>\d+)\s+\<\.\.\..*\>(?P<body>.*)')
class StraceProcess(object):
    """State tracked for a single pid while an strace log is being read.

    Attributes are plain and public:

    * ``cwd`` -- working directory recorded for the process ('.' by default).
    * ``deps`` / ``outputs`` -- sets of file names read / written.
    * ``delayed`` -- flag stored for the parser; when true the parser queues
      the process's lines in ``delayed_lines`` instead of handling them.
    * ``delayed_lines`` -- the queued raw log lines, in arrival order.
    """

    def __init__(self, cwd='.', delayed=False):
        self.cwd, self.delayed = cwd, delayed
        self.deps, self.outputs = set(), set()
        self.delayed_lines = []

    def add_dep(self, dep):
        """Record *dep* as a file this process depends on."""
        self.deps.add(dep)

    def add_output(self, output):
        """Record *output* as a file this process produced."""
        self.outputs.add(output)

    def add_delayed_line(self, line):
        """Queue a raw log *line* to be handled later."""
        self.delayed_lines.append(line)

    def __str__(self):
        fields = (self.cwd, self.deps, self.outputs)
        return '<StraceProcess cwd=%s deps=%s outputs=%s>' % fields
class StraceLog(object):
    """Extract file dependencies and outputs from an strace log.

    The log is expected to have one syscall per line, each prefixed by the pid
    of the calling process. Files are attributed to processes, filtered by
    ``_is_relevant`` and finally merged into one set of dependencies and one
    set of outputs.
    """

    def __init__(self, base_dir, included_dirs=None, max_dir_depth=100,
                 ignore_patterns=None, ignored_prefixes=None):
        """Configure which files are considered relevant.

        base_dir: absolute paths below this directory are stored relative to it.
        included_dirs: directories to search for files (default: [base_dir]).
        max_dir_depth: files nested deeper than this below an included
            directory are skipped.
        ignore_patterns: objects with a ``search(filename)`` method (e.g.
            compiled regular expressions); a truthy result excludes the file.
        ignored_prefixes: a file is skipped when one of its parent directories
            below the included directory starts with any of these prefixes
            (default: ['.']).
        """
        if included_dirs is None:
            included_dirs = [base_dir]
        if ignore_patterns is None:
            ignore_patterns = []
        if ignored_prefixes is None:
            # Build the default per instance rather than sharing one mutable
            # list between every StraceLog
            ignored_prefixes = ['.']
        self.base_dir = base_dir
        self.included_dirs = included_dirs
        self.ignore_patterns = ignore_patterns
        self.max_dir_depth = max_dir_depth
        self.ignored_prefixes = ignored_prefixes
        # Exit status of the last exit_group() seen; reset by parse_trace()
        self.status = 0

    def parse_trace(self, trace_filename):
        """Parse *trace_filename* and return ``(deps, outputs)``.

        Both are sets of file names. A file that appears as both a dependency
        and an output is reported only as an output. A summary is printed to
        stdout and ``self.status`` holds the last exit_group() status seen.

        Raises the exception currently being handled if there is one and the
        log is empty (strace failed to run), otherwise RuntimeError.
        """
        # if strace failed to run, re-throw the exception
        # we can tell this happened if the file is empty
        if os.stat(trace_filename).st_size == 0:
            if sys.exc_info()[0] is not None:
                raise  # previous exception
            raise RuntimeError('strace log %r is empty' % (trace_filename,))
        self.status = 0
        processes = {}  # dictionary of processes (key = pid)
        unfinished = {}  # interrupted entries in strace log (key = pid)
        with open(trace_filename, 'rt') as fp:
            for line in fp:
                self._match_line(line, processes, unfinished)
        # collect outputs and dependencies from all processes
        deps = set()
        outputs = set()
        for process in processes.values():
            deps |= process.deps
            outputs |= process.outputs
        # If a file is in inputs and outputs remove it from inputs
        # TODO Is this a bad idea?
        deps -= outputs
        print('*'*100)
        print('Status = ', self.status)
        print('*'*43, 'Dependencies', '*'*43)
        print(list(deps))
        print('*'*45, 'Outputs', '*'*46)
        print(list(outputs))
        print('*'*100)
        return deps, outputs

    def _is_relevant(self, filename):
        """ Return True if file is in the dependency search directories.

        Relative names are resolved against the current working directory.
        Directories, files outside every included directory, files matching an
        ignore pattern and files that no longer exist are all irrelevant.
        """
        absolute_filename = os.path.abspath(filename)
        # Ignore directories
        if os.path.isdir(absolute_filename):
            return False
        # Check if the file is in one of the included directories
        is_in_search_directory = False
        for path in self.included_dirs:
            path = os.path.abspath(path)
            # Compare whole path components: "/a/b" must not match "/a/bc/x".
            # os.path.join(path, '') appends a separator without doubling it.
            if not (absolute_filename == path or
                    absolute_filename.startswith(os.path.join(path, ''))):
                continue
            rest = absolute_filename[len(path):]
            # Skip files in directories starting with an ignored prefix
            if any(os.sep+ignore_prefix in os.sep+os.path.dirname(rest)
                   for ignore_prefix in self.ignored_prefixes):
                continue
            # Skip files deeper than max_dir_depth
            if rest.count(os.sep) > self.max_dir_depth:
                continue
            is_in_search_directory = True
            break
        if not is_in_search_directory:
            return False
        # Ensure the file doesn't match any of the ignore patterns
        if any(i.search(filename) for i in self.ignore_patterns):
            return False
        # The file is only relevant if it still exists
        return os.path.lexists(filename)

    def _match_line(self, line, processes, unfinished):
        """Update *processes* (and ``self.status``) from one log *line*.

        processes: dict mapping pid (str) to StraceProcess, updated in place.
        unfinished: dict mapping pid to the first half of a syscall that was
            split into "<unfinished ...>" / "<... resumed>" lines.
        """
        # look for split lines
        unfinished_start_match = _unfinished_start_re.match(line)
        unfinished_end_match = _unfinished_end_re.match(line)
        if unfinished_start_match:
            pid = unfinished_start_match.group('pid')
            body = unfinished_start_match.group('body')
            unfinished[pid] = pid + ' ' + body
            return
        elif unfinished_end_match:
            pid = unfinished_end_match.group('pid')
            body = unfinished_end_match.group('body')
            if pid not in unfinished:
                # Looks like we need to handle an strace bug here
                # I think it is safe to ignore as I have only seen futex calls
                # which strace should not output
                print('Warning: Resume without unfinished in strace output '
                      '(strace bug?)', line.strip())
                return
            # Continue below with the reassembled call
            line = unfinished.pop(pid) + body

        if _kill_re.match(line):
            return

        is_output = False
        match = None
        open_match = _open_re.match(line)
        stat_match = _stat_re.match(line)
        execve_match = _execve_re.match(line)
        creat_match = _creat_re.match(line)
        mkdir_match = _mkdir_re.match(line)
        symlink_match = _symlink_re.match(line)
        rename_match = _rename_re.match(line)
        clone_match = _clone_re.match(line)

        if execve_match:
            pid = execve_match.group('pid')
            match = execve_match  # Executables can be dependencies
            if pid not in processes and len(processes) == 0:
                # This is the first process so create dict entry
                processes[pid] = StraceProcess()
        elif clone_match:
            pid = clone_match.group('pid')  # the new child
            pid_clone = clone_match.group('pid_clone')  # the caller
            # An empty child pid means the call failed: nothing to register
            if pid:
                if pid_clone not in processes:
                    # The caller has not been seen yet; register it the same
                    # way _matching_is_delayed does instead of raising KeyError
                    processes[pid_clone] = StraceProcess(delayed=True)
                parent_cwd = processes[pid_clone].cwd
                if pid not in processes:
                    # Simple case where there are no delayed lines
                    processes[pid] = StraceProcess(parent_cwd)
                else:
                    # Some line processing was delayed due to an interrupted
                    # clone: the child's lines arrived before this one
                    child = processes[pid]
                    child.cwd = parent_cwd  # Set the correct cwd
                    child.delayed = False  # Matching is no longer delayed
                    # Detach the queue before replaying it
                    pending, child.delayed_lines = child.delayed_lines, []
                    for delayed_line in pending:
                        self._match_line(delayed_line, processes, unfinished)
        elif open_match:
            match = open_match
            mode = match.group('mode')
            if 'O_WRONLY' in mode or 'O_RDWR' in mode:
                # it's an output file if opened for writing
                is_output = True
        elif stat_match:
            match = stat_match
        elif creat_match:
            match = creat_match
            # a created file is an output file
            is_output = True
        elif mkdir_match:
            match = mkdir_match
            if match.group('result') == '0':
                # a created directory is an output file
                # NOTE: _is_relevant() rejects directories, so this is only
                # recorded if the path is no longer a directory when checked
                is_output = True
        elif symlink_match:
            match = symlink_match
            # the created symlink is an output file
            is_output = True
        elif rename_match:
            match = rename_match
            # the destination of a rename is an output file
            is_output = True

        if match:
            name = match.group('name')
            pid = match.group('pid')
            if not self._matching_is_delayed(processes, pid, line):
                cwd = processes[pid].cwd
                if cwd != '.':
                    name = os.path.join(cwd, name)
                # Normalise path name to ensure files are only listed once
                name = os.path.normpath(name)
                # if it's an absolute path name under the build directory,
                # make it relative to base_dir before saving to .deps file.
                # The prefix ends in a separator so that a sibling such as
                # base_dir + "2" is not mistaken for a child of base_dir.
                base_prefix = os.path.join(self.base_dir, '')
                if os.path.isabs(name) and name.startswith(base_prefix):
                    name = name[len(base_prefix):]
                    name = name.lstrip(os.path.sep)
                if self._is_relevant(name):
                    if is_output:
                        processes[pid].add_output(name)
                    else:
                        processes[pid].add_dep(name)

        match = _chdir_re.match(line)
        if match:
            pid = match.group('pid')
            if not self._matching_is_delayed(processes, pid, line):
                processes[pid].cwd = os.path.join(processes[pid].cwd,
                                                  match.group('cwd'))

        match = _exit_group_re.match(line)
        if match:
            self.status = int(match.group('status'))

    def _matching_is_delayed(self, processes, pid, line):
        """Return True (and queue *line*) if *pid* must be handled later.

        An unknown pid is registered as a delayed process, so its lines are
        queued until the clone line that created it is seen.
        """
        # Check if matching is delayed and cache a delayed line
        if pid not in processes:
            processes[pid] = StraceProcess(delayed=True)
        process = processes[pid]
        if process.delayed:
            process.add_delayed_line(line)
            return True
        return False
if __name__ == '__main__':
    # Usage: <script> TRACE_LOG [BASE_DIR]
    # BASE_DIR falls back to the previously hard-coded '/repo/tmp'.
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback
        sys.exit('usage: %s TRACE_LOG [BASE_DIR]' % sys.argv[0])
    _base_dir = sys.argv[2] if len(sys.argv) > 2 else '/repo/tmp'
    StraceLog(_base_dir).parse_trace(sys.argv[1])
# Minimal workflow used to produce a trace for the parser.
# NOTE(review): the rule declares only a.txt as output yet also writes b.txt
# and c.txt -- presumably so the trace contains undeclared outputs; confirm.
rule all:
    output:
        'a.txt'
    run:
        with open('a.txt', 'wt') as fp:
            fp.write('AAA')

        with open('b.txt', 'wt') as fp:
            fp.write('BBB')

        with open('c.txt', 'wt') as fp:
            fp.write('CCC')

# Run using:
# strace -fo test.tracelog -e \
# trace='open,stat,stat64,lstat,lstat64,execve,exit_group,chdir,mkdir,rename,clone,vfork,fork,symlink,creat' \
# snakemake --force
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment