Created
October 5, 2023 15:45
-
-
Save flaviut/8efa404088f658ec509baba808c7d4e1 to your computer and use it in GitHub Desktop.
Parses an strace output file to determine how long each file spends open, and makes a treemap. This may (depending on your program) be equivalent to how long it takes to read each file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# use with `strace --absolute-timestamps=format:unix,precision:us -o strace.log yourprogram`. | |
import re | |
import csv | |
import os | |
from collections import defaultdict | |
import plotly.express as px | |
def extract_data_from_log(log_lines):
    """Pair each successful openat() with its matching close() and compute
    how long the file descriptor stayed open.

    Args:
        log_lines: iterable of strace output lines (timestamps first, as
            produced by --absolute-timestamps=format:unix,precision:us).

    Returns:
        list of (time_taken_ms, filename) tuples. Opens that are never
        closed within the log are dropped; failed opens are never recorded
        because strace prints "= -1 E..." for them, which the fd-capturing
        regex rejects.
    """
    openat_re = re.compile(r'(\d+\.\d+) openat\(AT_FDCWD, "(.+)", .+\) = (\d+)')
    close_re = re.compile(r'(\d+\.\d+) close\((\d+)\)')

    pending = {}    # fd -> (open timestamp, filename), awaiting a close()
    durations = []

    for line in log_lines:
        m = openat_re.match(line)
        if m is not None:
            ts, name, fd = m.groups()
            pending[fd] = (ts, name)
            continue

        m = close_re.match(line)
        if m is None:
            continue
        close_ts, fd = m.groups()
        # Ignore closes of fds we never saw opened (e.g. inherited fds).
        opened = pending.pop(fd, None)
        if opened is not None:
            open_ts, name = opened
            durations.append(((float(close_ts) - float(open_ts)) * 1000, name))

    return durations
# Parse the strace log (path hardcoded; see the usage comment at the top of
# the file for how to produce it). The file object is iterated line-by-line.
with open('./strace.log') as log_lines:
    extracted_data = extract_data_from_log(log_lines)
def write_to_csv(data, filename):
    """Write (timeTakenMs, filename) rows to *filename* as CSV.

    A header row is emitted first; each element of *data* becomes one row.
    """
    # newline='' is required by the csv module to avoid doubled line endings.
    with open(filename, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(["timeTakenMs", "filename"])
        writer.writerows(data)
# Persist the raw (time_taken_ms, filename) pairs for ad-hoc inspection.
write_to_csv(extracted_data, './processed.csv')
# Normalize each path, then accumulate every file's open-duration onto each
# of its ancestor subpaths so the treemap has a value at every hierarchy level
# (e.g. "/a/b/c" contributes to "", "a", "a/b" and "a/b/c" for absolute paths).
normalized_data = [(duration, os.path.normpath(p)) for duration, p in extracted_data]

aggregated_times = defaultdict(float)
for duration, path in normalized_data:
    prefix = None
    for component in path.split('/'):
        # First component stands alone; later ones are joined onto the prefix.
        prefix = component if prefix is None else os.path.join(prefix, component)
        aggregated_times[prefix] += duration
# Flatten the aggregated hierarchy into the parallel lists plotly expects:
# each node's parent is its dirname, and the root points at the empty string.
labels = []
parents = []
values = []
for node, total_ms in aggregated_times.items():
    parent_dir = os.path.dirname(node)
    if parent_dir == node:
        # os.path.dirname('/') == '/': treat the root as having no parent.
        parent_dir = ""
    labels.append(node)
    parents.append(parent_dir)
    values.append(total_ms)
# Render the per-path timings as an interactive treemap in the browser.
fig = px.treemap(names=labels, parents=parents, values=values)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})
fig.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment