wasdee/gen_resource_log.sh

## gen_resource_log.sh
#!/usr/bin/env bash
# Record batch-mode `top` output into a timestamped log file in the current
# directory.
#
# Usage: gen_resource_log.sh [ITERATIONS]
#   ITERATIONS  number of `top` iterations to record (default: 120)
#
# The iteration count is embedded in the file name
# ("top-<ITERATIONS>iters-<timestamp>.txt"). plot.py's parse_log() reads the
# count back from the name and compares it with the number of iterations it
# finds in the file, so the name and the -n argument must always agree; both
# are therefore derived from the same variable.
set -euo pipefail

iters="${1:-120}"
if ! [[ "$iters" =~ ^[1-9][0-9]*$ ]]; then
    echo "usage: ${0##*/} [ITERATIONS]  (ITERATIONS must be a positive integer)" >&2
    exit 2
fi

now=$(date +"%Y_%m_%d-%H_%M_%S")
# Assuming top's default delay of 3 seconds between iterations, the default
# of 120 iterations takes about 120*3 seconds = 6 minutes.
top -b -n "$iters" > "top-${iters}iters-$now.txt"

## plot.py
"""
parse top output and plot the resource usage
"""

import re
from dataclasses import dataclass
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


@dataclass
class Position:
    """Character offsets locating one ``top`` iteration inside the log text."""

    # offset at which the iteration's "top - ..." summary block begins
    start: int
    # offset just past the column-header line; the process rows start here
    end_header: int
    # offset at which the iteration ends; None until parse_log fills it in
    end: Optional[int] = None


@dataclass
class TopIter:
    """One iteration (one screenful) of ``top -b`` output.

    Attributes:
        system_wide: The summary lines that precede the process table; must
            begin with ``top - HH:MM:SS``.
        header: The column-header line (``PID ... COMMAND``). Its leading
            whitespace must be kept, because column offsets are measured
            on this string.
        positions: Character offsets of this iteration inside the log text.
        data: The raw process rows that follow the header.
        df: The parsed process table; set by :meth:`generate_dataframe`.
    """

    system_wide: str
    header: str
    positions: Position
    data: str = ""
    df: Optional[pd.DataFrame] = None

    @property
    def time(self):
        """Return the time of day at which this iteration was taken.

        The ``top`` banner only carries ``HH:MM:SS``, so the returned
        ``datetime`` has ``strptime``'s default date (1900-01-01).

        Raises:
            ValueError: If ``system_wide`` does not start with
                ``top - HH:MM:SS``.
        """
        banner = re.match(r"top - (\d+:\d+:\d+)", self.system_wide)
        if banner is None:
            # Subscripting None would raise an opaque TypeError instead.
            raise ValueError(
                "system_wide does not start with 'top - HH:MM:SS': "
                f"{self.system_wide[:40]!r}"
            )
        return datetime.strptime(banner[1], "%H:%M:%S")

    def generate_dataframe(self):
        """Parse ``data`` into a DataFrame using fixed-width columns.

        The column boundaries are derived from where each column name sits
        in ``header`` and are then widened so that values wider than the
        column name are still captured.

        Returns:
            The parsed DataFrame (also stored in ``self.df``): one column per
            header name plus a ``"Time"`` column holding :attr:`time`.

        Raises:
            ValueError: If ``data`` is empty, or if a column name cannot be
                found in ``header`` surrounded by spaces.
        """
        if self.data == "":
            raise ValueError("data is empty")

        colnames = self.header.split()

        # Start offset of every column name inside the header line. Names are
        # searched with surrounding spaces so that e.g. "S" does not match
        # inside another name; COMMAND is searched without a trailing space.
        colspecs = [
            self.header.index(f" {value} ") + 1
            if value != "COMMAND"
            else self.header.index(f" {value}") + 1
            for value in colnames
        ]
        # [start, end) of each column *name*; lists, because they are mutated
        # in place below.
        colspecs = [[cs, cs + len(colname)] for cs, colname in zip(colspecs, colnames)]

        # The first column reaches back to the start of the line and the last
        # column runs (effectively) to the end of the line.
        colspecs[0][0] = 0
        colspecs[-1][1] = 999

        # Widen every inner column towards the neighbour its text may grow
        # into, leaving a one-character border. One letter per column:
        # "R" pulls the start left to just after the previous column,
        # "L" pushes the end right to just before the next column.
        # NOTE(review): the string is hard-coded for a 12-column header; with
        # a different column count zip() silently stops at the shorter input.
        col_text_side = "RLRRRRRRRRRL"
        for a, b, c, side in zip(
            colspecs[:-2], colspecs[1:-1], colspecs[2:], col_text_side[1:-1]
        ):
            # Order matters: `a` may already have been widened by the
            # previous step of this loop.
            if side == "R":
                b[0] = a[1] + 1
            elif side == "L":
                b[1] = c[0] - 1

        # Columns still narrower than 3 characters get one more character on
        # their left edge (the code moves the start, not the end).
        for i, (a, b) in enumerate(colspecs):
            if b - a < 3:
                colspecs[i] = [a - 1, b]

        # read_fwf expects (start, end) tuples
        colspecs = [tuple(spec) for spec in colspecs]

        df = pd.read_fwf(StringIO(self.data), colspecs=colspecs, names=colnames)

        df["Time"] = self.time
        self.df = df
        return df


def parse_log(filepath):
    """Parse a ``top -b`` log file into one DataFrame.

    The file name must look like ``top-<N>iter...``; ``<N>`` is the number of
    ``top`` iterations the file is expected to contain and is checked against
    the number actually found.

    Args:
        filepath: Path of the log file (``str`` or ``Path``).

    Returns:
        A DataFrame with the process rows of every iteration concatenated;
        each row carries the ``"Time"`` of its iteration.

    Raises:
        ValueError: If the file name does not encode an iteration count, if
            the file contains no iterations, or if the number of iterations
            found differs from the count in the file name.
    """
    filepath = Path(filepath)
    with filepath.open() as f:
        output = f.read()

    top_iters: list[TopIter] = []
    # Match the 6-line system-wide summary and the column-header line; the
    # process rows in between two matches are sliced out afterwards.
    pattern = re.compile(r"(top - (.*\n){6})( +PID.*COMMAND)\n")
    for match in pattern.finditer(output):
        top_iters.append(
            TopIter(
                match[1].strip(),
                match[3],
                positions=Position(match.start(0), match.end(3)),
            )
        )

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    name_match = re.match(r"top-(\d+)iter.*", filepath.stem)
    if name_match is None:
        raise ValueError(
            f"{filepath.name!r}: file name does not match 'top-<N>iter...'"
        )
    expect_n_iter = int(name_match[1])
    if len(top_iters) != expect_n_iter:
        raise ValueError(
            f"{filepath.name!r}: expected {expect_n_iter} top iterations, "
            f"found {len(top_iters)}"
        )
    if not top_iters:
        raise ValueError(f"{filepath.name!r}: no top iterations found")

    # Each iteration ends where the next one starts; the last one runs to
    # the end of the file.
    for a, b in zip(top_iters[:-1], top_iters[1:]):
        a.positions.end = b.positions.start
    top_iters[-1].positions.end = len(output)

    # The process rows are everything between the header and the end.
    for iter_ in top_iters:
        iter_.data = output[iter_.positions.end_header : iter_.positions.end]

    dfs = [iter_.generate_dataframe() for iter_ in top_iters]
    return pd.concat(dfs, ignore_index=True)


def plot(df, top_n=7):
    """Plot %CPU and %MEM over time for the commands using the most CPU.

    Commands are ranked by the product of ``(1 + %CPU)`` over all of their
    samples. This is not a geometric mean: it is not normalised by the number
    of samples. The ranking is computed as a sum of ``log1p(%CPU)``, which
    orders commands the same way as the product but cannot overflow to
    ``inf`` on long logs or high multi-core load.

    Args:
        df: DataFrame with ``"COMMAND"``, ``"%CPU"``, ``"%MEM"`` and
            ``"Time"`` columns.
        top_n: Number of highest-ranked commands to plot (default 7).
    """
    # rank the commands by cumulative CPU usage (log space, see docstring)
    ranking = (
        df.groupby(["COMMAND"])
        .agg({"%CPU": lambda x: np.log1p(x).sum()})
        .sort_values(["%CPU"], ascending=False)
    )
    top_commands = ranking.index[:top_n]

    # sort by time so every line is drawn from left to right
    df = df.sort_values(["Time"])

    # two stacked panels sharing the time axis: CPU on top, memory below
    _, axs = plt.subplots(2, 1, sharex=True, figsize=(20, 10))
    cpu_ax, mem_ax = axs
    for command in top_commands:
        rows = df[df["COMMAND"] == command]
        cpu_ax.plot(rows["Time"], rows["%CPU"], label=command)
        mem_ax.plot(rows["Time"], rows["%MEM"], label=command)

    cpu_ax.legend()
    mem_ax.legend()
    cpu_ax.set_ylabel("CPU(%)")
    mem_ax.set_ylabel("MEM(%)")
    mem_ax.set_xlabel("Time")
    plt.show()


if __name__ == "__main__":
    # Parse every top log in the current directory and plot them together.
    # Only files following the "top-<N>iter..." naming that parse_log reads
    # the iteration count from are picked up; other *.txt files cannot be
    # parsed by it. Sorting makes the processing order deterministic.
    log_paths = sorted(Path(".").glob("top-*iter*.txt"))
    if not log_paths:
        # pd.concat([]) would otherwise fail with "No objects to concatenate"
        raise SystemExit("no top-*iter*.txt log files found in the current directory")

    df = pd.concat([parse_log(path) for path in log_paths], ignore_index=True)
    plot(df)
	#!/usr/bin/env bash
	# Record batch-mode `top` output into a timestamped log file in the current
	# directory.
	#
	# Usage: gen_resource_log.sh [ITERATIONS]
	#   ITERATIONS  number of `top` iterations to record (default: 120)
	#
	# The iteration count is embedded in the file name
	# ("top-<ITERATIONS>iters-<timestamp>.txt"). parse_log() in the Python
	# script reads the count back from the name and compares it with the number
	# of iterations found in the file, so the name and the -n argument must
	# always agree; both are therefore derived from the same variable.
	set -euo pipefail

	iters="${1:-120}"
	if ! [[ "$iters" =~ ^[1-9][0-9]*$ ]]; then
	    echo "usage: ${0##*/} [ITERATIONS]  (ITERATIONS must be a positive integer)" >&2
	    exit 2
	fi

	now=$(date +"%Y_%m_%d-%H_%M_%S")
	# Assuming top's default delay of 3 seconds between iterations, the default
	# of 120 iterations takes about 120*3 seconds = 6 minutes.
	top -b -n "$iters" > "top-${iters}iters-$now.txt"
	"""
	parse top output and plot the resource usage
	"""

	import re
	from dataclasses import dataclass
	from datetime import datetime
	from io import StringIO
	from pathlib import Path
	from typing import Optional

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd


	@dataclass
	class Position:
	    """Character offsets locating one ``top`` iteration inside the log text."""

	    # offset at which the iteration's "top - ..." summary block begins
	    start: int
	    # offset just past the column-header line; the process rows start here
	    end_header: int
	    # offset at which the iteration ends; None until parse_log fills it in
	    end: Optional[int] = None


	@dataclass
	class TopIter:
	    """One iteration (one screenful) of ``top -b`` output.

	    Attributes:
	        system_wide: The summary lines that precede the process table; must
	            begin with ``top - HH:MM:SS``.
	        header: The column-header line (``PID ... COMMAND``). Its leading
	            whitespace must be kept, because column offsets are measured
	            on this string.
	        positions: Character offsets of this iteration inside the log text.
	        data: The raw process rows that follow the header.
	        df: The parsed process table; set by :meth:`generate_dataframe`.
	    """

	    system_wide: str
	    header: str
	    positions: Position
	    data: str = ""
	    df: Optional[pd.DataFrame] = None

	    @property
	    def time(self):
	        """Return the time of day at which this iteration was taken.

	        The ``top`` banner only carries ``HH:MM:SS``, so the returned
	        ``datetime`` has ``strptime``'s default date (1900-01-01).

	        Raises:
	            ValueError: If ``system_wide`` does not start with
	                ``top - HH:MM:SS``.
	        """
	        banner = re.match(r"top - (\d+:\d+:\d+)", self.system_wide)
	        if banner is None:
	            # Subscripting None would raise an opaque TypeError instead.
	            raise ValueError(
	                "system_wide does not start with 'top - HH:MM:SS': "
	                f"{self.system_wide[:40]!r}"
	            )
	        return datetime.strptime(banner[1], "%H:%M:%S")

	    def generate_dataframe(self):
	        """Parse ``data`` into a DataFrame using fixed-width columns.

	        The column boundaries are derived from where each column name sits
	        in ``header`` and are then widened so that values wider than the
	        column name are still captured.

	        Returns:
	            The parsed DataFrame (also stored in ``self.df``): one column per
	            header name plus a ``"Time"`` column holding :attr:`time`.

	        Raises:
	            ValueError: If ``data`` is empty, or if a column name cannot be
	                found in ``header`` surrounded by spaces.
	        """
	        if self.data == "":
	            raise ValueError("data is empty")

	        colnames = self.header.split()

	        # Start offset of every column name inside the header line. Names are
	        # searched with surrounding spaces so that e.g. "S" does not match
	        # inside another name; COMMAND is searched without a trailing space.
	        colspecs = [
	            self.header.index(f" {value} ") + 1
	            if value != "COMMAND"
	            else self.header.index(f" {value}") + 1
	            for value in colnames
	        ]
	        # [start, end) of each column *name*; lists, because they are mutated
	        # in place below.
	        colspecs = [[cs, cs + len(colname)] for cs, colname in zip(colspecs, colnames)]

	        # The first column reaches back to the start of the line and the last
	        # column runs (effectively) to the end of the line.
	        colspecs[0][0] = 0
	        colspecs[-1][1] = 999

	        # Widen every inner column towards the neighbour its text may grow
	        # into, leaving a one-character border. One letter per column:
	        # "R" pulls the start left to just after the previous column,
	        # "L" pushes the end right to just before the next column.
	        # NOTE(review): the string is hard-coded for a 12-column header; with
	        # a different column count zip() silently stops at the shorter input.
	        col_text_side = "RLRRRRRRRRRL"
	        for a, b, c, side in zip(
	            colspecs[:-2], colspecs[1:-1], colspecs[2:], col_text_side[1:-1]
	        ):
	            # Order matters: `a` may already have been widened by the
	            # previous step of this loop.
	            if side == "R":
	                b[0] = a[1] + 1
	            elif side == "L":
	                b[1] = c[0] - 1

	        # Columns still narrower than 3 characters get one more character on
	        # their left edge (the code moves the start, not the end).
	        for i, (a, b) in enumerate(colspecs):
	            if b - a < 3:
	                colspecs[i] = [a - 1, b]

	        # read_fwf expects (start, end) tuples
	        colspecs = [tuple(spec) for spec in colspecs]

	        df = pd.read_fwf(StringIO(self.data), colspecs=colspecs, names=colnames)

	        df["Time"] = self.time
	        self.df = df
	        return df


	def parse_log(filepath):
	    """Parse a ``top -b`` log file into one DataFrame.

	    The file name must look like ``top-<N>iter...``; ``<N>`` is the number of
	    ``top`` iterations the file is expected to contain and is checked against
	    the number actually found.

	    Args:
	        filepath: Path of the log file (``str`` or ``Path``).

	    Returns:
	        A DataFrame with the process rows of every iteration concatenated;
	        each row carries the ``"Time"`` of its iteration.

	    Raises:
	        ValueError: If the file name does not encode an iteration count, if
	            the file contains no iterations, or if the number of iterations
	            found differs from the count in the file name.
	    """
	    filepath = Path(filepath)
	    with filepath.open() as f:
	        output = f.read()

	    top_iters: list[TopIter] = []
	    # Match the 6-line system-wide summary and the column-header line; the
	    # process rows in between two matches are sliced out afterwards.
	    # The `*` quantifiers are essential: without them the pattern only
	    # matches single-character lines and never matches real top output.
	    pattern = re.compile(r"(top - (.*\n){6})( +PID.*COMMAND)\n")
	    for match in pattern.finditer(output):
	        top_iters.append(
	            TopIter(
	                match[1].strip(),
	                match[3],
	                positions=Position(match.start(0), match.end(3)),
	            )
	        )

	    # Explicit checks instead of `assert`, which is stripped under `python -O`.
	    name_match = re.match(r"top-(\d+)iter.*", filepath.stem)
	    if name_match is None:
	        raise ValueError(
	            f"{filepath.name!r}: file name does not match 'top-<N>iter...'"
	        )
	    expect_n_iter = int(name_match[1])
	    if len(top_iters) != expect_n_iter:
	        raise ValueError(
	            f"{filepath.name!r}: expected {expect_n_iter} top iterations, "
	            f"found {len(top_iters)}"
	        )
	    if not top_iters:
	        raise ValueError(f"{filepath.name!r}: no top iterations found")

	    # Each iteration ends where the next one starts; the last one runs to
	    # the end of the file.
	    for a, b in zip(top_iters[:-1], top_iters[1:]):
	        a.positions.end = b.positions.start
	    top_iters[-1].positions.end = len(output)

	    # The process rows are everything between the header and the end.
	    for iter_ in top_iters:
	        iter_.data = output[iter_.positions.end_header : iter_.positions.end]

	    dfs = [iter_.generate_dataframe() for iter_ in top_iters]
	    return pd.concat(dfs, ignore_index=True)


	def plot(df, top_n=7):
	    """Plot %CPU and %MEM over time for the commands using the most CPU.

	    Commands are ranked by the product of ``(1 + %CPU)`` over all of their
	    samples. This is not a geometric mean: it is not normalised by the number
	    of samples. The ranking is computed as a sum of ``log1p(%CPU)``, which
	    orders commands the same way as the product but cannot overflow to
	    ``inf`` on long logs or high multi-core load.

	    Args:
	        df: DataFrame with ``"COMMAND"``, ``"%CPU"``, ``"%MEM"`` and
	            ``"Time"`` columns.
	        top_n: Number of highest-ranked commands to plot (default 7).
	    """
	    # rank the commands by cumulative CPU usage (log space, see docstring)
	    ranking = (
	        df.groupby(["COMMAND"])
	        .agg({"%CPU": lambda x: np.log1p(x).sum()})
	        .sort_values(["%CPU"], ascending=False)
	    )
	    top_commands = ranking.index[:top_n]

	    # sort by time so every line is drawn from left to right
	    df = df.sort_values(["Time"])

	    # two stacked panels sharing the time axis: CPU on top, memory below
	    _, axs = plt.subplots(2, 1, sharex=True, figsize=(20, 10))
	    cpu_ax, mem_ax = axs
	    for command in top_commands:
	        rows = df[df["COMMAND"] == command]
	        cpu_ax.plot(rows["Time"], rows["%CPU"], label=command)
	        mem_ax.plot(rows["Time"], rows["%MEM"], label=command)

	    cpu_ax.legend()
	    mem_ax.legend()
	    cpu_ax.set_ylabel("CPU(%)")
	    mem_ax.set_ylabel("MEM(%)")
	    mem_ax.set_xlabel("Time")
	    plt.show()


	if __name__ == "__main__":
	    # Parse every top log in the current directory and plot them together.
	    # Only files following the "top-<N>iter..." naming that parse_log reads
	    # the iteration count from are picked up; other *.txt files cannot be
	    # parsed by it. Sorting makes the processing order deterministic.
	    log_paths = sorted(Path(".").glob("top-*iter*.txt"))
	    if not log_paths:
	        # pd.concat([]) would otherwise fail with "No objects to concatenate"
	        raise SystemExit("no top-*iter*.txt log files found in the current directory")

	    df = pd.concat([parse_log(path) for path in log_paths], ignore_index=True)
	    plot(df)