jspeed-meyers/clean_rumble_data.py

## clean_rumble_data.py
"""Clean rumble data in preparation for making public.
The latest.csv represents a concatenation of daily vulnerability
scans of image data. This script prepares that csv for making
a subset of this data open source.
"""

import pandas as pd

# Images kept in the public subset: the distroless.dev builds of php, go,
# and nginx together with their Dockerhub equivalents.
IMAGE_LIST = [
    "distroless.dev/php:latest",
    "distroless.dev/go:latest",
    "distroless.dev/nginx:latest",
    "php:latest",
    "nginx:latest",
    "golang:latest",
]

# Old distroless.dev image references mapped to their cgr.dev/chainguard names.
SUBSTITUTION_LIST = {
    "distroless.dev/php:latest": "cgr.dev/chainguard/php:latest",
    "distroless.dev/go:latest": "cgr.dev/chainguard/go:latest",
    "distroless.dev/nginx:latest": "cgr.dev/chainguard/nginx:latest",
}

# Load the concatenated scan results; "time" is parsed as datetimes so it can
# be range-filtered and later used to build the output file name.
df = pd.read_csv("latest.csv", parse_dates=["time"])

# Keep only trivy scan results.
is_trivy = df["scanner"] == "trivy"
df = df[is_trivy]

# Keep only observations inside the publication window (both comparisons are
# inclusive).
# NOTE(review): if "time" carries a time of day, the upper bound presumably
# compares against midnight of 2022-09-15, so later rows from that day would
# be left out -- confirm that this is intended.
not_before_start = df["time"] >= "2022-08-16"
not_after_end = df["time"] <= "2022-09-15"
filtered_df = df[not_before_start & not_after_end]

# Keep only the nginx, php, and go images listed above.
is_selected_image = filtered_df["image"].isin(IMAGE_LIST)
filtered_df = filtered_df[is_selected_image]

# Rename distroless.dev --> cgr.dev/chainguard, then drop two columns:
#   * "success" is only interesting for internal chainguard quality control
#   * "negligible_cve_cnt" was a grype-related column and doesn't apply to
#     trivy scans
# The substitution is not restricted to the "image" column; any cell equal to
# one of the old names is replaced.
renamed_df = filtered_df.replace(SUBSTITUTION_LIST).drop(
    columns=["success", "negligible_cve_cnt"]
)

# Export using the naming scheme rumble-{earliest-date}-{latest-date}.csv,
# where both dates come from the rows that survived the filters.
observation_times = renamed_df["time"]
earliest_date = observation_times.min().date()
latest_date = observation_times.max().date()

output_path = f"rumble-{earliest_date}-{latest_date}.csv"
renamed_df.to_csv(output_path, index=False)
	"""Clean rumble data in preparation for making public.
	The latest.csv represents a concatenation of daily vulnerability
	scans of image data. This script prepares that csv for making
	a subset of this data open source.
	"""

	import pandas as pd

	# Images kept in the public subset: the distroless.dev builds of php, go,
	# and nginx together with their Dockerhub equivalents.
	IMAGE_LIST = [
		"distroless.dev/php:latest",
		"distroless.dev/go:latest",
		"distroless.dev/nginx:latest",
		"php:latest",
		"nginx:latest",
		"golang:latest",
	]

	# Old distroless.dev image references mapped to their cgr.dev/chainguard names.
	SUBSTITUTION_LIST = {
		"distroless.dev/php:latest": "cgr.dev/chainguard/php:latest",
		"distroless.dev/go:latest": "cgr.dev/chainguard/go:latest",
		"distroless.dev/nginx:latest": "cgr.dev/chainguard/nginx:latest",
	}

	# Load the concatenated scan results; "time" is parsed as datetimes so it can
	# be range-filtered and later used to build the output file name.
	df = pd.read_csv("latest.csv", parse_dates=["time"])

	# Keep only trivy scan results.
	is_trivy = df["scanner"] == "trivy"
	df = df[is_trivy]

	# Keep only observations inside the publication window (both comparisons are
	# inclusive).
	# NOTE(review): if "time" carries a time of day, the upper bound presumably
	# compares against midnight of 2022-09-15, so later rows from that day would
	# be left out -- confirm that this is intended.
	not_before_start = df["time"] >= "2022-08-16"
	not_after_end = df["time"] <= "2022-09-15"
	filtered_df = df[not_before_start & not_after_end]

	# Keep only the nginx, php, and go images listed above.
	is_selected_image = filtered_df["image"].isin(IMAGE_LIST)
	filtered_df = filtered_df[is_selected_image]

	# Rename distroless.dev --> cgr.dev/chainguard, then drop two columns:
	#   * "success" is only interesting for internal chainguard quality control
	#   * "negligible_cve_cnt" was a grype-related column and doesn't apply to
	#     trivy scans
	# The substitution is not restricted to the "image" column; any cell equal to
	# one of the old names is replaced.
	renamed_df = filtered_df.replace(SUBSTITUTION_LIST).drop(
		columns=["success", "negligible_cve_cnt"]
	)

	# Export using the naming scheme rumble-{earliest-date}-{latest-date}.csv,
	# where both dates come from the rows that survived the filters.
	observation_times = renamed_df["time"]
	earliest_date = observation_times.min().date()
	latest_date = observation_times.max().date()

	output_path = f"rumble-{earliest_date}-{latest_date}.csv"
	renamed_df.to_csv(output_path, index=False)