Created
September 25, 2022 22:49
-
-
Save jspeed-meyers/6b6540cd2fd7962622591a8f4b061388 to your computer and use it in GitHub Desktop.
Clean rumble latest.csv for making data public
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Clean rumble data in preparation for making public. | |
The latest.csv represents a concatentation of daily vulnerability | |
scans of image data. This script prepares that csv for making | |
a subset of this data open source. | |
""" | |
import pandas as pd | |
df = pd.read_csv("latest.csv", parse_dates=["time"]) | |
# Filter in only trivy scan results | |
df = df[df["scanner"] == "trivy"] | |
# Filter in observations between certain dates | |
filtered_df = df[(df["time"] >= "2022-08-16") & (df["time"] <= "2022-09-15")] | |
# filter in only nginx, php, and go images | |
# (both chainguard version and Dockerhub equivalent) | |
IMAGE_LIST = [ | |
"distroless.dev/php:latest", | |
"distroless.dev/go:latest", | |
"distroless.dev/nginx:latest", | |
"php:latest", | |
"nginx:latest", | |
"golang:latest", | |
] | |
filtered_df = filtered_df[filtered_df["image"].isin(IMAGE_LIST)] | |
# rename distroless.dev --> cgr.dev/chainguard | |
SUBSTITUTION_LIST = { | |
"distroless.dev/php:latest": "cgr.dev/chainguard/php:latest", | |
"distroless.dev/go:latest": "cgr.dev/chainguard/go:latest", | |
"distroless.dev/nginx:latest": "cgr.dev/chainguard/nginx:latest", | |
} | |
renamed_df = filtered_df.replace(SUBSTITUTION_LIST) | |
# drop "success" column since that is only interesting for | |
# internal chainguard quality control purposes | |
# drop negligible_cve_cnt since that was a grype-related column and | |
# doesn't apply to trivy scans | |
renamed_df = renamed_df.drop(columns=["success", "negligible_cve_cnt"]) | |
# export data | |
# use naming scheme of rumble-{earliest-date}-{latest-date}.csv | |
earliest_date = renamed_df["time"].min().date() | |
latest_date = renamed_df["time"].max().date() | |
renamed_df.to_csv(f"rumble-{earliest_date}-{latest_date}.csv", index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment