John Speed Meyers jspeed-meyers

jspeed-meyers / pypi_repo_deps2repos_output.txt
Created May 11, 2023 19:59
Command and output of running deps2repos to convert top 1000 Python packages to a list of GitHub repos
(deps2repos) ➜ deps2repos git:(main) ✗ python main.py --no_deps --python top_1000_pypi_packages.txt
WARNING: Some of the packages found on PyPI do not have GitHubs:
protobuf
cffi
docutils
grpcio-status
beautifulsoup4
openpyxl
et-xmlfile
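The warning above lists packages for which deps2repos could not find a GitHub repository in their PyPI metadata. As an illustrative sketch (not deps2repos's actual logic), a lookup like this has to scan a package's `home_page` and `project_urls` fields from the standard PyPI JSON API:

```python
import json
import urllib.request

def github_url_from_metadata(metadata):
    """Scan a PyPI JSON metadata dict for a GitHub URL.

    Checks home_page first, then project_urls. Illustrative only;
    the real deps2repos matching rules may differ.
    """
    info = metadata.get("info", {})
    candidates = [info.get("home_page") or ""]
    candidates.extend((info.get("project_urls") or {}).values())
    for url in candidates:
        if url and "github.com" in url:
            return url
    return None

def lookup_package(name):
    """Fetch metadata for one package from the PyPI JSON API."""
    with urllib.request.urlopen(f"https://pypi.org/pypi/{name}/json") as resp:
        return json.loads(resp.read().decode())
```

Packages such as protobuf or cffi end up in the warning list when neither field points at github.com.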
jspeed-meyers / top_1000_pypi_packages.txt
Created May 10, 2023 20:17
A list of the top 1000 PyPI packages, collected as of May 2023.
boto3
urllib3
requests
botocore
charset-normalizer
idna
certifi
setuptools
s3transfer
python-dateutil
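The file is plain text, one package name per line, which is the format the deps2repos command above consumes. A minimal loader for such a list might look like:

```python
def load_package_list(text):
    """Parse a newline-delimited package list, skipping blank lines."""
    return [line.strip() for line in text.splitlines() if line.strip()]

# Typical use: load_package_list(open("top_1000_pypi_packages.txt").read())
```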
jspeed-meyers / get_top_pypi_packages.py
Created May 10, 2023 20:15
A script to collect the names of the most downloaded python packages
# Script to retrieve the most downloaded packages on the Python Package Index.
# (ChatGPT drafted parts of this; I edited it. Not redundant yet.)
import json
import urllib.request
def get_top_packages(top_n=1000):
"""Identify top packages by download count on pypi.
jspeed-meyers / create_docker_image_distribution_dataset.py
Created October 24, 2022 15:11
Identify the OS distribution of each Docker image in a list
"""Create docker image distribution dataset."""
import csv
import logging
import re
import subprocess
# potential os locations for distribution data
# info on os-release: https://www.freedesktop.org/software/systemd/man/os-release.html
LOCATIONS = [
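The LOCATIONS list is truncated in the snippet. A sketch of the idea, assuming the two systemd-standard os-release paths and a hypothetical helper that reads them out of an image with `docker run` (the original gist's exact approach is not shown):

```python
import subprocess

# Candidate paths for distribution data. /etc/os-release is the
# systemd-standard location; /usr/lib/os-release is its fallback.
LOCATIONS = ["/etc/os-release", "/usr/lib/os-release"]

def parse_os_release(text):
    """Parse os-release KEY=VALUE lines into a dict, stripping quotes."""
    fields = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        fields[key] = value.strip().strip('"')
    return fields

def image_distribution(image):
    """Hypothetical helper: cat os-release out of a container image."""
    for path in LOCATIONS:
        result = subprocess.run(
            ["docker", "run", "--rm", "--entrypoint", "cat", image, path],
            capture_output=True, text=True,
        )
        if result.returncode == 0:
            return parse_os_release(result.stdout).get("ID", "unknown")
    return "unknown"
```

The `ID` field (e.g. `alpine`, `debian`, `ubuntu`) is the machine-readable distribution name defined by the os-release spec.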
jspeed-meyers / create_top_docker_image_dataset.py
Created October 24, 2022 15:08
Create dataset of top dockerhub images by popularity
"""Create csv of top dockerhub images by popularity.
Part of the dark matter/darkfiles/diffbom analysis
Help from this SO post: https://stackoverflow.com/questions/43426746/api-to-get-top-docker-hub-images
Created by: John Speed Meyers (jsmeyers@chainguard.dev)
"""
import csv
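The snippet ends after the imports. A sketch of how the Docker Hub query could proceed, based on the v2 repositories endpoint described in the linked SO post; the `page_size` and `ordering` parameters are assumptions about what the original script requested:

```python
import csv
import json
import urllib.request

# Endpoint per the referenced Stack Overflow post; parameters assumed.
HUB_URL = (
    "https://hub.docker.com/v2/repositories/library/"
    "?page_size=100&ordering=pull_count"
)

def extract_repos(payload):
    """Flatten one Docker Hub API page into (name, pull_count) rows."""
    return [(r["name"], r["pull_count"]) for r in payload["results"]]

def write_csv(rows, path="top_docker_images.csv"):
    """Write (image, pull_count) rows to a CSV with a header."""
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["image", "pull_count"])
        writer.writerows(rows)

def fetch_top_images():
    with urllib.request.urlopen(HUB_URL) as resp:
        return extract_repos(json.loads(resp.read().decode()))
```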
jspeed-meyers / calculate_cve_reduction.py
Created September 25, 2022 22:51
For rumble data, calculate the CVE reduction percentage
"""Calculate percentage reduction in cve's by image
Contact John Speed Meyers or Josh Dolitsky for further information.
"""
import pandas as pd
df = pd.read_csv("rumble-2022-08-16-2022-09-14.csv")
IMAGE_LIST = [
["cgr.dev/chainguard/php:latest", "php:latest"],
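IMAGE_LIST is cut off after the first Chainguard/upstream pair. The core calculation the gist title describes can be sketched like this; the `image` and `cve_count` column names are assumptions, since the real rumble CSV schema is not shown:

```python
import pandas as pd

def cve_reduction(df, chainguard_image, upstream_image):
    """Percentage CVE reduction of chainguard_image vs. upstream_image.

    Assumes a long-format frame with 'image' and 'cve_count' columns;
    these names are placeholders for the real rumble schema.
    """
    counts = df.groupby("image")["cve_count"].sum()
    upstream = counts[upstream_image]
    reduced = counts[chainguard_image]
    return 100 * (upstream - reduced) / upstream
```

For example, an upstream image with 50 CVEs against a Chainguard image with 5 yields a 90% reduction.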
jspeed-meyers / clean_rumble_data.py
Created September 25, 2022 22:49
Clean rumble latest.csv for making data public
"""Clean rumble data in preparation for making public.
The latest.csv represents a concatenation of daily vulnerability
scans of image data. This script prepares that csv for making
a subset of this data open source.
"""
import pandas as pd
df = pd.read_csv("latest.csv", parse_dates=["time"])
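The cleaning steps themselves are not shown. One plausible sketch, dropping assumed internal-only columns and keeping only the newest scan per image (both the column names and the dedup rule are guesses at the gist's intent):

```python
import pandas as pd

# Columns assumed internal-only; placeholders for the real schema.
INTERNAL_COLUMNS = ["customer_id", "internal_notes"]

def clean(df):
    """Drop internal columns and keep only the newest scan per image."""
    df = df.drop(columns=[c for c in INTERNAL_COLUMNS if c in df.columns])
    df = df.sort_values("time").drop_duplicates("image", keep="last")
    return df.reset_index(drop=True)
```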
jspeed-meyers / calculate_attack_surface_reduction.py
Created September 20, 2022 19:15
Calculate attack surface reduction percentage for pairs of container images.
"""Calculate attack surface reduction percentage for pairs of container images.
This script calculates the number of packages present in each image and then
calculates the reduction in "attack surface."
Note: Must install syft (https://github.com/anchore/syft) to use.
Author: John Speed Meyers (jsmeyers@chainguard.dev)
"""
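The body of the script is truncated after the docstring. A sketch of the described approach, counting packages per image with syft's JSON output and computing the reduction; the pipeline details are assumptions beyond what the docstring states:

```python
import json
import subprocess

def count_packages(image):
    """Count packages in an image using syft's JSON output.

    Requires syft (https://github.com/anchore/syft) on PATH.
    """
    result = subprocess.run(
        ["syft", image, "-o", "json"],
        capture_output=True, text=True, check=True,
    )
    return len(json.loads(result.stdout)["artifacts"])

def reduction_percentage(baseline_count, reduced_count):
    """Percentage reduction in package count between two images."""
    return 100 * (baseline_count - reduced_count) / baseline_count
```

Fewer installed packages means fewer components that can carry vulnerabilities, which is the "attack surface" the title refers to.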
jspeed-meyers / deps_dev_retrieve_most_depended_upon_packages.sql
Created July 9, 2022 11:13
Measure the number of dependencies for each version of the most depended-upon packages, using deps.dev data (SQL query)
DECLARE LatestSnapshot TIMESTAMP;
SET LatestSnapshot = (SELECT MAX(Time) FROM `bigquery-public-data.deps_dev_v1.Snapshots`);
WITH
-- Releases includes every release of every package.
Releases AS (
SELECT
System,
Name,
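The SQL is cut off mid-query. Such queries are typically run from Python via the google-cloud-bigquery client; below is a sketch that assembles an illustrative query against the same public dataset. The aggregation is a guess at the gist's intent, not its exact query:

```python
def build_query(system="NPM", limit=100):
    """Assemble an illustrative deps.dev dependency-count query.

    Table names follow the public bigquery-public-data.deps_dev_v1
    dataset; the SELECT body is a sketch, not the original query.
    """
    return f"""
    DECLARE LatestSnapshot TIMESTAMP;
    SET LatestSnapshot = (
      SELECT MAX(Time) FROM `bigquery-public-data.deps_dev_v1.Snapshots`);
    SELECT Name, Version, COUNT(*) AS DependencyCount
    FROM `bigquery-public-data.deps_dev_v1.Dependencies`
    WHERE SnapshotAt = LatestSnapshot AND System = '{system}'
    GROUP BY Name, Version
    ORDER BY DependencyCount DESC
    LIMIT {limit}
    """
```

To execute it you would pass the string to `google.cloud.bigquery.Client().query(...)`, which requires GCP credentials.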
jspeed-meyers / create-scorecards-histogram.py
Created July 9, 2022 00:39
Analyze scorecards data and create a histogram
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("csv/FILENAME.csv")  # path to the scorecards results CSV
# create plot
fig, ax = plt.subplots(figsize=(6,4)) # size of sub-figures
n, _, _ = plt.hist(df.score, bins=[i/4 for i in range(0, 40)])
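The snippet stops before labeling or saving the figure. A self-contained sketch completing it; note the original bin edges (`range(0, 40)`) stop at 9.75, so this version extends them to 10.0 on the assumption that Scorecard scores run 0 to 10:

```python
import matplotlib
matplotlib.use("Agg")  # render off-screen; no display needed
import matplotlib.pyplot as plt

def plot_score_histogram(scores, out_path="scorecards_histogram.png"):
    """Histogram of OpenSSF Scorecard scores in quarter-point bins."""
    fig, ax = plt.subplots(figsize=(6, 4))
    bins = [i / 4 for i in range(0, 41)]  # 0.0 to 10.0 in 0.25 steps
    n, _, _ = ax.hist(scores, bins=bins)
    ax.set_xlabel("Scorecard score")
    ax.set_ylabel("Number of projects")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    return n
```

Calling it with `df.score` from the snippet above reproduces the intended plot and writes it to disk.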