# jspeed-meyers/create_docker_image_distribution_dataset.py

## create_docker_image_distribution_dataset.py
"""Create docker image distribution dataset."""

import csv
import logging
import re
import subprocess

# Candidate paths of the os-release file inside an image, tried in this order.
# Each one is handed to tar as the archive member to extract.
# info on os-release: https://www.freedesktop.org/software/systemd/man/os-release.html
LOCATIONS = [
    "usr/lib/os-release",
    "usr/os-release",
    "etc/os-release",
    "etc/lib/os-release",
]
# csv file that collects the results
RESULTS_FILENAME = "docker-image-distribution-results.csv"
# distribution fields to collect
FIELDS = ["name", "id", "id_like"]
# csv columns: the image reference followed by the collected fields
FIELDNAMES = ["image", *FIELDS]


def parse_data(input):
    """Parse os-release style text and return the distribution fields

    Every entry of FIELDS is looked up as an upper-cased ``KEY=value`` line
    anchored at the start of a line, so ``ID=`` does not match ``VERSION_ID=``.
    Each resolved value is also printed to stdout.

    Args:
        input (str) - decoded text of an os-release file

    Returns:
        distro_data (dict) - one entry per field in FIELDS; the value is the
            unquoted field value, or "no-field-for-<field>" when the key is
            absent from the input
    """
    distro_data = {}
    for field in FIELDS:
        key = field.upper()
        match = re.search(rf"^{key}=(.*)", input, re.MULTILINE)
        if match:
            # "." also matches "\r", so trim whitespace left by CRLF endings
            value = match.group(1).strip()
            # os-release allows single-quoted values as well as double-quoted
            if len(value) >= 2 and value[0] == value[-1] == "'":
                value = value[1:-1]
            # strip double quotes from string
            distro_data[field] = value.replace('"', "")
        # standardized method of reporting that no data was available
        else:
            distro_data[field] = f"no-field-for-{field}"
        print(f"{key}: {distro_data[field]}")

    return distro_data


def store_data(distro_data):
    """Append one image's distribution record to the results csv

    Args:
        distro_data (dict) - must contain the keys "image", "name", "id"
            and "id_like"; other keys are ignored

    """
    with open(RESULTS_FILENAME, "a", encoding="utf-8", newline="") as results:
        # copy only the known columns so extra keys never reach the writer
        record = {
            column: distro_data[column]
            for column in ("image", "name", "id", "id_like")
        }
        csv.DictWriter(results, fieldnames=FIELDNAMES).writerow(record)


if __name__ == "__main__":
    # Script entry point: read the image list, start a fresh results csv,
    # then record the distribution data found in each image.

    # read in list of most popular dockerhub images (first column of each row)
    with open("most-popular-dockerhub-images.csv", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row; tolerate an empty file
        images = [row[0] for row in reader if row]  # ignore blank lines

    # create new file to store results; newline="" matches store_data so the
    # header row gets the same line ending as the appended rows
    with open(RESULTS_FILENAME, "w", encoding="utf-8", newline="") as file:
        csv.writer(file).writerow(FIELDNAMES)

    # analyze each image
    for image in images:
        print("\n")
        logging.warning("image: %s", image)
        for location in LOCATIONS:
            logging.warning("file location: %s", location)
            # NOTE(review): image and location are interpolated into a shell
            # command unquoted; only run this against a trusted image list.
            cmd = f"crane export {image} - | tar -Oxf - {location}"
            result = subprocess.run(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # tar's error text is checked below
                check=False,
            )
            output = result.stdout.decode("utf-8", errors="replace")
            if f"tar: {location}: Not found in archive" in output:
                continue  # nothing at this location, try the next one
            logging.warning("distro data:%s", output)
            distro_data = parse_data(output)
            distro_data["image"] = image
            store_data(distro_data)
            break  # since distro data was found, move to next image
        else:
            # only reached when no location matched (the loop never hit break),
            # so the placeholder row is written once and only for misses
            store_data(
                {
                    "image": image,
                    "name": "no-distro-data-found",
                    "id": "no-distro-data-found",
                    "id_like": "no-distro-data-found",
                }
            )
	"""Create docker image distribution dataset."""

	import csv
	import logging
	import re
	import subprocess

	# Candidate paths of the os-release file inside an image, tried in this order.
	# Each one is handed to tar as the archive member to extract.
	# info on os-release: https://www.freedesktop.org/software/systemd/man/os-release.html
	LOCATIONS = [
	    "usr/lib/os-release",
	    "usr/os-release",
	    "etc/os-release",
	    "etc/lib/os-release",
	]
	# csv file that collects the results
	RESULTS_FILENAME = "docker-image-distribution-results.csv"
	# distribution fields to collect
	FIELDS = ["name", "id", "id_like"]
	# csv columns: the image reference followed by the collected fields
	FIELDNAMES = ["image", *FIELDS]


	def parse_data(input):
	    """Parse os-release style text and return the distribution fields

	    Every entry of FIELDS is looked up as an upper-cased ``KEY=value`` line
	    anchored at the start of a line, so ``ID=`` does not match ``VERSION_ID=``.
	    Each resolved value is also printed to stdout.

	    Args:
	        input (str) - decoded text of an os-release file

	    Returns:
	        distro_data (dict) - one entry per field in FIELDS; the value is the
	            unquoted field value, or "no-field-for-<field>" when the key is
	            absent from the input
	    """
	    distro_data = {}
	    for field in FIELDS:
	        key = field.upper()
	        match = re.search(rf"^{key}=(.*)", input, re.MULTILINE)
	        if match:
	            # "." also matches "\r", so trim whitespace left by CRLF endings
	            value = match.group(1).strip()
	            # os-release allows single-quoted values as well as double-quoted
	            if len(value) >= 2 and value[0] == value[-1] == "'":
	                value = value[1:-1]
	            # strip double quotes from string
	            distro_data[field] = value.replace('"', "")
	        # standardized method of reporting that no data was available
	        else:
	            distro_data[field] = f"no-field-for-{field}"
	        print(f"{key}: {distro_data[field]}")

	    return distro_data


	def store_data(distro_data):
	    """Append one image's distribution record to the results csv

	    Args:
	        distro_data (dict) - must contain the keys "image", "name", "id"
	            and "id_like"; other keys are ignored

	    """
	    with open(RESULTS_FILENAME, "a", encoding="utf-8", newline="") as results:
	        # copy only the known columns so extra keys never reach the writer
	        record = {
	            column: distro_data[column]
	            for column in ("image", "name", "id", "id_like")
	        }
	        csv.DictWriter(results, fieldnames=FIELDNAMES).writerow(record)


	if __name__ == "__main__":
	    # Script entry point: read the image list, start a fresh results csv,
	    # then record the distribution data found in each image.

	    # read in list of most popular dockerhub images (first column of each row)
	    with open("most-popular-dockerhub-images.csv", newline="") as csvfile:
	        reader = csv.reader(csvfile)
	        next(reader, None)  # skip the header row; tolerate an empty file
	        images = [row[0] for row in reader if row]  # ignore blank lines

	    # create new file to store results; newline="" matches store_data so the
	    # header row gets the same line ending as the appended rows
	    with open(RESULTS_FILENAME, "w", encoding="utf-8", newline="") as file:
	        csv.writer(file).writerow(FIELDNAMES)

	    # analyze each image
	    for image in images:
	        print("\n")
	        logging.warning("image: %s", image)
	        for location in LOCATIONS:
	            logging.warning("file location: %s", location)
	            # NOTE(review): image and location are interpolated into a shell
	            # command unquoted; only run this against a trusted image list.
	            # The pipe must be a bare "|": an escaped "\|" would be passed to
	            # crane as a literal argument instead of piping into tar.
	            cmd = f"crane export {image} - | tar -Oxf - {location}"
	            result = subprocess.run(
	                cmd,
	                shell=True,
	                stdout=subprocess.PIPE,
	                stderr=subprocess.STDOUT,  # tar's error text is checked below
	                check=False,
	            )
	            output = result.stdout.decode("utf-8", errors="replace")
	            if f"tar: {location}: Not found in archive" in output:
	                continue  # nothing at this location, try the next one
	            logging.warning("distro data:%s", output)
	            distro_data = parse_data(output)
	            distro_data["image"] = image
	            store_data(distro_data)
	            break  # since distro data was found, move to next image
	        else:
	            # only reached when no location matched (the loop never hit break),
	            # so the placeholder row is written once and only for misses
	            store_data(
	                {
	                    "image": image,
	                    "name": "no-distro-data-found",
	                    "id": "no-distro-data-found",
	                    "id_like": "no-distro-data-found",
	                }
	            )