Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jspeed-meyers/866dfe358f55216ede7bb2b373da7aed to your computer and use it in GitHub Desktop.
Save jspeed-meyers/866dfe358f55216ede7bb2b373da7aed to your computer and use it in GitHub Desktop.
Identify OS distribution of docker image list
"""Create docker image distribution dataset."""
import csv
import logging
import re
import subprocess
# potential os locations for distribution data
# info on os-release: https://www.freedesktop.org/software/systemd/man/os-release.html
# Paths are written without a leading slash because each one is handed to
# `tar` as the member name to extract from the exported image archive.
# NOTE(review): the os-release spec documents etc/os-release and
# usr/lib/os-release; the other two entries look like defensive extras -- confirm.
LOCATIONS = [
"usr/lib/os-release",
"usr/os-release",
"etc/os-release",
"etc/lib/os-release",
]
# csv file the script writes its results to (header first, then appended rows)
RESULTS_FILENAME = "docker-image-distribution-results.csv"
# each field is upper-cased (NAME, ID, ID_LIKE) when searched for in the metadata
FIELDS = ["name", "id", "id_like"] # distribution fields to collect
# csv column order: the image reference followed by the collected fields
FIELDNAMES = ["image"] + FIELDS
def parse_data(input, fields=None):
    """Parse os-release style distro metadata and return it as a dict.

    Each requested field is upper-cased and looked up as a ``KEY=value``
    line. Surrounding whitespace (including a trailing carriage return)
    and one layer of matching single or double quotes are removed from
    the value. Every field is also echoed to stdout as ``KEY: value``.

    Args:
        input (str) - decoded text of an os-release file (the parameter
            name shadows the builtin but is kept for existing callers)
        fields (list of str, optional) - lower-case field names to
            collect; defaults to the module-level FIELDS
    Returns:
        distro_data (dict) - maps each requested field to its value, or
            to "no-field-for-<field>" when the key is absent
    """
    if fields is None:
        fields = FIELDS
    distro_data = {}
    for field in fields:
        key = field.upper()
        # anchor at line start so ID= never matches inside VERSION_ID=,
        # and escape the key so it is always matched literally
        match = re.search(rf"^{re.escape(key)}=(.*)$", input, re.MULTILINE)
        if match:
            value = match.group(1).strip()
            # os-release values may be wrapped in double OR single quotes
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            distro_data[field] = value
        else:
            # standardized method of reporting that no data was available
            distro_data[field] = f"no-field-for-{field}"
        print(f"{key}: {distro_data[field]}")
    return distro_data
def store_data(distro_data):
    """Append one image's distribution record to the results csv.

    Args:
        distro_data (dict) - must provide the keys "image", "name", "id"
            and "id_like"; a missing key raises KeyError
    """
    columns = ("image", "name", "id", "id_like")
    with open(RESULTS_FILENAME, "a", encoding="utf-8", newline="") as results:
        csv_writer = csv.DictWriter(results, fieldnames=FIELDNAMES)
        # copy only the expected columns so stray keys never reach the writer
        record = {column: distro_data[column] for column in columns}
        csv_writer.writerow(record)
if __name__ == "__main__":
    # local import: only the script entry point builds shell command lines
    import shlex

    # read in list of most popular dockerhub images
    with open(
        "most-popular-dockerhub-images.csv", encoding="utf-8", newline=""
    ) as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row; tolerate an empty file
        # the first column holds the image reference; ignore blank lines
        images = [row[0] for row in reader if row]

    # create new file to store results (newline="" as the csv module requires)
    with open(RESULTS_FILENAME, "w", encoding="utf-8", newline="") as file:
        csv.writer(file).writerow(FIELDNAMES)

    # analyze each image
    for image in images:
        print("\n")
        logging.warning("image: %s", image)
        for location in LOCATIONS:
            logging.warning("file location: %s", location)
            # quote both values: the image name comes from an external csv
            # and must not be interpreted by the shell
            cmd = (
                f"crane export {shlex.quote(image)} - "
                f"| tar -Oxf - {shlex.quote(location)}"
            )
            result = subprocess.run(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            output = result.stdout.decode("utf-8", errors="replace")
            # the pipeline's status is tar's: non-zero covers a missing member
            # as well as a failed export; empty output (e.g. a symlink entry)
            # carries no data, so keep looking in the next location
            if result.returncode != 0 or not output.strip():
                logging.debug(
                    "no data at %s: %s",
                    location,
                    result.stderr.decode("utf-8", errors="replace").strip(),
                )
                continue
            logging.warning("distro data:%s", output)
            distro_data = parse_data(output)
            distro_data["image"] = image
            store_data(distro_data)
            break  # since distro data was found, move to next image
        else:
            # reached only when no location yielded distro data
            store_data(
                {
                    "image": image,
                    "name": "no-distro-data-found",
                    "id": "no-distro-data-found",
                    "id_like": "no-distro-data-found",
                }
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment