Created
October 24, 2022 15:11
-
-
Save jspeed-meyers/866dfe358f55216ede7bb2b373da7aed to your computer and use it in GitHub Desktop.
Identify the OS distribution of each image in a list of Docker images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create docker image distribution dataset."""
import csv
import logging
import re
import shlex
import subprocess
# Archive member paths that are probed, in this order, for os-release data.
# Background on the os-release format:
# https://www.freedesktop.org/software/systemd/man/os-release.html
LOCATIONS = [
    "usr/lib/os-release",
    "usr/os-release",
    "etc/os-release",
    "etc/lib/os-release",
]

# CSV file that receives one row of results per analyzed image.
RESULTS_FILENAME = "docker-image-distribution-results.csv"

# os-release keys to collect (matched in upper case, e.g. ID_LIKE).
FIELDS = ["name", "id", "id_like"]

# Column order of the results CSV: the image name followed by each field.
FIELDNAMES = ["image", *FIELDS]
def parse_data(input):
    """Parse os-release style text and return the collected distro fields.

    For every name in FIELDS, the first line of the form ``FIELD=value``
    (field name in upper case, anchored at the start of a line) is looked up.
    Surrounding whitespace, including a trailing carriage return, is removed
    and one pair of matching surrounding quotes (single or double) is
    stripped, since the os-release format allows either quoting style.
    Each resulting value is also printed to stdout.

    Args:
        input (str) - text holding the distro metadata. The name shadows the
            builtin ``input``; it is kept so existing keyword callers still
            work.

    Returns:
        distro_data (dict) - maps each entry of FIELDS to its value, or to
            "no-field-for-<field>" when the text has no such assignment.
    """
    distro_data = {}
    for field in FIELDS:
        match = re.search(f"^{field.upper()}=(.*)", input, re.MULTILINE)
        if match:
            # "." also matches "\r", so CRLF input leaves a stray carriage
            # return on the captured value; strip() removes it.
            value = match.group(1).strip()
            # Remove only an enclosing pair of identical quotes so that
            # quote characters inside the value are left untouched.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            distro_data[field] = value
        else:
            # standardized method of reporting that no data was available
            distro_data[field] = f"no-field-for-{field}"
        print(f"{field.upper()}: {distro_data[field]}")
    return distro_data
def store_data(distro_data):
    """Append a single image's distribution record to the results CSV.

    Args:
        distro_data (dict) - record for one image; the keys "image", "name",
            "id" and "id_like" are read from it and written as one row.
    """
    with open(RESULTS_FILENAME, "a", encoding="utf-8", newline="") as results:
        # Pick out exactly the columns that are written; any other keys in
        # distro_data are ignored.
        record = {
            column: distro_data[column]
            for column in ("image", "name", "id", "id_like")
        }
        csv.DictWriter(results, fieldnames=FIELDNAMES).writerow(record)
if __name__ == "__main__":
    # Read the list of most popular dockerhub images (first CSV column).
    with open(
        "most-popular-dockerhub-images.csv", encoding="utf-8", newline=""
    ) as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row; tolerate an empty file
        # Blank rows and rows with an empty first cell carry no image name.
        images = [row[0].strip() for row in reader if row and row[0].strip()]

    # Create a new results file containing only the header row.
    # newline="" matches how store_data() opens the same file, so the csv
    # module alone controls the line endings.
    with open(RESULTS_FILENAME, "w", encoding="utf-8", newline="") as file:
        csv.writer(file).writerow(FIELDNAMES)

    # Analyze each image; exactly one result row is written per image.
    for image in images:
        print("\n")
        logging.warning("image: %s", image)
        for location in LOCATIONS:
            logging.warning("file location: %s", location)
            # The image name comes from an external CSV, so quote it before
            # it is placed into a shell command line.
            cmd = (
                f"crane export {shlex.quote(image)} - "
                f"| tar -Oxf - {shlex.quote(location)}"
            )
            # stderr is merged into stdout so tar's "Not found in archive"
            # message can be detected in the captured output.
            completed = subprocess.run(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                check=False,
            )
            output = completed.stdout.decode("utf-8", errors="replace")
            not_found = f"tar: {location}: Not found in archive" in output
            # A failed pipeline, a missing member, or empty output means this
            # location yielded no distro data; try the next one.
            if completed.returncode != 0 or not_found or not output.strip():
                continue
            logging.warning("distro data:%s", output)
            distro_data = parse_data(output)
            distro_data["image"] = image
            store_data(distro_data)
            break  # distro data was found, move on to the next image
        else:
            # No location produced distro data for this image.
            store_data(
                {
                    "image": image,
                    "name": "no-distro-data-found",
                    "id": "no-distro-data-found",
                    "id_like": "no-distro-data-found",
                }
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment