# enriched/get_rpm_metadata.py

## get_rpm_metadata.py
import os
from urllib.parse import urljoin
import urllib
import gzip
from pathlib import Path, PurePath
import requests
from bs4 import BeautifulSoup

# Working directories: everything is stored beneath the directory the
# script/notebook was started from.
NOTEBOOK_DIR = os.getcwd()
REPODATA_DIR = Path(NOTEBOOK_DIR) / "repodata"
REPOCACHE_DIR = REPODATA_DIR / "cache"
# REPOCACHE_DIR lives inside REPODATA_DIR, so a single call with
# parents=True creates both. exist_ok=True makes re-runs a no-op without
# the racy "check, then create" sequence.
REPOCACHE_DIR.mkdir(parents=True, exist_ok=True)

# Repository being inspected. The trailing slash on each URL matters:
# urljoin() replaces the last path segment of a base that lacks one.
# NOTE(review): CentOS 7 is end-of-life; confirm this mirror still serves
# the 7/ tree (archived releases normally move to vault.centos.org).
BASEARCH = "x86_64"
CENTOS_MIRROR_URL = "http://mirror.centos.org/centos/"
CENTOS7_BASEOS_URL = urljoin(CENTOS_MIRROR_URL, f"7/os/{BASEARCH}/")

# Get repomd.xml
# CENTOS7_BASEOS_URL already ends with "/", so join the relative path
# instead of interpolating it; the f-string form requested
# ".../x86_64//repodata/repomd.xml" (double slash).
repomd_request = requests.get(
    urljoin(CENTOS7_BASEOS_URL, "repodata/repomd.xml"), timeout=30
)
# Fail here on an HTTP error rather than saving and parsing an error page.
repomd_request.raise_for_status()
repomd_content = repomd_request.content
# Keep a copy of the raw index on disk next to the files it references.
repomd_path = REPODATA_DIR / "repomd.xml"
repomd_path.write_bytes(repomd_content)
# Parsed index; get_metadata_file() looks up <data> entries in it.
repomd = BeautifulSoup(repomd_content, "xml")


def get_metadata_file(type: str) -> PurePath:
    """Download one metadata file referenced by repomd.xml.

    The first ``<data>`` element of the module-level ``repomd`` document
    whose ``type`` attribute equals *type* supplies the file's location.
    The file is saved under ``REPODATA_DIR`` using the basename of that
    location. When the saved name ends in ``.gz``, a decompressed copy is
    also written next to it with the ``.gz`` suffix removed.

    Args:
        type: Value of the ``type`` attribute to look up, e.g.
            ``"primary"`` or ``"filelists"``. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        Path of the file as downloaded (for a gzip download this is the
        ``.gz`` file, not the decompressed copy).

    Raises:
        IndexError: If repomd.xml has no ``<data>`` element of that type.
        requests.RequestException: If the download fails, times out, or
            the server answers with an error status.
    """
    entries = repomd.find_all("data", type=type)
    if not entries:
        # Same exception type as the bare [0] lookup, but with a message
        # that says what was actually missing.
        raise IndexError(f"repomd.xml has no <data> entry of type {type!r}")

    href = entries[0].location["href"]
    filename: str = os.path.basename(href)
    # href is relative to the repository root, not to repodata/.
    url = urljoin(CENTOS7_BASEOS_URL, href)

    response = requests.get(url, timeout=30)
    # Do not write an HTTP error page to disk as if it were metadata.
    response.raise_for_status()
    payload = response.content

    metadata_file_path = REPODATA_DIR / filename
    metadata_file_path.write_bytes(payload)

    if metadata_file_path.suffix == ".gz":
        # "x-primary.xml.gz" -> "x-primary.xml", stored alongside the archive.
        metadata_file_path.with_suffix("").write_bytes(gzip.decompress(payload))

    return metadata_file_path


# Get primary.xml: download the "primary" metadata file listed in
# repomd.xml. The returned path is that of the file as downloaded (still
# the .gz file when the download is gzip-compressed).
primary_file_path = get_metadata_file("primary")
# Get filelists.xml: same procedure for the "filelists" metadata file.
filelists_file_path = get_metadata_file("filelists")
	import os
	from urllib.parse import urljoin
	import urllib
	import gzip
	from pathlib import Path, PurePath
	import requests
	from bs4 import BeautifulSoup

	# Working directories: everything is stored beneath the directory the
	# script/notebook was started from.
	NOTEBOOK_DIR = os.getcwd()
	REPODATA_DIR = Path(NOTEBOOK_DIR) / "repodata"
	REPOCACHE_DIR = REPODATA_DIR / "cache"
	# REPOCACHE_DIR lives inside REPODATA_DIR, so a single call with
	# parents=True creates both. exist_ok=True makes re-runs a no-op without
	# the racy "check, then create" sequence (whose `if` bodies had also
	# lost their indentation here).
	REPOCACHE_DIR.mkdir(parents=True, exist_ok=True)

	# Repository being inspected. The trailing slash on each URL matters:
	# urljoin() replaces the last path segment of a base that lacks one.
	BASEARCH = "x86_64"
	CENTOS_MIRROR_URL = "http://mirror.centos.org/centos/"
	CENTOS7_BASEOS_URL = urljoin(CENTOS_MIRROR_URL, f"7/os/{BASEARCH}/")

	# Get repomd.xml
	# CENTOS7_BASEOS_URL already ends with "/", so join the relative path
	# instead of interpolating it; the f-string form requested
	# ".../x86_64//repodata/repomd.xml" (double slash).
	repomd_request = requests.get(
	    urljoin(CENTOS7_BASEOS_URL, "repodata/repomd.xml"), timeout=30
	)
	# Fail here on an HTTP error rather than saving and parsing an error page.
	repomd_request.raise_for_status()
	repomd_content = repomd_request.content
	# Keep a copy of the raw index on disk next to the files it references.
	repomd_path = REPODATA_DIR / "repomd.xml"
	repomd_path.write_bytes(repomd_content)
	# Parsed index; get_metadata_file() looks up <data> entries in it.
	repomd = BeautifulSoup(repomd_content, "xml")


	def get_metadata_file(type: str) -> PurePath:
	    """Download one metadata file referenced by repomd.xml.

	    The first ``<data>`` element of ``repomd`` whose ``type`` attribute
	    equals *type* supplies the file's location. The file is saved under
	    ``REPODATA_DIR`` using the basename of that location. When the saved
	    name ends in ``.gz``, a decompressed copy is also written next to it
	    with the ``.gz`` suffix removed.

	    Args:
	        type: Value of the ``type`` attribute to look up, e.g.
	            ``"primary"`` or ``"filelists"``. The name shadows the
	            builtin but is kept so existing keyword callers keep working.

	    Returns:
	        Path of the file as downloaded (for a gzip download this is the
	        ``.gz`` file, not the decompressed copy).

	    Raises:
	        IndexError: If repomd.xml has no ``<data>`` element of that type.
	        requests.RequestException: If the download fails, times out, or
	            the server answers with an error status.
	    """
	    entries = repomd.find_all("data", type=type)
	    if not entries:
	        # Same exception type as the bare [0] lookup, with a useful message.
	        raise IndexError(f"repomd.xml has no <data> entry of type {type!r}")

	    href = entries[0].location["href"]
	    filename: str = os.path.basename(href)
	    # href is relative to the repository root, not to repodata/.
	    url = urljoin(CENTOS7_BASEOS_URL, href)

	    response = requests.get(url, timeout=30)
	    # Do not write an HTTP error page to disk as if it were metadata.
	    response.raise_for_status()
	    payload = response.content

	    metadata_file_path = REPODATA_DIR / filename
	    metadata_file_path.write_bytes(payload)

	    if metadata_file_path.suffix == ".gz":
	        # "x-primary.xml.gz" -> "x-primary.xml", stored alongside the archive.
	        metadata_file_path.with_suffix("").write_bytes(gzip.decompress(payload))

	    return metadata_file_path


	# Get primary.xml: download the "primary" metadata file listed in
	# repomd.xml and keep the path returned by get_metadata_file().
	primary_file_path = get_metadata_file("primary")
	# Get filelists.xml: same procedure for the "filelists" metadata file.
	filelists_file_path = get_metadata_file("filelists")