Created
February 10, 2022 23:48
-
-
Save PeyaPeyaPeyang/e3052caf5974d92973018c8299f62d42 to your computer and use it in GitHub Desktop.
Maven repository crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import xml.etree.ElementTree as et | |
# Local directory (with trailing slash) under which downloaded files are stored.
RESULT_DIR = "result/"
# Base URL that "maven-metadata.xml" and the per-version directories are
# resolved against; replace the placeholder before running.
REPO_META_ROOT_URL = "YOUR REPO HERE" + "/"
# Set to True by main() when only "maven-metadata-local.xml" is found; selects
# the "-local" metadata file name for the per-version downloads.
LOCAL_UP = False
def check_file_exists(url, timeout=30):
    """Return True if a HEAD request for *url* answers with HTTP 200.

    Args:
        url: Address of the remote file to probe.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawler indefinitely.

    Returns:
        True when the final response status is 200, False otherwise.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.132 Safari/537.36"
    }
    # HEAD requests do not follow redirects by default; follow them so the
    # answer matches what the subsequent GET download would see.
    with requests.head(url, headers=headers, timeout=timeout, allow_redirects=True) as r:
        return r.status_code == 200
def download_file(url, filename=None, timeout=30):
    """Stream *url* to a local file.

    Args:
        url: Address of the file to download.
        filename: Destination path. Defaults to the last path segment of
            *url*, written to the current working directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The destination path (str) on success, or the HTTP status code (int)
        when the server did not answer with 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    if filename is None:
        filename = url.split('/')[-1]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.132 Safari/537.36"
    }
    # A streamed response keeps its connection open until it is closed, so
    # use it as a context manager to release it on every exit path.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        if r.status_code != 200:
            return r.status_code
        # Write to a temporary name first: an interrupted transfer must not
        # leave a truncated file at the final path, where it would later be
        # mistaken for a finished download.
        partial = filename + ".part"
        try:
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            os.replace(partial, filename)
        finally:
            if os.path.exists(partial):
                os.remove(partial)
    return filename
def parse_xml(filename):
    """Parse the XML document at *filename* and return its root element."""
    return et.parse(filename).getroot()
def parse_metadata(metadata):
    """Extract the coordinates and version list from a metadata root element.

    Args:
        metadata: Root XML element; its tag must be ``metadata``.

    Returns:
        A dict with the keys ``artifact_id``, ``group_id`` and ``versions``
        (a list of version strings in document order), or None when the root
        tag is wrong or the group/artifact id is missing.
    """
    if metadata.tag != "metadata":
        print("Invalid metadata")
        return None

    artifact_id = None
    group_id = None
    for child in metadata:
        if child.tag == "groupId":
            group_id = child.text
        elif child.tag == "artifactId":
            artifact_id = child.text

    if artifact_id is None or group_id is None:
        # Without both ids no download path can be built, so report the
        # document as unusable instead of returning a half-filled dict.
        print("Invalid metadata.")
        return None

    # Skip empty <version/> entries: their text is None and would break the
    # string concatenation used to build URLs and paths.
    versions = [
        node.text
        for node in metadata.findall("versioning/versions/version")
        if node.text
    ]
    return {
        "artifact_id": artifact_id,
        "group_id": group_id,
        "versions": versions,
    }
def actually_download(url_base, path_base, name):
    """Download ``url_base + name`` to ``path_base + name`` unless it exists.

    Progress is reported on stdout.

    Args:
        url_base: URL prefix (with trailing slash) of the remote directory.
        path_base: Local directory prefix (with trailing slash).
        name: File name appended to both prefixes.

    Returns:
        True when the file is available locally afterwards (freshly
        downloaded or already present), False when the download failed.
    """
    print("Downloading " + url_base + name, end="\r")
    if os.path.exists(path_base + name):
        print("Downloading " + url_base + name + ", skipped.")
        # An already-present file counts as success; returning None here
        # made callers treat a resumed run as a failure.
        return True
    result = download_file(url_base + name, path_base + name)
    # download_file returns the HTTP status code (int) on failure.
    if isinstance(result, int):
        print("Downloading " + url_base + name + ", failed: " + str(result))
        return False
    print("Downloading " + url_base + name + ", done.")
    return True
def collect_artifact(group_id, artifact_id, version):
    """Download the metadata, jar and pom of one version into RESULT_DIR.

    Files are stored under
    ``RESULT_DIR/<group path>/<artifact_id>/<version>/``, where the group
    path is *group_id* with dots replaced by slashes.

    Args:
        group_id: Dotted group id.
        artifact_id: Artifact id, also used as the file name prefix.
        version: Version string, used as the remote and local sub-directory.
    """
    url = REPO_META_ROOT_URL + version + "/"
    path_base = RESULT_DIR + group_id.replace(".", "/") + "/" + artifact_id + "/" + version + "/"
    os.makedirs(path_base, exist_ok=True)
    file_names = (
        "maven-metadata" + ("-local" if LOCAL_UP else "") + ".xml",
        artifact_id + "-" + version + ".jar",
        artifact_id + "-" + version + ".pom",
    )
    # Fetch every file independently: a version directory may legitimately
    # lack one of them (for example a POM-only artifact has no jar), and that
    # must not stop the remaining files from being downloaded.
    for name in file_names:
        actually_download(url, path_base, name)
def download_artifacts(group_id, artifact_id, versions):
    """Collect the files of every listed version, in the order given."""
    for release in versions:
        collect_artifact(group_id, artifact_id, release)
def main():
    """Crawl the repository at REPO_META_ROOT_URL into RESULT_DIR.

    Looks for ``maven-metadata.xml`` (falling back to
    ``maven-metadata-local.xml`` and setting LOCAL_UP), downloads it into
    RESULT_DIR, parses it, downloads every listed version and finally
    removes the temporary top-level metadata file.
    """
    global LOCAL_UP
    os.makedirs(RESULT_DIR, exist_ok=True)
    if check_file_exists(REPO_META_ROOT_URL + "maven-metadata.xml"):
        metadata_name = "maven-metadata.xml"
    elif check_file_exists(REPO_META_ROOT_URL + "maven-metadata-local.xml"):
        LOCAL_UP = True
        metadata_name = "maven-metadata-local.xml"
    else:
        print("No metadata found.")
        return

    # Download to an explicit path inside RESULT_DIR so the file removed
    # below is the same one that was written.
    metadata_path = RESULT_DIR + metadata_name
    result = download_file(REPO_META_ROOT_URL + metadata_name, metadata_path)
    # download_file returns the HTTP status code (int) on failure.
    if isinstance(result, int):
        print("Failed to download metadata: " + str(result))
        return

    try:
        data = parse_metadata(parse_xml(metadata_path))
        # parse_metadata has already printed the reason when the document is
        # unusable; stop instead of failing on a missing id.
        if data is None or data["group_id"] is None or data["artifact_id"] is None:
            return
        download_artifacts(data["group_id"], data["artifact_id"], data["versions"])
    finally:
        os.remove(metadata_path)
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment