Skip to content

Instantly share code, notes, and snippets.

@PeyaPeyaPeyang
Created February 10, 2022 23:48
Show Gist options
  • Save PeyaPeyaPeyang/e3052caf5974d92973018c8299f62d42 to your computer and use it in GitHub Desktop.
Maven repository crawler
import requests
import os
import xml.etree.ElementTree as et
# Local directory (written with a trailing slash) under which everything is
# stored; paths are built from it by plain string concatenation.
RESULT_DIR = "result/"
# Base URL that "maven-metadata.xml" and the per-version directories are
# appended to. "YOUR REPO HERE" is a placeholder to replace before running.
REPO_META_ROOT_URL = "YOUR REPO HERE" + "/"
# Switched to True by main() when only "maven-metadata-local.xml" is found;
# it selects the "-local" metadata file name everywhere else.
LOCAL_UP = False
def check_file_exists(url, timeout=30):
    """Return True when a HEAD request for *url* ends in HTTP 200.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot stall the crawl forever.

    Returns:
        True if the final response status is 200, otherwise False.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.132 Safari/537.36"
    }
    # requests.head() does not follow redirects unless asked to, whereas the
    # GET used for the actual download does; follow them here as well so both
    # calls agree on whether a file is reachable.
    with requests.head(url, headers=headers, allow_redirects=True, timeout=timeout) as response:
        return response.status_code == 200
def download_file(url, filename=None, timeout=30):
    """Stream *url* to a local file.

    Args:
        url: Address to fetch.
        filename: Destination path. Defaults to the last path segment of
            *url*, written to the current working directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The destination path (str) on success, or the HTTP status code (int)
        when the server answers with anything other than 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    if filename is None:
        filename = url.split('/')[-1]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.132 Safari/537.36"
    }
    # The context manager releases the streamed connection on every path,
    # including the early return for non-200 answers.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
        if response.status_code != 200:
            return response.status_code
        # Write to a side file first: an interrupted transfer must not leave a
        # truncated file under the final name, where a later run would mistake
        # it for a finished download.
        partial = filename + ".part"
        try:
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(partial, filename)
        finally:
            # Only still present when the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)
    return filename
def parse_xml(filename):
    """Parse the XML document at *filename* and return its root element."""
    return et.parse(filename).getroot()
def parse_metadata(metadata):
    """Extract coordinates and version list from a ``<metadata>`` element.

    Args:
        metadata: Root element of a parsed maven-metadata document.

    Returns:
        None (after printing a message) when the root tag is not
        ``metadata``. Otherwise a dict with the keys ``artifact_id``,
        ``group_id`` and ``versions``; the two ids are None when the matching
        child element is absent, in which case a message is printed but the
        dict is still returned.
    """
    if metadata.tag != "metadata":
        print("Invalid metadata")
        return

    # When a tag occurs more than once, the last occurrence wins.
    group_id = None
    for node in metadata.findall("groupId"):
        group_id = node.text

    artifact_id = None
    for node in metadata.findall("artifactId"):
        artifact_id = node.text

    # Child-step path: only <version> elements sitting directly inside
    # <versioning>/<versions> are collected, in document order.
    versions = [node.text for node in metadata.findall("versioning/versions/version")]

    if group_id is None or artifact_id is None:
        print("Invalid metadata.")

    return {
        "artifact_id": artifact_id,
        "group_id": group_id,
        "versions": versions
    }
def actually_download(url_base, path_base, name):
    """Download ``url_base + name`` to ``path_base + name`` with progress output.

    Args:
        url_base: URL prefix, ending with a slash.
        path_base: Local directory prefix, ending with a path separator.
        name: File name appended to both prefixes.

    Returns:
        True when the file is available locally afterwards (freshly
        downloaded, or already present and therefore skipped); False when the
        server answered with a non-200 status.
    """
    source = url_base + name
    target = path_base + name
    # "\r" lets the final status line overwrite this progress line.
    print("Downloading " + source, end="\r")
    if os.path.exists(target):
        print("Downloading " + source + ", skipped.")
        # An existing file counts as success. Returning None here made callers
        # that test the result for truthiness abandon the remaining files.
        return True
    result = download_file(source, target)
    # download_file reports failure by returning the HTTP status code.
    if isinstance(result, int):
        print("Downloading " + source + ", failed: " + str(result))
        return False
    print("Downloading " + source + ", done.")
    return True
def collect_artifact(group_id, artifact_id, version):
    """Fetch the metadata, jar and pom of one version into RESULT_DIR.

    Files are stored under
    ``RESULT_DIR/<group path>/<artifact_id>/<version>/``, where the group
    path is *group_id* with dots turned into slashes.

    Args:
        group_id: Group id of the artifact.
        artifact_id: Artifact id, also used as the file name prefix.
        version: Version string; names the remote and local sub-directory.
    """
    url = REPO_META_ROOT_URL + version + "/"
    path_base = RESULT_DIR + group_id.replace(".", "/") + "/" + artifact_id + "/" + version + "/"
    os.makedirs(path_base, exist_ok=True)

    metadata_name = "maven-metadata" + ("-local" if LOCAL_UP else "") + ".xml"
    file_stem = artifact_id + "-" + version

    # Try every file on its own instead of stopping at the first miss.
    # NOTE(review): per-version metadata is commonly absent for release
    # versions, and pom-only artifacts have no jar - neither case should
    # prevent the other files from being fetched.
    for name in (metadata_name, file_stem + ".jar", file_stem + ".pom"):
        actually_download(url, path_base, name)
def download_artifacts(group_id, artifact_id, versions):
    """Run collect_artifact once per entry of *versions*, in order."""
    remaining = iter(versions)
    while True:
        try:
            current = next(remaining)
        except StopIteration:
            break
        collect_artifact(group_id, artifact_id, current)
def main():
    """Crawl the repository at REPO_META_ROOT_URL into RESULT_DIR.

    Looks for ``maven-metadata.xml`` first and falls back to
    ``maven-metadata-local.xml`` (setting LOCAL_UP). The metadata file is
    stored temporarily inside RESULT_DIR, parsed, deleted again, and every
    version it lists is then downloaded.
    """
    global LOCAL_UP
    os.makedirs(RESULT_DIR, exist_ok=True)

    if check_file_exists(REPO_META_ROOT_URL + "maven-metadata.xml"):
        metadata_name = "maven-metadata.xml"
    elif check_file_exists(REPO_META_ROOT_URL + "maven-metadata-local.xml"):
        LOCAL_UP = True
        metadata_name = "maven-metadata-local.xml"
    else:
        print("No metadata found.")
        return

    # Download to an explicit path inside RESULT_DIR. Without a target name
    # the file lands in the current working directory, and removing it from
    # RESULT_DIR afterwards raises FileNotFoundError.
    metadata_path = RESULT_DIR + metadata_name
    result = download_file(REPO_META_ROOT_URL + metadata_name, metadata_path)
    if isinstance(result, int):
        # download_file returns the HTTP status code instead of a path when
        # the server did not answer 200.
        print("Failed to download metadata: " + str(result))
        return

    try:
        data = parse_metadata(parse_xml(metadata_path))
    finally:
        # The top-level metadata is only a work file; drop it even when
        # parsing fails.
        os.remove(metadata_path)

    # parse_metadata has already printed a message for both of these cases.
    if data is None or data["group_id"] is None or data["artifact_id"] is None:
        return

    download_artifacts(data["group_id"], data["artifact_id"], data["versions"])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment