Skip to content

Instantly share code, notes, and snippets.

@gutzbenj
Last active December 20, 2020 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gutzbenj/88e69d10447698d099f7227a731add9b to your computer and use it in GitHub Desktop.
Save gutzbenj/88e69d10447698d099f7227a731add9b to your computer and use it in GitHub Desktop.
Investigate problems when scanning huge directory tree of DWD CDC HTTP server
"""
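Investigate problems when scanning the huge directory
tree of the DWD CDC HTTP server.

This repro prints every file URL found below the start URL, which defaults
to a large DWD directory and can be overridden via the DWD_URL environment
variable. Usage:

    python list-remote-files-dwd.py | grep -v zip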
"""
import os
import requests
from typing import List
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Module-level HTTP session; list_remote_files issues all of its requests through it.
session = requests.Session()
def list_remote_files(url: str, recursive: bool) -> List[str]:
    """
    Create a listing of all files found below a given path on the server.

    The page at ``url`` is fetched and parsed as an HTML directory index:
    every anchor whose ``href`` ends with ``/`` is treated as a sub folder,
    every other anchor as a file.

    Args:
        url: the url which should be searched for files; a trailing slash
            is appended when it is missing
        recursive: whether files from sub folders should be listed as well

    Returns:
        a list of absolute file urls; with ``recursive`` set, the files of
        the current folder come first, followed by those of each sub folder
        in the order the folders appear on the page

    Raises:
        requests.HTTPError: if the server answers with an error status
    """
    if not url.endswith("/"):
        url += "/"

    response = session.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    files = []
    folders = []
    for link in soup.find_all("a"):
        href = link.get("href")
        # Anchors without an href yield None (and some carry an empty one);
        # neither points to a file or folder, and None would break endswith().
        if not href or href == "../":
            continue

        target = urljoin(url, href)
        if not href.endswith("/"):
            files.append(target)
        elif target != url and target.startswith(url):
            # Only descend into folders strictly below the current one, so
            # links such as "./", "/" or an absolute parent path can neither
            # cause endless recursion nor leave the requested tree.
            folders.append(target)

    if recursive:
        for folder in folders:
            files.extend(list_remote_files(folder, recursive))

    return files
def process(url):
    """
    Print the url of every file found below ``url``, one per line.

    Args:
        url: the url of the directory at which the recursive scan starts
    """
    for file_url in list_remote_files(url, True):
        print(file_url)
if __name__ == "__main__":
    # Large directory that serves as the default scan target.
    large_folder = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/1_minute/precipitation/historical/"
    # DWD_URL, when present in the environment, overrides the default.
    url = os.getenv("DWD_URL", large_folder)
    process(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment