Skip to content

Instantly share code, notes, and snippets.

@keineahnung2345
Last active November 4, 2020 08:21
Show Gist options
  • Save keineahnung2345/4d448166257b446eeed584d0db8f1c6d to your computer and use it in GitHub Desktop.
Save keineahnung2345/4d448166257b446eeed584d0db8f1c6d to your computer and use it in GitHub Desktop.
This script recursively downloads a folder (including all of its sub-folders) from sourceforge.net.
# -*- coding: utf-8 -*-
import os
import subprocess
# convert url to normal string
from urllib.parse import unquote, urljoin

import requests
import wget #for downloading files
from bs4 import BeautifulSoup
# Site root; download_folder() combines it with the sub-folder links it scrapes.
url_base = "https://sourceforge.net/"
# Folder page handed to download_folder() when the file is run as a script.
url_start = "https://sourceforge.net/projects/pointclouds/files/PCD%20datasets/"
def download_folder(url):
    """Recursively download a SourceForge folder page into the current directory.

    A local directory named after the last (URL-decoded) path segment of
    ``url`` is created if needed and entered. Every file listed on the page
    is fetched with the external ``wget`` program, and every listed
    sub-folder is processed recursively. The working directory that was
    current on entry is restored before returning, even if an error occurs.

    The ``wget`` executable must be on PATH. On Windows a recent build can
    be obtained from https://eternallybored.org/misc/wget/

    Args:
        url: URL of the folder listing page; one trailing ``/`` is ignored.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            the server answers with an HTTP error status.
        OSError: If the local directory cannot be created or entered, or
            the ``wget`` executable cannot be started.
    """
    if url.endswith("/"):
        url = url[:-1]
    folder = unquote(url.split("/")[-1])

    # Remember where we started so the caller's working directory can be
    # restored no matter how this call ends.
    previous_dir = os.getcwd()
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    try:
        print("go to", folder)
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=60)
        # Fail loudly instead of parsing an error page as an empty folder.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, features="lxml")

        files = []
        nexturls = []
        for span in soup.find_all("span", attrs={"class": "name"}):
            parent = span.parent
            href = parent.get("href") if parent is not None else None
            if not href:
                # The name span is not wrapped in a link; nothing to fetch.
                continue
            # Strip only a trailing "/download"; replacing it anywhere in
            # the path would corrupt names such as ".../downloads/".
            if href.endswith("/download"):
                href = href[: -len("/download")]
            # urljoin leaves absolute links untouched and resolves
            # site-relative ones without producing a double slash.
            target = urljoin(url_base, href)
            if target.endswith("/"):
                nexturls.append(target)
            else:
                files.append(target)
        print(len(files) + len(nexturls))

        for file_url in files:
            print("download ", file_url)
            # Pass an argument list (no shell) so characters such as '&',
            # ';' or spaces in a scraped URL cannot be interpreted as shell
            # syntax. --continue resumes partially downloaded files.
            result = subprocess.run(["wget", "--continue", file_url], check=False)
            if result.returncode != 0:
                # Keep going: one failed file should not stop the others.
                print("wget exited with code", result.returncode, "for", file_url)

        for nexturl in nexturls:
            download_folder(nexturl)
    finally:
        os.chdir(previous_dir)
# Start the recursive download only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_folder(url_start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment