Skip to content

Instantly share code, notes, and snippets.

@haideralipunjabi
Created June 15, 2020 02:30
Show Gist options
  • Save haideralipunjabi/1eefbbb0b3aaf500d92079ab84d36d1e to your computer and use it in GitHub Desktop.
Save haideralipunjabi/1eefbbb0b3aaf500d92079ab84d36d1e to your computer and use it in GitHub Desktop.
Python Script to scrape an OpenDirectory
import os
import subprocess
import time
from urllib.parse import unquote

import requests
import wget
from bs4 import BeautifulSoup as soup
from progress.bar import Bar
# Crawl an auto-generated directory index starting at `base`, collect every
# file URL found in it and its sub-folders, then download the files with the
# `wget` command-line tool into a matching tree under ./fonts/.
base = "http://www.meeshdesigns.com/Western%20Fonts/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# The first anchors of every index page are skipped.
# NOTE(review): presumably these are the column-sort links and the
# "Parent Directory" link of the generated index - confirm against the site.
LEADING_LINKS = 5
# File extensions (lower-case, without the dot) that are never downloaded.
SKIPPED_EXTENSIONS = {"pdf", "db", "html"}
# Bounded retry policy for fetching an index page.
MAX_ATTEMPTS = 5
RETRY_DELAY = 2  # seconds between attempts
REQUEST_TIMEOUT = 30  # seconds before a single request is abandoned


def list_hrefs(url):
    """Return the entry hrefs listed on the index page at *url*.

    The page is requested up to MAX_ATTEMPTS times. Returns a list of href
    strings (anchors without an href are ignored), or None when every attempt
    failed, in which case a message is printed and the caller should skip
    the folder.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            if attempt == MAX_ATTEMPTS:
                print(f"\nGiving up on {url}: {err}")
                return None
            time.sleep(RETRY_DELAY)
        else:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            page = soup(response.text, "html.parser")
            anchors = page.find_all("a")[LEADING_LINKS:]
            return [a.attrs["href"] for a in anchors if "href" in a.attrs]
    return None


# --- Phase 1: walk the directory tree breadth-first -------------------------
# `folders` doubles as the work queue: it grows while it is being processed,
# which is why an explicit index is used instead of a for-loop.
folders = [base]
links = []
bar = Bar("Progress: ", max=len(folders))
count = 0
while count < len(folders):
    folder = folders[count]
    # Advance unconditionally so a folder that cannot be fetched is skipped
    # instead of being retried forever.
    count += 1
    for href in list_hrefs(folder) or []:
        # Child URLs are built by plain concatenation, so only hrefs ending
        # in "/" can be folders; everything else is a file.
        if href.endswith("/"):
            folders.append(folder + href)
        else:
            links.append(folder + href)
    bar.max = len(folders)  # keep the total in step with newly found folders
    bar.next()
bar.finish()

# --- Phase 2: decide what to download ---------------------------------------
downloads = []
for link in links:
    # Path of the file relative to `base`, split into folder and file name.
    # rpartition also copes with files that sit directly under `base`.
    fldr, _, fl = (unquote(part) for part in link[len(base):].rpartition("/"))
    # Use the last suffix only, case-insensitively ("A.B.PDF" -> "pdf").
    ext = os.path.splitext(fl)[1].lstrip(".").lower()
    if ext in SKIPPED_EXTENSIONS:
        continue
    downloads.append((link, f"fonts/{fldr}"))

# --- Phase 3: download -------------------------------------------------------
bar = Bar("Progress: ", max=len(downloads))
for link, dest in downloads:
    os.makedirs(dest, exist_ok=True)
    # Argument list, no shell: names containing quotes, spaces, "&" or ";"
    # are passed to wget verbatim instead of being interpreted by a shell.
    result = subprocess.run(["wget", "-q", "--tries=inf", link, "-P", dest], check=False)
    if result.returncode != 0:
        print(f"\nwget failed for {link} (exit code {result.returncode})")
    bar.next()
bar.finish()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment