Skip to content

Instantly share code, notes, and snippets.

@ebraminio
Last active July 17, 2020 15:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ebraminio/e66885524c4021df1e41b3b354ab3d8c to your computer and use it in GitHub Desktop.
Save ebraminio/e66885524c4021df1e41b3b354ab3d8c to your computer and use it in GitHub Desktop.
files
from collections import deque
import bs4
import requests
def extract_files(site_name, timeout=30):
    """Crawl an HTML directory listing and collect the paths of all files.

    Starting at ``/``, each page is fetched from ``site_name + path`` and
    every ``<a href=...>`` on it is inspected: an href ending in ``/`` is
    treated as a sub-directory and queued for crawling, anything else is
    recorded as a file. Directories are taken from the end of the queue,
    so the traversal is depth-first. Each directory path is printed once
    its page has been processed, as progress output.

    Args:
        site_name: Base URL without a trailing slash, e.g.
            ``"http://example.org"``; directory paths are appended to it.
        timeout: Seconds to wait for each HTTP request, passed through to
            ``requests.get``. ``None`` waits indefinitely.

    Returns:
        A list of file paths relative to ``site_name``, each starting
        with ``/``.

    Raises:
        AssertionError: If a page contains a root-relative link (an href
            starting with ``/``), which this crawler cannot resolve
            against the current directory.
        requests.RequestException: If a request fails or times out.
    """
    # Explicit one-element list: deque('/') only worked because iterating
    # a one-character string happens to yield a single item.
    dirs = deque(['/'])
    result = []
    while dirs:
        path = dirs.pop()
        response = requests.get(site_name + path, timeout=timeout)
        page = bs4.BeautifulSoup(response.text, 'html.parser')
        # href=True skips anchors that carry no href attribute at all,
        # which would otherwise raise KeyError on lookup.
        for link in page.find_all('a', href=True):
            href = link['href']
            if href.startswith('/'):
                # Raised explicitly rather than via `assert` so the check
                # still runs when Python is started with -O.
                raise AssertionError(
                    'unexpected root-relative link %r on %s' % (href, path)
                )
            if href.startswith('..'):
                # Parent-directory link: never walk upwards.
                continue
            if href == './':
                # Self link: queuing it would re-list this directory forever.
                continue
            if href.endswith('/'):
                dirs.append(path + href)
            else:
                result.append(path + href)
        print(path)
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment