Skip to content

Instantly share code, notes, and snippets.

@mastier
Last active February 18, 2016 22:11
Show Gist options
  • Save mastier/54ab23dd0f5be6bc30dd to your computer and use it in GitHub Desktop.
Save mastier/54ab23dd0f5be6bc30dd to your computer and use it in GitHub Desktop.
Get the list of files used by a site, starting from a given index.html
from bs4 import BeautifulSoup
def _attr_values(soup, tag_name, attr_name):
    """Return the set of non-empty ``attr_name`` values on ``tag_name`` tags."""
    values = set()
    for tag in soup.find_all(tag_name):
        # Tags such as <a name="..."> or inline <script> lack the attribute;
        # skip them instead of raising KeyError or recording ''.
        value = tag.attrs.get(attr_name)
        if value:
            values.add(value)
    return values


def get_linked_files(filename, visited=None):
    """Collect every file reachable from the HTML file ``filename``.

    The file is parsed and the targets of its ``<img src>``, ``<script src>``
    and ``<link href>`` tags are recorded. The targets of its ``<a href>``
    tags are followed recursively and treated the same way.

    References are opened exactly as written in the markup (they are not
    resolved against the directory of the page that mentions them), so the
    crawl is expected to run from the directory the links are relative to.
    Targets that cannot be opened (missing files, external URLs, fragments)
    are recorded but not followed.

    Args:
        filename: Path of the HTML file to start from.
        visited: Optional set of names already collected. It is updated in
            place, which lets callers accumulate results across calls.
            A fresh set is created when omitted.

    Returns:
        The ``visited`` set, containing ``filename`` and every reference
        found while crawling.
    """
    # A fresh set per call: a ``set()`` default would be shared by all calls.
    if visited is None:
        visited = set()
    visited.add(filename)

    try:
        # Binary mode lets BeautifulSoup detect the encoding itself and keeps
        # non-text link targets from raising UnicodeDecodeError here.
        with open(filename, 'rb') as handle:
            markup = handle.read()
    except IOError:
        # Unreadable target: keep it in the result, nothing to follow.
        return visited

    soup = BeautifulSoup(markup, 'html.parser')
    pages = _attr_values(soup, 'a', 'href')
    for tag_name, attr_name in (('img', 'src'), ('script', 'src'), ('link', 'href')):
        visited.update(_attr_values(soup, tag_name, attr_name))

    # ``pages - visited`` is a snapshot, so it is safe to iterate while the
    # recursive calls keep adding to ``visited``.
    for page in pages - visited:
        # A deeper call may already have handled this page; do not re-parse.
        if page not in visited:
            get_linked_files(page, visited)
    return visited
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment