Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get the list of used files starting from given index.html
from bs4 import BeautifulSoup
def get_linked_files(filename, visited=None):
    """Return the set of files reachable from the HTML page *filename*.

    The page is parsed and the targets of its ``<a href>`` links are followed
    recursively. The ``src`` of ``<img>`` and ``<script>`` tags and the
    ``href`` of ``<link>`` tags are recorded but not followed.

    Link targets are passed to ``open()`` exactly as written in the markup,
    so relative links resolve against the current working directory, not
    against the directory of the page that contains them.

    Args:
        filename: Path of the page to start from. It is always included in
            the result, even when it cannot be read.
        visited: Optional set of already-known files. It is updated in place
            and returned; a new set is created when omitted.

    Returns:
        The ``visited`` set, containing *filename* and everything found.
    """
    # A ``set()`` default would be created once and shared by every call,
    # leaking the results of one crawl into the next.
    if visited is None:
        visited = set()
    visited.add(filename)

    try:
        # Read raw bytes and let BeautifulSoup detect the encoding; this also
        # avoids a UnicodeDecodeError when an anchor points at a binary file.
        with open(filename, 'rb') as handle:
            markup = handle.read()
    except IOError:
        # Not a readable local file (missing, a directory, an external URL,
        # a "#fragment", ...): keep it in the result, but do not follow it.
        return visited

    soup = BeautifulSoup(markup, 'html.parser')

    def values(tag, attr):
        # Tags lacking the attribute (or with an empty value) are skipped
        # instead of raising KeyError or adding '' to the result.
        return {el.get(attr) for el in soup.find_all(tag) if el.get(attr)}

    pages = values('a', 'href')
    visited.update(values('img', 'src'))
    visited.update(values('script', 'src'))
    visited.update(values('link', 'href'))

    for page in pages:
        # Re-check on every iteration: an earlier recursive call may already
        # have crawled this page, and assets are never parsed as pages.
        if page not in visited:
            get_linked_files(page, visited)
    return visited
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.