Skip to content

Instantly share code, notes, and snippets.

@totetmatt
Last active July 16, 2018 15:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save totetmatt/8be65e6d9b159a42c89181c26c4726fb to your computer and use it in GitHub Desktop.
Save totetmatt/8be65e6d9b159a42c89181c26c4726fb to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import sys
import uuid
import json
# Upper bound, in seconds, on how long the HTTP request may block.
REQUEST_TIMEOUT_SECONDS = 30


def output_filename(url):
    """Return the name of the JSON file that stores the result for *url*.

    The name is the hex form of the version-5 UUID of the URL (in the URL
    namespace) followed by ``.json``, so a given URL always maps to the
    same file name.
    """
    return uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.json'


def scrape(url):
    """Fetch *url* and collect the targets of its links and images.

    Returns a dict with three keys:

    * ``url``    -- the URL that was requested,
    * ``links``  -- the ``href`` of every ``<a>`` tag that has a non-empty one,
    * ``images`` -- the ``src`` of every ``<img>`` tag that has a non-empty one.

    The HTTP status is intentionally not checked: whatever body the server
    returns is parsed. Network failures (including the timeout) propagate
    as exceptions raised by ``requests``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.content, 'html5lib')
    return {
        'url': url,
        # Tags with a missing or empty attribute are skipped.
        'links': [a.get('href') for a in soup.find_all('a')
                  if a.attrs.get('href', None)],
        'images': [img.get('src') for img in soup.find_all('img')
                   if img.attrs.get('src', None)],
    }


def main(argv):
    """Scrape the URL given as ``argv[1]`` and dump the result as JSON.

    The result is written to the current directory under the name returned
    by ``output_filename``. Returns a process exit status: 0 on success,
    2 when the URL argument is missing.
    """
    if len(argv) < 2:
        program = argv[0] if argv else 'scrape'
        sys.stderr.write('usage: {0} URL\n'.format(program))
        return 2

    data = scrape(argv[1])
    # The context manager guarantees the file is flushed and closed.
    with open(output_filename(data['url']), 'w') as out:
        json.dump(data, out)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment