@gingeleski
Created May 9, 2018 02:29
Download all cached Danger & Play posts by walking the site's post sitemaps. Dependencies: bs4, requests, lxml
from bs4 import BeautifulSoup
import requests


def get_all_links(r):
    """Return every <loc> URL listed in a sitemap response."""
    all_links = []
    soup = BeautifulSoup(r.content, 'lxml')
    for url in soup.findAll('loc'):
        all_links.append(url.string)
    return all_links


def crawl_and_save(target_url, current_depth=1, max_depth=2):
    r = requests.get(target_url)
    if current_depth < max_depth:
        # Still on a sitemap page: collect the post links and recurse into each one.
        current_depth += 1
        links = get_all_links(r)
        for link in links:
            print(link)
            crawl_and_save(link, current_depth)
    else:
        # At max depth this is an actual post: save it, named after its URL slug.
        filename = target_url.split('/')[-2] + '.html'
        with open(filename, mode='wb') as file:
            file.write(r.content)


if __name__ == '__main__':
    crawl_and_save('http://www2.dangerandplay.com/post-sitemap1.xml')
    crawl_and_save('http://www2.dangerandplay.com/post-sitemap2.xml')
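
For context, WordPress post sitemaps are XML files whose <loc> elements each hold one post URL, which is why a single findAll('loc') pass is enough to collect everything. Below is a minimal, self-contained sketch of that parsing step; the sitemap fragment and post URLs are made-up examples, not real Danger & Play data.

# Sketch of the <loc> extraction on a made-up sitemap fragment.
from bs4 import BeautifulSoup

sample_sitemap = """
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://www2.dangerandplay.com/example-post/</loc></url>
  <url><loc>http://www2.dangerandplay.com/another-post/</loc></url>
</urlset>
"""

soup = BeautifulSoup(sample_sitemap, 'lxml')
post_urls = [loc.string for loc in soup.findAll('loc')]
print(post_urls)
# ['http://www2.dangerandplay.com/example-post/',
#  'http://www2.dangerandplay.com/another-post/']

Because post URLs end with a trailing slash, splitting on '/' and taking the second-to-last piece yields the post slug, so each saved page lands in the working directory as <slug>.html.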