scrapes a website for all the URLs below it
''' scrapes a website for urls '''

import requests
from bs4 import BeautifulSoup


class URLTest():
    ''' holds the result of testing a single link '''

    def __init__(self, link, status_code, current_depth, head):
        self.link = link
        self.status_code = status_code
        self.current_depth = current_depth
        self.head = head


def write_to_csv(filename, link, status, cur_depth, head):
    ''' actually tab separated '''
    with open(filename, 'a') as f:
        f.write('{}\t{}\t{}\t{}\n'.format(link, status, cur_depth, head))


def url_search(link, depth_limit=3, current_depth=0, head='', search_internal=[], except_strings=[]):
    '''
    recursively searches for links in an html tree
    '''
    print(link, current_depth)

    # excluding links with particular strings in them
    if except_strings:
        ee = [e for e in except_strings if e in link]
        if ee:
            return

    # check if in tested links, if so, grab that status result...
    u_prev = list(filter(lambda u: u.link == link, URLS_TESTED))
    u_followed = [
        u if 'not followed' != u.status_code else None for u in u_prev]
    u_followed = [x for x in u_followed if x is not None]
    if u_followed:
        depths = [u.current_depth for u in u_followed]
        s_codes = [u.status_code for u in u_followed]
        min_depth = min(depths)
        idx = depths.index(min_depth)
        if current_depth >= min_depth:
            URLS_TESTED.append(
                URLTest(link, s_codes[idx], current_depth, head))
            # URLS_TESTED.append(
            #     URLTest(link, 'prev_found', current_depth, head))
            return

    if search_internal:
        ee = [e for e in search_internal if e in link]
        if not ee:
            URLS_TESTED.append(
                URLTest(link, 'not followed', current_depth, head))
            return

    try:
        r = requests.get(link, timeout=2)
        if r.status_code != 200:
            r = requests.get(link + '/', timeout=2)
    except Exception as e:
        URLS_TESTED.append(
            URLTest(link, 'Error: ' + str(e), current_depth, head))
        print(e)
        return

    URLS_TESTED.append(
        URLTest(link, r.status_code, current_depth, head))

    if r.status_code != 200:
        return
    if current_depth + 1 > depth_limit:
        return

    # now going to iterate through its links:
    soup = BeautifulSoup(r.text, "lxml")
    atags = soup.find_all('a')
    links = [a.get('href') for a in atags]
    links = set(links)
    # need this to skip the onhover links or any link that doesn't have href in it
    links = [x for x in links if x is not None]

    for href in links:
        # relative link: resolve it against the scheme and host of the current page
        if 'http' not in href:
            href = href.lstrip('/')
            link_array = link.split('/')
            href = '{}//{}/{}'.format(link_array[0], link_array[2], href)
        url_search(href, depth_limit=depth_limit, current_depth=current_depth + 1, head=link,
                   search_internal=search_internal, except_strings=except_strings)


URLS_TESTED = []  # module-level accumulator of URLTest results


def main():
    depth_limit = 2
    url_type = 'data'
    base_url = 'https://neurodata.io/{}/'.format(url_type)

    # tab delimited text file (some of the links have commas)
    filename = 'links_{}.txt'.format(url_type).replace('/', '_')

    # skip links that contain these strings
    except_strings = ['mendeley', 'mailto', '.pdf', '.tar.gz', '.zip']

    # search_internal = ['neurodata.io', 'openconnecto.me', 'github.com']
    search_internal = []  # empty list will search all links

    # base_links: print every link found on the starting page
    r = requests.get(base_url)
    soup = BeautifulSoup(r.content, "lxml")
    for link in soup.find_all('a'):
        print(link.get('href'))
    print('')
    print('')

    # iterating through all the links, printing the good and bad ones
    url_search(base_url, depth_limit=depth_limit,
               search_internal=search_internal, except_strings=except_strings)

    URLS_TESTED.sort(key=lambda x: (
        x.current_depth, x.head, str(x.status_code), x.link))

    with open(filename, 'w') as f:
        f.write('link\tstatus\tdepth\thead\n')
    for u in URLS_TESTED:
        write_to_csv(filename, u.link, u.status_code, u.current_depth, u.head)


if __name__ == '__main__':
    main()
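
The script writes its results to a tab-separated file (links_data.txt with the defaults in main() above, since url_type is 'data'). Below is a minimal sketch, not part of the gist, of reading that output back and printing every link whose recorded status is not 200; it assumes the status column holds either an HTTP status code, 'not followed', or an 'Error: ...' string, exactly as url_search records them.

''' example reader for the tab-separated output (illustrative sketch) '''
import csv

with open('links_data.txt', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        # keep only rows whose status is not a plain 200
        if row['status'] != '200':
            print(row['status'], row['link'], 'found on', row['head'])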