scrapes a website for all the URLs below it
''' scrapes a website for urls '''

import requests
from bs4 import BeautifulSoup


class URLTest():
    ''' holds the result of testing a single link '''

    def __init__(self, link, status_code, current_depth, head):
        self.link = link
        self.status_code = status_code
        self.current_depth = current_depth
        self.head = head


def write_to_csv(filename, link, status, cur_depth, head):
    ''' actually tab separated '''
    with open(filename, 'a') as f:
        f.write('{}\t{}\t{}\t{}\n'.format(link, status, cur_depth, head))


def url_search(link, depth_limit=3, current_depth=0, head='', search_internal=[], except_strings=[]):
    '''
    recursively searches for links in an html tree
    '''
    print(link, current_depth)

    # excluding links with particular strings in them
    if except_strings:
        ee = [e for e in except_strings if e in link]
        if ee:
            return

    # check if in tested links, if so, grab that status result...
    u_prev = list(filter(lambda u: u.link == link, URLS_TESTED))
    u_followed = [
        u if 'not followed' != u.status_code else None for u in u_prev]
    u_followed = [x for x in u_followed if x is not None]
    if u_followed:
        depths = [u.current_depth for u in u_followed]
        s_codes = [u.status_code for u in u_followed]
        min_depth = min(depths)
        idx = depths.index(min_depth)
        if current_depth >= min_depth:
            URLS_TESTED.append(
                URLTest(link, s_codes[idx], current_depth, head))
            # URLS_TESTED.append(
            #     URLTest(link, 'prev_found', current_depth, head))
            return

    if search_internal:
        ee = [e for e in search_internal if e in link]
        if not ee:
            URLS_TESTED.append(
                URLTest(link, 'not followed', current_depth, head))
            return

    try:
        r = requests.get(link, timeout=2)
        if r.status_code != 200:
            r = requests.get(link + '/', timeout=2)
    except Exception as e:
        URLS_TESTED.append(
            URLTest(link, 'Error: ' + str(e), current_depth, head))
        print(e)
        return

    URLS_TESTED.append(
        URLTest(link, r.status_code, current_depth, head))

    if r.status_code != 200:
        return
    if current_depth + 1 > depth_limit:
        return

    # now going to iterate through its links:
    soup = BeautifulSoup(r.text, "lxml")
    atags = soup.find_all('a')
    links = [a.get('href') for a in atags]
    links = set(links)
    # need this to skip the onhover links or any link that doesn't have href in it
    links = [x for x in links if x is not None]

    for href in links:
        # relative link: resolve it against the scheme and host of the current page
        if 'http' not in href:
            href = href.lstrip('/')
            link_array = link.split('/')
            href = '{}//{}/{}'.format(link_array[0], link_array[2], href)
        url_search(href, depth_limit=depth_limit, current_depth=current_depth + 1, head=link,
                   search_internal=search_internal, except_strings=except_strings)


URLS_TESTED = []  # module-level accumulator of URLTest results


def main():
    depth_limit = 2
    url_type = 'data'
    base_url = 'https://neurodata.io/{}/'.format(url_type)

    # tab delimited text file (some of the links have commas)
    filename = 'links_{}.txt'.format(url_type).replace('/', '_')

    # skip links that contain these strings
    except_strings = ['mendeley', 'mailto', '.pdf', '.tar.gz', '.zip']

    # search_internal = ['neurodata.io', 'openconnecto.me', 'github.com']
    search_internal = []  # empty list will search all links

    # base_links: print every link found on the starting page
    r = requests.get(base_url)
    soup = BeautifulSoup(r.content, "lxml")
    for link in soup.find_all('a'):
        print(link.get('href'))
    print('')
    print('')

    # iterating through all the links, printing the good and bad ones
    url_search(base_url, depth_limit=depth_limit,
               search_internal=search_internal, except_strings=except_strings)

    URLS_TESTED.sort(key=lambda x: (
        x.current_depth, x.head, str(x.status_code), x.link))

    with open(filename, 'w') as f:
        f.write('link\tstatus\tdepth\thead\n')
    for u in URLS_TESTED:
        write_to_csv(filename, u.link, u.status_code, u.current_depth, u.head)


if __name__ == '__main__':
    main()
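
The script writes its results to a tab-separated file (links_data.txt with the defaults in main() above, since url_type is 'data'). Below is a minimal sketch, not part of the gist, of reading that output back and printing every link whose recorded status is not 200; it assumes the status column holds either an HTTP status code, 'not followed', or an 'Error: ...' string, exactly as url_search records them.

''' example reader for the tab-separated output (illustrative sketch) '''
import csv

with open('links_data.txt', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        # keep only rows whose status is not a plain 200
        if row['status'] != '200':
            print(row['status'], row['link'], 'found on', row['head'])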