from lxml import html
import requests
from lxml.cssselect import CSSSelector
from urllib.parse import urljoin
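# Dependency note (not in the original gist): lxml's CSSSelector requires the
# separate `cssselect` package at import time, so a typical install is
#   pip install requests lxml cssselect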

# checks to see if the status code is less than 400 (i.e. not an error)
def url_ok(url):
    '''
    Function to get a webpage HEAD and return True if the status code is
    less than 400.

    References
    https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python
    '''
    r = requests.head(url)
    return r.status_code < 400
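
# A HEAD request can raise (e.g. ConnectionError, Timeout) for unreachable
# hosts, which would abort the crawl loop below. A minimal hedged alternative
# that treats any request exception as "not ok"; `url_ok_safe` is an
# illustrative helper, not part of the original gist:
def url_ok_safe(url):
    '''
    Like url_ok, but returns False instead of raising when the request fails.
    '''
    try:
        # follow redirects so a 301/302 to a live page still counts as ok
        r = requests.head(url, allow_redirects=True, timeout=10)
        return r.status_code < 400
    except requests.RequestException:
        return False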

def extract_href(url, element_selector, filepath=None):
    '''
    Function to parse urls from the html of a given url
    url: input url that is read as html
    element_selector: css element selector
    filepath: (optional) filepath of the output file
    returns valid urls

    References
    https://stackoverflow.com/questions/8656707/python-css-selector-to-use-inside-lxml-cssselect
    http://pwp.stevecassidy.net/dataweb/crawling.html
    '''
    page = requests.get(url).text
    doc = html.fromstring(page)
    # selector for all matching elements, and a narrower one for those
    # that carry an href attribute
    html_select = CSSSelector(element_selector)
    select = CSSSelector(element_selector + '[href]')
    # parsed href links
    parsed_links = [element.get('href') for element in select(doc)]
    # list to hold absolute urls
    links = []
    # build list of links, converting to absolute urls if needed
    for newurl in parsed_links:
        if newurl.startswith('/'):
            newurl = urljoin(url, newurl)
        # ignore any URL that doesn't now start with http
        if newurl.startswith('http'):
            links.append(newurl)
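    # Illustrative note (not in the original gist): urljoin resolves a
    # root-relative path against the page's scheme and host, e.g.
    #   urljoin('https://www.epa.gov/endangered-species/page', '/foo')
    #   returns 'https://www.epa.gov/foo'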
    # list to hold valid urls
    valid_l = []
    # for each unique url, request the HEAD and keep the url if the status
    # code indicates success
    for item in set(links):
        if url_ok(item):
            valid_l.append(item)
    # if a filepath is set then write the urls to the file and print
    # statistics to the screen
    if filepath:
        with open(filepath, 'w') as file_handler:
            for item in valid_l:
                # write items one per line; the file ends with a newline
                file_handler.write("{}\n".format(item))
        # print some url statistics to the screen:
        # [Number] matched HTML elements
        # [Number] had an href attribute
        # [Number] were duplicates or not valid absolute url(s)
        # [Number] were valid
        print('{:>10d}'.format(len(html_select(doc))), 'matched HTML elements')
        parsed_links_len = len(parsed_links)
        print('{:>10d}'.format(parsed_links_len), 'had an href attribute')
        print('{:>10d}'.format(parsed_links_len - len(set(links))),
              'were duplicates or not valid absolute url(s)')
        print('{:>10d}'.format(len(valid_l)), 'were valid')
    else:
        # if not writing to a file, print to the screen
        for item in valid_l:
            print(item)
    return valid_l

if __name__ == "__main__":
    # test case
    url = 'https://www.epa.gov/endangered-species/biological-evaluation-chapters-chlorpyrifos-esa-assessment'
    out_path = 'out_test.txt'
    links = extract_href(url, '.main-column.clearfix a', filepath=out_path)
    # expected output for the test case:
    '''
      165 matched HTML elements
      153 had an href attribute
       27 were duplicates or not valid absolute url(s)
      126 were valid
    '''
    print('DONE!')