from lxml import html
import requests
from lxml.cssselect import CSSSelector
from urllib.parse import urljoin


# checks to see if the status code is less than or equal to 400
def url_ok(url):
    '''
    Function to get a webpage HEAD and return True if the status code is less
    than or equal to 400

    References
    https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python
    '''
    r = requests.head(url)
    return r.status_code <= 400
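
# A minimal usage sketch for url_ok (the URL below is illustrative only):
#     url_ok('https://www.example.com')  # True when the HEAD status code is <= 400
# Note that requests.head() raises a requests.exceptions.RequestException
# (e.g. ConnectionError or Timeout) if the request itself fails, so callers
# that need robustness may want to wrap the call in a try/except.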


def extract_href(url, element_selector, filepath=None):
    '''
    Function to parse urls from the html of a given url
    url: input url that is read as html
    element_selector: css element selector
    filepath: (optional) filepath of the output file
    returns valid urls

    References
    https://stackoverflow.com/questions/8656707/python-css-selector-to-use-inside-lxml-cssselect
    http://pwp.stevecassidy.net/dataweb/crawling.html
    '''
    page = requests.get(url).text
    doc = html.fromstring(page)
    html_select = CSSSelector(element_selector)
    select = CSSSelector(element_selector + '[href]')
    # parsed href links
    parsed_links = [element.get('href') for element in select(doc)]
    # list to hold absolute urls
    links = []
    # build the list of links, converting to absolute urls if needed
    for newurl in parsed_links:
        if newurl.startswith('/'):
            newurl = urljoin(url, newurl)
        # ignore any URL that doesn't now start with http
        if newurl.startswith('http'):
            links.append(newurl)
    # list to hold valid urls
    valid_l = []
    # for each unique url, request the HEAD and keep the url if the status is OK
    for item in set(links):
        if url_ok(item):
            valid_l.append(item)
    # if a filepath is set, write the urls to the file and print
    # statistics to the screen
    if filepath:
        with open(filepath, 'w') as file_handler:
            for item in valid_l:
                # one url per line; the last line in the file is a newline
                file_handler.write("{}\n".format(item))
        # print some url statistics to the screen
        # [Number] had HTML attributes
        # [Number] had an href attribute
        # [Number] were duplicates or not valid absolute url(s)
        # [Number] were valid
        print('{:>10d}'.format(len(html_select(doc))), 'had HTML attributes')
        parsed_links_len = len(parsed_links)
        print('{:>10d}'.format(parsed_links_len), 'had an href attribute')
        print('{:>10d}'.format(parsed_links_len - len(set(links))),
              'were duplicates or not valid absolute url(s)')
        print('{:>10d}'.format(len(valid_l)), 'were valid')
    else:
        # if not writing to a file, print the urls to the screen
        for item in valid_l:
            print(item)
    return valid_l
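

# A usage sketch (the URL and selector here are illustrative only); when no
# filepath is given, the valid urls are printed to the screen instead of
# being written to a file:
#     extract_href('https://www.example.com', 'div.content a')
# The __main__ block below runs the same function against an EPA page as a
# test case and writes the results to out_test.txt.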


if __name__ == "__main__":
    # test case
    url = 'https://www.epa.gov/endangered-species/biological-evaluation-chapters-chlorpyrifos-esa-assessment'
    out_path = 'out_test.txt'
    links = extract_href(url, '.main-column.clearfix a', filepath=out_path)
    # expected output for testing
    '''
    165 had html attributes
    153 had href attributes
    27 were duplicates
    126 were valid

    from go
    165 matched HTML elements
    153 had a href attribute.
    27 were duplicates
    126 were valid
    '''
    print('DONE!')