from lxml import html
import requests
from lxml.cssselect import CSSSelector
from urllib.parse import urljoin
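# Dependency note (not in the original gist): lxml's CSSSelector requires the
# separate `cssselect` package at import time, so a typical install is
#   pip install requests lxml cssselect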

# checks to see if the status code is less than 400 (i.e. not an error)
def url_ok(url):
    '''
    Function to get a webpage HEAD and return True if the status code is
    less than 400.

    References
    https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python
    '''
    r = requests.head(url)
    return r.status_code < 400
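
# A HEAD request can raise (e.g. ConnectionError, Timeout) for unreachable
# hosts, which would abort the crawl loop below. A minimal hedged alternative
# that treats any request exception as "not ok"; `url_ok_safe` is an
# illustrative helper, not part of the original gist:
def url_ok_safe(url):
    '''
    Like url_ok, but returns False instead of raising when the request fails.
    '''
    try:
        # follow redirects so a 301/302 to a live page still counts as ok
        r = requests.head(url, allow_redirects=True, timeout=10)
        return r.status_code < 400
    except requests.RequestException:
        return False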

def extract_href(url, element_selector, filepath=None):
    '''
    Function to parse urls from the html of a given url
    url: input url that is read as html
    element_selector: css element selector
    filepath: (optional) filepath of the output file
    returns valid urls

    References
    https://stackoverflow.com/questions/8656707/python-css-selector-to-use-inside-lxml-cssselect
    http://pwp.stevecassidy.net/dataweb/crawling.html
    '''
    page = requests.get(url).text
    doc = html.fromstring(page)
    # selector for all matching elements, and a narrower one for those
    # that carry an href attribute
    html_select = CSSSelector(element_selector)
    select = CSSSelector(element_selector + '[href]')
    # parsed href links
    parsed_links = [element.get('href') for element in select(doc)]
    # list to hold absolute urls
    links = []
    # build list of links, converting to absolute urls if needed
    for newurl in parsed_links:
        if newurl.startswith('/'):
            newurl = urljoin(url, newurl)
        # ignore any URL that doesn't now start with http
        if newurl.startswith('http'):
            links.append(newurl)
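    # Illustrative note (not in the original gist): urljoin resolves a
    # root-relative path against the page's scheme and host, e.g.
    #   urljoin('https://www.epa.gov/endangered-species/page', '/foo')
    #   returns 'https://www.epa.gov/foo'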
    # list to hold valid urls
    valid_l = []
    # for each unique url, request the HEAD and keep the url if the status
    # code indicates success
    for item in set(links):
        if url_ok(item):
            valid_l.append(item)
    # if a filepath is set then write the urls to the file and print
    # statistics to the screen
    if filepath:
        with open(filepath, 'w') as file_handler:
            for item in valid_l:
                # write items one per line; the file ends with a newline
                file_handler.write("{}\n".format(item))
        # print some url statistics to the screen:
        # [Number] matched HTML elements
        # [Number] had an href attribute
        # [Number] were duplicates or not valid absolute url(s)
        # [Number] were valid
        print('{:>10d}'.format(len(html_select(doc))), 'matched HTML elements')
        parsed_links_len = len(parsed_links)
        print('{:>10d}'.format(parsed_links_len), 'had an href attribute')
        print('{:>10d}'.format(parsed_links_len - len(set(links))),
              'were duplicates or not valid absolute url(s)')
        print('{:>10d}'.format(len(valid_l)), 'were valid')
    else:
        # if not writing to a file, print to the screen
        for item in valid_l:
            print(item)
    return valid_l

if __name__ == "__main__":
    # test case
    url = 'https://www.epa.gov/endangered-species/biological-evaluation-chapters-chlorpyrifos-esa-assessment'
    out_path = 'out_test.txt'
    links = extract_href(url, '.main-column.clearfix a', filepath=out_path)
    # expected output for the test case:
    '''
      165 matched HTML elements
      153 had an href attribute
       27 were duplicates or not valid absolute url(s)
      126 were valid
    '''
    print('DONE!')