Last active
December 28, 2016 01:00
-
-
Save ryansmccoy/4aeab6b468724c7ef5cb471aa5ffb578 to your computer and use it in GitHub Desktop.
Searching company homepages for career-page URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys | |
from bs4 import BeautifulSoup | |
import lxml | |
from pyquery import PyQuery as pq | |
import tldextract | |
import html5lib | |
import redis | |
# Connection to the local Redis instance holding the scraped homepages.
# Per the example below, keys are homepage URLs and values are raw HTML.
r = redis.StrictRedis(
    host='localhost',
    port=32768,
    db=0,
)
#### mess around with first item in redis db0
# example redis database key:value i.e.:
# "http://www.12thstreetasset.com/":"<!DOCTYPE html PUBLIC "-//W3C//DT"....
# Fetch the key list once instead of calling r.keys() twice: each call is a
# separate round-trip and a blocking O(n) scan on the Redis server.
first_key = r.keys()[0]
item_key_url = first_key.decode('utf-8')            # the homepage URL
item_value_html = r.get(first_key).decode('utf-8')  # raw HTML stored under it
item_soup = BeautifulSoup(item_value_html, 'lxml')
item_links = item_soup.find_all('a')                # every <a> tag on the page
#### this for loop creates both a list of dictionaries and individual files from redis db0
output_folder = r'c:\html'
output_extention = '.html'
list_of_soup = []
for key in r.keys():
    website_url = key.decode('utf-8')
    tld = tldextract.extract(website_url)
    # ExtractResult(subdomain='www', domain='12thstreetasset', suffix='com')
    # Pass the bytes key straight to GET -- decoding it first just forces
    # redis-py to re-encode the same value.
    # html5lib parser because it is the most forgiving of broken markup.
    soup = BeautifulSoup(r.get(key), 'html5lib')
    p = pq(soup.prettify())
    ab = p.make_links_absolute(base_url=website_url)
    # turns relative links like /careers into http://www.12thstreetasset.com/careers
    list_of_soup.append({tld.domain: ab.html()})
    # Optionally also dump each page to disk:
    # with open(os.path.join(output_folder, tld.domain + output_extention), 'w', encoding='utf-8') as f:
    #     print(soup.prettify(), file=f)
#### list_of_soup = [{'12thstreetasset': '<!DOCTYPE html PUBLIC "-//W3C//DT....'}, {...}]
#### this for loop goes through the list of dictionaries and searches each page's html for career urls
links = []
for page in list_of_soup:
    # page is a one-entry dict {domain: html}; grab its only value.
    html = str(list(page.values())[0])
    # Cheap substring test first so we only run the (expensive) parse on
    # pages that mention "career" at all.
    if 'career' not in html:
        continue
    for anchor in BeautifulSoup(html, 'lxml').find_all('a'):
        # Anchors without an href attribute raise KeyError on subscript;
        # catch only that, instead of a bare `except` that would also
        # swallow genuine bugs (NameError, KeyboardInterrupt, ...).
        try:
            href = anchor['href']
        except KeyError:
            continue
        if 'career' in href:
            links.append(href)
# Deduplicate (set() loses order, which this script does not rely on).
nodupes = list(set(links))
# e.g. ['http://www.guggenheimpartners.com/firm/careers', 'http://www.penncapital.com/about/careers', ...]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment