@ryansmccoy
Last active December 28, 2016 01:00
Searching homepages for career URLs
import os

from bs4 import BeautifulSoup
import lxml      # parser backend for BeautifulSoup; imported so a missing install fails early
import html5lib  # more forgiving parser backend, used in the loop below
from pyquery import PyQuery as pq
import tldextract
import redis

# connect to a local Redis instance (port 32768 here, e.g. a Docker-mapped port)
r = redis.StrictRedis(host='localhost', port=32768, db=0)
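#### hedged sketch (an addition, not from the original gist): one way db0 could
#### have been populated, assuming a list of homepage urls named homepage_urls
# import requests
# for url in homepage_urls:
#     resp = requests.get(url, timeout=10)
#     r.set(url, resp.text)  # key = homepage url, value = raw html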
#### mess around with first item in redis db0
# example redis database key:value i.e.:
# "http://www.12thstreetasset.com/":"<!DOCTYPE html PUBLIC "-//W3C//DT"....
first_key = r.keys()[0]
item_key_url = first_key.decode('utf-8')
item_value_html = r.get(first_key).decode('utf-8')
item_soup = BeautifulSoup(item_value_html, 'lxml')
item_links = item_soup.find_all('a')
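#### illustrative addition (not in the original gist): quick sanity check that
#### the first page parsed; .get('href') returns None when the attribute is absent
for a in item_links[:10]:
    print(a.get('href'))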
#### this for loop creates both a list of dictionaries and individual files from redis db0
output_folder = r'c:\html'
output_extension = '.html'
list_of_soup = []
for i in r.keys():
    website_url = i.decode('utf-8')
    tld = tldextract.extract(website_url)
    # ExtractResult(subdomain='www', domain='12thstreetasset', suffix='com')
    soup = BeautifulSoup(r.get(i), 'html5lib')
    # using html5lib because it is more forgiving of malformed markup
    p = pq(soup.prettify())
    ab = p.make_links_absolute(base_url=website_url)
    # turns relative links like /careers into absolute links like http://www.12thstreetasset.com/careers
    list_of_soup.append({tld.domain: ab.html()})
    # creates a dictionary and appends it to the list called list_of_soup
    # with open(os.path.join(output_folder, tld.domain + output_extension), 'w', encoding='utf-8') as f:
    #     print(soup.prettify(), file=f)
    # uncomment to save one file per site in output_folder
#### list_of_soup = [{'12thstreetasset': '<!DOCTYPE html PUBLIC "-//W3C//DT....'}, {...}]
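#### standalone demo (an addition, not from the original gist) showing what
#### make_links_absolute does; the example domain mirrors the one above
# demo = pq('<div><a href="/careers">Careers</a></div>')
# demo.make_links_absolute(base_url='http://www.12thstreetasset.com')
# demo('a').attr('href')  # -> 'http://www.12thstreetasset.com/careers'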
#### this for loop goes through the list of dictionaries and collects career urls from the html
links = []
for i in list_of_soup:
    html = str(list(i.values())[0])
    if "career" in html:
        # only parse pages whose html mentions the word "career" at all
        anchors = BeautifulSoup(html, 'lxml').find_all('a')
        # loop through every anchor and keep any href containing "career"
        for j in anchors:
            try:
                if 'career' in str(j['href']):
                    links.append(str(j['href']))
            except KeyError:
                # anchor has no href attribute
                pass
nodupes = list(set(links))
# remove duplicate urls; example result:
# ['http://www.guggenheimpartners.com/firm/careers', 'http://www.penncapital.com/about/careers', 'https://corporate.waddell.com/careers', ...
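#### hedged sketch (an addition, not from the original gist): persist the
#### deduplicated links; the filename career_links.txt is an assumption
with open('career_links.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(nodupes)))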