Last active
December 28, 2016 01:00
-
-
Save ryansmccoy/4aeab6b468724c7ef5cb471aa5ffb578 to your computer and use it in GitHub Desktop.
Searching company homepages for career-page URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys | |
from bs4 import BeautifulSoup | |
import lxml | |
from pyquery import PyQuery as pq | |
import tldextract | |
import html5lib | |
import redis | |
# Connection to the local Redis instance holding the scraped homepages.
# Per the example below, keys are homepage URLs and values are raw HTML.
r = redis.StrictRedis(
    host='localhost',
    port=32768,
    db=0,
)
#### mess around with first item in redis db0
# example redis database key:value i.e.:
# "http://www.12thstreetasset.com/":"<!DOCTYPE html PUBLIC "-//W3C//DT"....
# Fetch the key list once instead of calling r.keys() twice: each call is a
# separate round-trip and a blocking O(n) scan on the Redis server.
first_key = r.keys()[0]
item_key_url = first_key.decode('utf-8')            # the homepage URL
item_value_html = r.get(first_key).decode('utf-8')  # raw HTML stored under it
item_soup = BeautifulSoup(item_value_html, 'lxml')
item_links = item_soup.find_all('a')                # every <a> tag on the page
#### this for loop creates both a list of dictionaries and individual files from redis db0
output_folder = r'c:\html'
output_extention = '.html'
list_of_soup = []
for key in r.keys():
    website_url = key.decode('utf-8')
    tld = tldextract.extract(website_url)
    # ExtractResult(subdomain='www', domain='12thstreetasset', suffix='com')
    # Pass the bytes key straight to GET -- decoding it first just forces
    # redis-py to re-encode the same value.
    # html5lib parser because it is the most forgiving of broken markup.
    soup = BeautifulSoup(r.get(key), 'html5lib')
    p = pq(soup.prettify())
    ab = p.make_links_absolute(base_url=website_url)
    # turns relative links like /careers into http://www.12thstreetasset.com/careers
    list_of_soup.append({tld.domain: ab.html()})
    # Optionally also dump each page to disk:
    # with open(os.path.join(output_folder, tld.domain + output_extention), 'w', encoding='utf-8') as f:
    #     print(soup.prettify(), file=f)
#### list_of_soup = [{'12thstreetasset': '<!DOCTYPE html PUBLIC "-//W3C//DT....'}, {...}]
#### this for loop goes through the list of dictionaries and searches each page's html for career urls
links = []
for page in list_of_soup:
    # page is a one-entry dict {domain: html}; grab its only value.
    html = str(list(page.values())[0])
    # Cheap substring test first so we only run the (expensive) parse on
    # pages that mention "career" at all.
    if 'career' not in html:
        continue
    for anchor in BeautifulSoup(html, 'lxml').find_all('a'):
        # Anchors without an href attribute raise KeyError on subscript;
        # catch only that, instead of a bare `except` that would also
        # swallow genuine bugs (NameError, KeyboardInterrupt, ...).
        try:
            href = anchor['href']
        except KeyError:
            continue
        if 'career' in href:
            links.append(href)
# Deduplicate (set() loses order, which this script does not rely on).
nodupes = list(set(links))
# e.g. ['http://www.guggenheimpartners.com/firm/careers', 'http://www.penncapital.com/about/careers', ...]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment