Skip to content

Instantly share code, notes, and snippets.

View chris-lovejoy's full-sized avatar

ChrisLovejoy chris-lovejoy

View GitHub Profile
@chris-lovejoy
chris-lovejoy / job_scraper.py
Created May 1, 2020 11:26
Import statements
import urllib
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import os
@chris-lovejoy
chris-lovejoy / job_scraper.py
Last active May 1, 2020 13:31
initiate driver
def initiate_driver(location_of_driver, browser):
    """Create a Selenium WebDriver for the requested browser.

    Args:
        location_of_driver: Directory that contains the driver executable
            (e.g. "chromedriver" for Chrome).
        browser: One of 'chrome', 'firefox', 'safari' or 'edge'.

    Returns:
        A started Selenium WebDriver instance.

    Raises:
        ValueError: If *browser* is not one of the supported names.
    """
    if browser == 'chrome':
        driver = webdriver.Chrome(executable_path=(location_of_driver + "/chromedriver"))
    elif browser == 'firefox':
        driver = webdriver.Firefox(executable_path=(location_of_driver + "/firefoxdriver"))
    elif browser == 'safari':
        driver = webdriver.Safari(executable_path=(location_of_driver + "/safaridriver"))
    elif browser == 'edge':
        driver = webdriver.Edge(executable_path=(location_of_driver + "/edgedriver"))
    else:
        # Bug fix: an unrecognised browser previously fell through every
        # branch and raised UnboundLocalError on 'driver' at the return
        # statement; fail with a clear, catchable error instead.
        raise ValueError("Unsupported browser: " + repr(browser))
    return driver
@chris-lovejoy
chris-lovejoy / video_finder.py
Last active June 17, 2020 15:24
ranking by view-to-subscriber ratio
import pandas as pd
def find_title(item):
    """Return the video title nested under item['snippet']['title']."""
    return item['snippet']['title']
def find_viewcount(item, youtube_api):
    """Return the view count of a YouTube search-result *item* as an int.

    Args:
        item: Search-result dict exposing item['id']['videoId'].
        youtube_api: YouTube Data API client exposing videos().list().

    Returns:
        int: The video's current view count.
    """
    video_id = item['id']['videoId']
    # Search results do not carry statistics, so one extra videos().list
    # call per video is needed to fetch the view count.
    video_statistics = youtube_api.videos().list(id=video_id, part='statistics').execute()
    viewcount = int(video_statistics['items'][0]['statistics']['viewCount'])
    # Bug fix: the computed value was never returned to the caller.
    return viewcount
@chris-lovejoy
chris-lovejoy / video_finder.py
Created June 17, 2020 15:30
including date published in model
from datetime import datetime, timedelta
# creating variable for time one week ago
today_date = datetime.today()
one_week_ago_date = today_date - timedelta(days=7)
# Truncate to midnight, then render in the ISO-8601 'Z' form expected by
# the API's 'publishedAfter' filter.
one_week_ago_string = one_week_ago_date.replace(
    hour=0, minute=0, second=0, microsecond=0).strftime('%Y-%m-%dT%H:%M:%SZ')
# updating the search by adding 'publishedAfter'
@chris-lovejoy
chris-lovejoy / job_scraper.py
Last active October 5, 2020 16:15
loading job soup
def load_indeed_jobs_div(job_title, location):
    """Fetch an Indeed UK search-results page and return its results column.

    Args:
        job_title: Search query for the 'q' parameter.
        location: Location string for the 'l' parameter.

    Returns:
        The BeautifulSoup element with id="resultsCol", or None if absent.
    """
    query = urllib.parse.urlencode(
        {'q': job_title, 'l': location, 'fromage': 'last', 'sort': 'date'})
    response = requests.get('https://www.indeed.co.uk/jobs?' + query)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find(id="resultsCol")
@chris-lovejoy
chris-lovejoy / job_scraper.py
Last active October 5, 2020 16:17
job detail extraction
def extract_job_title_indeed(job_elem):
    """Return the whitespace-stripped job title from an Indeed job card.

    The title lives in the card's <h2 class="title"> element.
    """
    return job_elem.find('h2', class_='title').text.strip()
def extract_company_indeed(job_elem):
    """Return the whitespace-stripped company name from an Indeed job card.

    The company name lives in the card's <span class="company"> element.
    """
    return job_elem.find('span', class_='company').text.strip()
@chris-lovejoy
chris-lovejoy / job_scraper.py
Created May 1, 2020 12:08
Finding each job card
# Collect one element per job listing: each Indeed result is rendered as a
# <div class="jobsearch-SerpJobCard"> inside the results-column soup.
job_elems = job_soup.find_all('div', class_='jobsearch-SerpJobCard')
@chris-lovejoy
chris-lovejoy / job_scraper.py
Created May 1, 2020 12:10
Creating lists for each job characteristic
# Gather the requested job characteristics; each entry of extracted_info
# is a column of values aligned with its column name in cols.
cols = []
extracted_info = []
if 'titles' in desired_characs:
    cols.append('titles')
    titles = [extract_job_title_indeed(job_elem) for job_elem in job_elems]
    extracted_info.append(titles)
@chris-lovejoy
chris-lovejoy / job_scraper.py
Created May 1, 2020 12:11
Creating the final jobs list
# Pair each column name with its extracted values to build the final
# mapping, then count listings from the first extracted column.
jobs_list = {}
for position, column in enumerate(cols):
    jobs_list[column] = extracted_info[position]
num_listings = len(extracted_info[0])
@chris-lovejoy
chris-lovejoy / job_scraper.py
Created May 1, 2020 12:28
Exporting information to file
def save_jobs_to_excel(jobs_list, filename):
    """Write the scraped job data to an Excel workbook.

    Args:
        jobs_list: Mapping of column name -> list of values, one list
            entry per job listing.
        filename: Destination path for the .xlsx file.
    """
    frame = pd.DataFrame(jobs_list)
    frame.to_excel(filename)