{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overview\n",
"\n",
"The \"Google for Jobs\" interface has a list of jobs on the left side of the screen, and a larger panel on the right side with full information about each job. We will perform a nightly scrape of the website, in which we filter for the past day's worth of postings. \n",
"\n",
"I used this [google-jobs-scraper](https://github.com/AliMahmoud7/google-jobs-scraper) repo as inspiration, but it is designed to scrape the Google Careers page, not the Google For Jobs page.\n",
"\n",
"#### Approach\n",
"\n",
"1. Select the URL from which to scrape using a manual query. Apply all necessary filters.\n",
" - \"Data scientist\", \"data analyst\" or \"data engineer\"\n",
" - Within 60 miles of Nashville, TN\n",
" - Posted in the last day\n",
"2. Use Selenium to click on each of the postings on the left. After clicking on each posting, refresh the list of available posts, since they do not all appear on the first click. \n",
"3. For each clicked posting, parse the larger text on the right to get the available information. For many of these, the \"Read More\" button will need to be clicked.\n",
" - DatePosted\n",
" - JobTitle\n",
" - Company\n",
" - Location\n",
" - HostSites\n",
" - JobType (Full / Part / Internship)\n",
" - JobDescription\n",
" - JobWebsite (Google + Website)"
]
},
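{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inspecting the URL filters\n",
"\n",
"As a quick aside before the scraper itself, the next cell pulls apart the default search URL used by `prepare_url` below. Google for Jobs encodes the active filters in the URL *fragment* (everything after `#`), so they can be inspected with nothing more than `urllib.parse`. The parameter names (`htichips`, `htilrad`) come straight from the captured URL; `htilrad=96.5604` appears to be the search radius in kilometers (96.56 km is about 60 miles, matching the radius filter above)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import urlsplit, parse_qsl\n",
"\n",
"# Same default URL that prepare_url() falls back to below\n",
"default_url = 'https://www.google.com/search?q=data+science+jobs+nashville&ibp=htl;jobs&t=sq&li=20&st=0&jlo=all&t=sq&li=20&st=0&jlo=all#fpstate=tldetail&htichips=city:PZDrEzLsZIig2umh0Lk_fQ%3D%3D,job_family_1:data%20scientist,job_family_1:data%20analyst,job_family_1:data%20engineer,date_posted:today&htidocid=6WPSbX_Rgg6Q5dNoAAAAAA%3D%3D&htilrad=96.5604&htischips=city;PZDrEzLsZIig2umh0Lk_fQ%3D%3D:Nashville_comma_%20TN,job_family_1;data%20scientist;data%20analyst;data%20engineer,date_posted;today&htivrt=jobs&st=0&t=sq'\n",
"\n",
"# The filters live in the fragment (after '#'), not the query string\n",
"fragment_params = dict(parse_qsl(urlsplit(default_url).fragment))\n",
"\n",
"# Each comma-separated chip is one active filter (city, job family, date)\n",
"for chip in fragment_params['htichips'].split(','):\n",
"    print(chip)\n",
"\n",
"# Search radius, in kilometers (96.5604 km is roughly 60 miles)\n",
"print('radius (km):', fragment_params['htilrad'])"
]
},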
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.common.exceptions import TimeoutException\n",
"from bs4 import BeautifulSoup\n",
"import json\n",
"import time\n",
"\n",
"def prepare_url(url=None):\n",
" default_url = 'https://www.google.com/search?q=data+science+jobs+nashville&ibp=htl;jobs&t=sq&li=20&st=0&jlo=all&t=sq&li=20&st=0&jlo=all#fpstate=tldetail&htichips=city:PZDrEzLsZIig2umh0Lk_fQ%3D%3D,job_family_1:data%20scientist,job_family_1:data%20analyst,job_family_1:data%20engineer,date_posted:today&htidocid=6WPSbX_Rgg6Q5dNoAAAAAA%3D%3D&htilrad=96.5604&htischips=city;PZDrEzLsZIig2umh0Lk_fQ%3D%3D:Nashville_comma_%20TN,job_family_1;data%20scientist;data%20analyst;data%20engineer,date_posted;today&htivrt=jobs&st=0&t=sq'\n",
" if url is None:\n",
" url = default_url \n",
" params = {\n",
" 't': 'sq',\n",
" 'li': '20',\n",
" 'st': '0',\n",
" 'jlo': 'all'\n",
" }\n",
" result = requests.get(url=url, params=params)\n",
" prep_url = result.url\n",
" return url\n",
"\n",
"def prepare_browser():\n",
" # Create Web Driver\n",
" chrome_options = webdriver.ChromeOptions()\n",
"# chrome_options.add_argument('headless')\n",
"\n",
" # Creates and open a new instance of the chrome driver\n",
" path_to_chromedriver = './chromedriver.exe'\n",
" browser = webdriver.Chrome(path_to_chromedriver,\n",
" chrome_options=chrome_options)\n",
" return browser\n",
"\n",
"\n",
"def parse_job(html):\n",
" soup = BeautifulSoup(html, 'html.parser')\n",
"\n",
" data = {}\n",
" data['JobTitle'] = soup.select_one('div.fheAoc.nsol9b.RgAZAc').text\n",
"\n",
" header = soup.select('div.nsol9b.k8RiQ')\n",
" try: \n",
" data['Company'] = header[0].text\n",
" data['City'] = header[1].text\n",
" except IndexError:\n",
" data['Company'] = None\n",
" data['City'] = None\n",
" \n",
" tags = soup.select('div.Ug1maf.BbiuWb') \n",
" try:\n",
" data['PostDate'] = tags[0].text\n",
" data['JobType'] = tags[1].text\n",
" except IndexError:\n",
" data['PostDate'] = None\n",
" data['JobType'] = None\n",
"\n",
" try:\n",
" data['JobDescription'] = soup.select_one('div.auGhcd').text\n",
" except AttributeError:\n",
" data['JobDescription'] = None\n",
" \n",
" return data\n",
"\n",
"\n",
"def scrape_google_for_jobs(url=None, max_postings=30):\n",
" url = prepare_url(url)\n",
" browser = prepare_browser()\n",
" \n",
" # Open first URL\n",
" browser.get(url)\n",
"\n",
" # Wait or sleep until all page data loaded\n",
" WebDriverWait(browser, 10).until(\n",
" EC.presence_of_all_elements_located((By.CLASS_NAME, 'PaEvOc'))\n",
" )\n",
"\n",
" # Get initial page body\n",
" body = browser.page_source\n",
"\n",
" # Parse page html to extract jobs info\n",
" soup = BeautifulSoup(body, 'html.parser')\n",
" postings = browser.find_elements_by_css_selector('div.PaEvOc')\n",
"\n",
" total = len(postings)\n",
" x = 0\n",
" postings_data = []\n",
" while (x < total) and (x < max_postings):\n",
" # Click on the next posting and expand the description\n",
" postings[x].click()\n",
" time.sleep(0.5)\n",
" read_button = browser.find_element_by_css_selector('div.CdXzFe.j4kHIf')\n",
" if read_button.is_displayed():\n",
" read_button.click()\n",
"# else:\n",
"# browser.execute_script(\"div#t1_ditsc.scrollTo(0, 1)\") \n",
"\n",
" # Select the large postings frame and parse data\n",
" html = (browser.find_element_by_id('tl_ditc')\n",
" .get_attribute('innerHTML') )\n",
" job_data = parse_job(html)\n",
" job_data['url'] = browser.current_url\n",
"\n",
" # Save data to dictionary and iterate variables\n",
" postings_data.append(job_data)\n",
" x += 1\n",
"\n",
" # Update postings list if new ones are loaded\n",
" new_postings = browser.find_elements_by_css_selector('div.PaEvOc')\n",
" if len(new_postings) > total:\n",
" postings = new_postings\n",
" total = len(new_postings)\n",
" \n",
" # Close out the Selenium browser\n",
" browser.quit()\n",
" \n",
" return postings_data"
]
},
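{
"cell_type": "markdown",
"metadata": {},
"source": [
"One caveat with step 2 of the approach: because the left-hand list re-renders as new cards load, a card located earlier can go stale before it is clicked, and Selenium will raise `StaleElementReferenceException`. The helper below is a hypothetical sketch (not used by `scrape_google_for_jobs` as written) of one way to guard against that: re-locate the cards on each attempt and retry the click a few times. It could be swapped in for the bare `postings[x].click()` call inside the loop above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"from selenium.common.exceptions import StaleElementReferenceException\n",
"\n",
"\n",
"def click_posting_with_retry(browser, index, selector='div.PaEvOc', attempts=3):\n",
"    # Hypothetical helper: re-locate the job cards on every attempt so a\n",
"    # re-rendered list does not leave us holding a stale element reference.\n",
"    for _ in range(attempts):\n",
"        try:\n",
"            cards = browser.find_elements_by_css_selector(selector)\n",
"            if index >= len(cards):\n",
"                return False\n",
"            cards[index].click()\n",
"            return True\n",
"        except StaleElementReferenceException:\n",
"            time.sleep(0.5)\n",
"    return False"
]
},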
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Run the scraper for an \"all jobs\" URL\n",
"u = 'https://www.google.com/search?q=data+science+jobs+nashville&ibp=htl;jobs&t=sq&li=20&st=0&jlo=all&t=sq&li=20&st=0&jlo=all#fpstate=tldetail&htichips=city:PZDrEzLsZIig2umh0Lk_fQ%3D%3D,job_family_1:data%20scientist,job_family_1:data%20analyst,job_family_1:data%20engineer&htidocid=f0G4_USBCrEe5x4VAAAAAA%3D%3D&htilrad=96.5604&htischips=city;PZDrEzLsZIig2umh0Lk_fQ%3D%3D:Nashville_comma_%20TN,job_family_1;data%20scientist;data%20analyst;data%20engineer&htivrt=jobs&st=0&t=sq'\n",
"data = scrape_google_for_jobs(u, max_postings=500)\n",
"\n",
"# Dump to csv\n",
"import pandas as pd\n",
"df = pd.DataFrame(data)\n",
"df.to_csv('./jobs_05-30-2018.csv', index=None)"
]
}
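,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the overview calls for a nightly scrape, a natural follow-up is to stamp each run's output with the current date instead of hard-coding the filename, and to combine the daily files later. The cell below is a small sketch along those lines; the filename pattern (`jobs_MM-DD-YYYY.csv`) mirrors the one used above, and everything else is plain `pandas` plus the standard library."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"from datetime import date\n",
"\n",
"import pandas as pd\n",
"\n",
"# Write tonight's scrape to a date-stamped file, e.g. jobs_05-30-2018.csv\n",
"outfile = './jobs_{}.csv'.format(date.today().strftime('%m-%d-%Y'))\n",
"pd.DataFrame(data).to_csv(outfile, index=False)\n",
"\n",
"# Later: combine all nightly files into one de-duplicated table\n",
"daily_files = sorted(glob.glob('./jobs_*.csv'))\n",
"combined = pd.concat((pd.read_csv(f) for f in daily_files), ignore_index=True)\n",
"combined = combined.drop_duplicates(subset=['JobTitle', 'Company', 'City'])\n",
"combined.to_csv('./jobs_combined.csv', index=False)"
]
}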
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:mcds]",
"language": "python",
"name": "conda-env-mcds-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}