Last active
February 21, 2020 07:48
-
-
Save stefandeml/689bd5d3cee414955a269a7e3928fdfb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 381, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from selenium import webdriver\n", | |
"from time import sleep\n", | |
"import pandas as pd\n", | |
"import re\n", | |
"import sys" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 468, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Download the ChromeDriver build that matches your installed Chrome version\n", | |
"# and point the path below at that binary.\n", | |
"driver = webdriver.Chrome('/Users/yubi/dev/dq/scraper/chromedriver')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 384, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Open LinkedIn\n", | |
"driver.get('https://www.linkedin.com')" | |
] | |
}, | |
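{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Log in (this can also be done by hand in the browser window).\n", | |
"# Replace the placeholder e-mail address and password below with real credentials.\n", | |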
"\n", | |
"username = driver.find_element_by_class_name(\"sign-in-form__inputs\")\n", | |
"\n", | |
"input_field = username.find_element_by_name(\"session_key\")\n", | |
"\n", | |
"input_field.send_keys(\"email@mail.com\")\n", | |
"\n", | |
"password_field = username.find_element_by_name(\"session_password\")\n", | |
"\n", | |
"password_field.send_keys(\"****\")\n", | |
"\n", | |
"button = driver.find_element_by_class_name(\"sign-in-form__submit-btn\")\n", | |
"\n", | |
"button.click()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 373, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parse_page():\n", | |
" core = driver.find_element_by_class_name(\"core-rail\")\n", | |
" records = {}\n", | |
" records[\"topcard\"] = core.find_element_by_class_name(\"pv-top-card--list\").text\n", | |
" records[\"highlights\"] = core.find_element_by_class_name(\"pv-highlights-section\").text\n", | |
" records[\"about\"] = core.find_element_by_class_name(\"pv-about-section\").text\n", | |
" return records\n", | |
"\n", | |
"def parse_topcard(topcard):\n", | |
" lines = topcard.split(\"\\n\")\n", | |
" return {\"name\": lines[0], \\\n", | |
" \"degree\": lines[2]}\n", | |
"\n", | |
"def get_single_re_group(match):\n", | |
" if match == None:\n", | |
" return None\n", | |
" else:\n", | |
" len(match.groups()) == 1 or print(\"Should be a single match\" + match.group())\n", | |
" return match.group(1)\n", | |
"\n", | |
"def parse_highlights(highlights):\n", | |
" number_mutal = re.search(\".*\\\\n([0-9]*) mutual\", highlights)\n", | |
" names_mutal = re.search(\"both know (.*)\", highlights)\n", | |
" \n", | |
" return {\"number_mutal\": get_single_re_group(number_mutal) , \\\n", | |
" \"names_mutal\": get_single_re_group(names_mutal)}\n", | |
"\n", | |
"def parse_about(about):\n", | |
" about = re.search(\"About\\\\n(.*)\", about)\n", | |
" \n", | |
" return {\"about\": get_single_re_group(about)}\n", | |
"\n", | |
"def run(email):\n", | |
" driver.get(email)\n", | |
" sleep(2.8)\n", | |
" records = parse_page()\n", | |
"\n", | |
" parsed_records = {\"email\": email,\n", | |
" **parse_topcard(records[\"topcard\"]),\n", | |
" **parse_about(records[\"about\"]),\n", | |
" **parse_highlights(records[\"highlights\"])}\n", | |
" return parsed_records\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 466, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Read the outreach Excel sheet and index it by its `LinkedIn` column\n", | |
"outreach = pd.read_excel('outreach.xlsx') \n", | |
"outreach.set_index(\"LinkedIn\", inplace=True)" | |
] | |
}, | |
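{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Select the `LinkedIn` values to visit (named `emails` here, but each one is passed to driver.get) and scrape them one by one\n", | |
"# NOTE(review): the cell above moves `LinkedIn` into the index, so `outreach.LinkedIn` may fail here; `outreach.index` should hold the same values - confirm\n", | |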
"emails = list(outreach.LinkedIn.values[207:270])\n", | |
"\n", | |
"store = [] # comment out when appending\n", | |
"for email in emails:\n", | |
" print(email)\n", | |
" try:\n", | |
" records = run(email)\n", | |
" store.append(records)\n", | |
" except:\n", | |
" print(\"Error:\", sys.exc_info()[0])" | |
] | |
}, | |
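{ | |
"cell_type": "code", | |
"execution_count": 400, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Postprocess: build a DataFrame from the scraped records, drop duplicates and left-merge onto the outreach sheet\n", | |
"# NOTE(review): the next line rebinds `pd` to a DataFrame (shadowing the pandas module), and `new_store` is not defined in any visible cell - confirm the intended names\n", | |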
"pd = pd.DataFrame(store)\n", | |
"pd_dedup = pd.drop_duplicates()\n", | |
"pd_dedup = pd_dedup[~new_store.email.duplicated()]\n", | |
"\n", | |
"merged = outreach.merge(pd_dedup, left_index=True, right_on=\"email\", how=\"left\")\n", | |
"merged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 465, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"merged.to_excel(\"merged.xlsx\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.5rc1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment