Skip to content

Instantly share code, notes, and snippets.

@stefandeml
Last active February 21, 2020 07:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stefandeml/689bd5d3cee414955a269a7e3928fdfb to your computer and use it in GitHub Desktop.
Save stefandeml/689bd5d3cee414955a269a7e3928fdfb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 381,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from time import sleep\n",
"import pandas as pd\n",
"import re\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 468,
"metadata": {},
"outputs": [],
"source": [
"# Start a Chrome session through a local chromedriver binary.\n",
"# Download the chromedriver build that matches your browser version;\n",
"# the path below points at the original author's machine.\n",
"chromedriver_path = '/Users/yubi/dev/dq/scraper/chromedriver'\n",
"driver = webdriver.Chrome(chromedriver_path)"
]
},
{
"cell_type": "code",
"execution_count": 384,
"metadata": {},
"outputs": [],
"source": [
"# Navigate the browser session to the LinkedIn landing page\n",
"linkedin_home = 'https://www.linkedin.com'\n",
"driver.get(linkedin_home)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Login (can also be done by hand in the browser window)\n",
"\n",
"# Both credential inputs are looked up inside the sign-in form container.\n",
"login_form = driver.find_element_by_class_name(\"sign-in-form__inputs\")\n",
"\n",
"# The values typed here look like placeholders; substitute real credentials.\n",
"login_form.find_element_by_name(\"session_key\").send_keys(\"email@mail.com\")\n",
"login_form.find_element_by_name(\"session_password\").send_keys(\"****\")\n",
"\n",
"# Submit the form.\n",
"driver.find_element_by_class_name(\"sign-in-form__submit-btn\").click()"
]
},
{
"cell_type": "code",
"execution_count": 373,
"metadata": {},
"outputs": [],
"source": [
"def parse_page():\n",
"    \"\"\"Collect the raw text of three sections of the open profile page.\n",
"\n",
"    Reads the module-level ``driver``. Returns a dict with the keys\n",
"    \"topcard\", \"highlights\" and \"about\", each holding the ``.text`` of the\n",
"    matching element inside the \"core-rail\" container.\n",
"\n",
"    Every lookup is a single-element ``find_element_*`` call, so a profile\n",
"    that lacks one of the sections is expected to raise (Selenium's\n",
"    NoSuchElementException) instead of producing an empty value.\n",
"    \"\"\"\n",
"    core = driver.find_element_by_class_name(\"core-rail\")\n",
"    section_classes = {\n",
"        \"topcard\": \"pv-top-card--list\",\n",
"        \"highlights\": \"pv-highlights-section\",\n",
"        \"about\": \"pv-about-section\",\n",
"    }\n",
"    return {key: core.find_element_by_class_name(css_class).text\n",
"            for key, css_class in section_classes.items()}\n",
"\n",
"\n",
"def parse_topcard(topcard):\n",
"    \"\"\"Split the top-card text into its \"name\" and \"degree\" fields.\n",
"\n",
"    The first line is returned as \"name\" and the third line as \"degree\".\n",
"    A top card with fewer than three lines yields ``None`` for \"degree\"\n",
"    instead of raising IndexError, matching how the other parsers report\n",
"    missing data.\n",
"    \"\"\"\n",
"    lines = topcard.split(\"\\n\")\n",
"    return {\"name\": lines[0],\n",
"            \"degree\": lines[2] if len(lines) > 2 else None}\n",
"\n",
"\n",
"def get_single_re_group(match):\n",
"    \"\"\"Return capture group 1 of ``match``, or ``None`` if nothing matched.\n",
"\n",
"    ``match`` is the result of ``re.search``. The pattern is expected to\n",
"    define exactly one capture group; a warning is printed when it does not.\n",
"    A pattern without any group still raises IndexError on ``group(1)``.\n",
"    \"\"\"\n",
"    if match is None:\n",
"        return None\n",
"    if len(match.groups()) != 1:\n",
"        print(\"Should be a single match: \" + match.group())\n",
"    return match.group(1)\n",
"\n",
"\n",
"def parse_highlights(highlights):\n",
"    \"\"\"Extract the mutual-connection count and names from the highlights text.\n",
"\n",
"    Both values are returned as strings, or ``None`` when the corresponding\n",
"    pattern is not found. The dict keys keep their original spelling because\n",
"    they end up as column names in the exported table.\n",
"    \"\"\"\n",
"    mutual_count = re.search(\".*\\n([0-9]*) mutual\", highlights)\n",
"    mutual_names = re.search(\"both know (.*)\", highlights)\n",
"\n",
"    return {\"number_mutal\": get_single_re_group(mutual_count),\n",
"            \"names_mutal\": get_single_re_group(mutual_names)}\n",
"\n",
"\n",
"def parse_about(about):\n",
"    \"\"\"Extract the line that follows the \"About\" heading.\n",
"\n",
"    Only the first line after the heading is captured, because ``.`` does\n",
"    not match newlines. Returns ``{\"about\": None}`` when there is no heading.\n",
"    \"\"\"\n",
"    about_match = re.search(\"About\\n(.*)\", about)\n",
"\n",
"    return {\"about\": get_single_re_group(about_match)}\n",
"\n",
"\n",
"def run(email):\n",
"    \"\"\"Open one profile and return its parsed fields as a flat dict.\n",
"\n",
"    ``email`` is handed straight to ``driver.get``, so despite its name it\n",
"    must be a URL the browser can load. It is stored under the \"email\" key\n",
"    so the result can be joined back to the outreach sheet.\n",
"    \"\"\"\n",
"    driver.get(email)\n",
"    sleep(2.8)  # fixed pause, presumably to let the page finish rendering\n",
"    records = parse_page()\n",
"\n",
"    parsed_records = {\"email\": email}\n",
"    parsed_records.update(parse_topcard(records[\"topcard\"]))\n",
"    parsed_records.update(parse_about(records[\"about\"]))\n",
"    parsed_records.update(parse_highlights(records[\"highlights\"]))\n",
"    return parsed_records"
]
},
{
"cell_type": "code",
"execution_count": 466,
"metadata": {},
"outputs": [],
"source": [
"# Load the outreach workbook and key its rows by the \"LinkedIn\" column\n",
"outreach = pd.read_excel('outreach.xlsx').set_index(\"LinkedIn\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the profile URLs to scrape and run the scraper on each of them.\n",
"# The \"LinkedIn\" column was moved into the index by set_index when the\n",
"# sheet was loaded, so the URLs are read from the index; outreach.LinkedIn\n",
"# no longer exists at this point.\n",
"emails = list(outreach.index[207:270])\n",
"\n",
"store = []  # comment out when appending\n",
"for email in emails:\n",
"    print(email)\n",
"    try:\n",
"        store.append(run(email))\n",
"    except Exception as exc:\n",
"        # Skip a profile that fails and keep going. Catching Exception\n",
"        # rather than everything keeps the loop interruptible (Ctrl-C).\n",
"        print(\"Error:\", type(exc).__name__, exc)"
]
},
{
"cell_type": "code",
"execution_count": 400,
"metadata": {},
"outputs": [],
"source": [
"# Postprocess\n",
"# Build the frame under its own name: rebinding `pd` would shadow the\n",
"# pandas module and break every later pd.* call in the session.\n",
"scraped = pd.DataFrame(store)\n",
"\n",
"# Keep the first record per profile URL. This also removes fully identical\n",
"# rows, since those necessarily share the same \"email\" value.\n",
"pd_dedup = scraped.drop_duplicates(subset=\"email\")\n",
"\n",
"# Left join on the outreach index so every outreach row is kept,\n",
"# whether or not its profile was scraped.\n",
"merged = outreach.merge(pd_dedup, left_index=True, right_on=\"email\", how=\"left\")\n",
"merged"
]
},
{
"cell_type": "code",
"execution_count": 465,
"metadata": {},
"outputs": [],
"source": [
"# Write the merged table to an Excel workbook in the working directory\n",
"output_path = \"merged.xlsx\"\n",
"merged.to_excel(output_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5rc1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment