Last active
December 8, 2020 19:31
-
-
Save yifeihuang/b1dee2c6c76ef10fd5cbbafd4da23521 to your computer and use it in GitHub Desktop.
Example code for calling SERP API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import json\n", | |
"import re\n", | |
"\n", | |
"import concurrent.futures\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import pendulum\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parallel SERP API calls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# string cleanup helper
def remove_special_char(s):
    """Strip every character except letters, digits, '/', '.', space, and apostrophe.

    Returns None for missing values (None/NaN), mirroring pandas null semantics.
    """
    if pd.isnull(s):
        return None
    return re.sub(r"[^a-zA-Z0-9/. \']+", '', s)
"\n", | |
# number of worker threads to run in parallel
THREADS = 16

# function to run a parallel API call over a pandas dataframe
def parallel_apply(df, func, n_core=THREADS):
    """Apply *func* to roughly equal chunks of *df* concurrently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Receives one DataFrame chunk, returns a DataFrame.
    n_core : int, optional
        Number of chunks and worker threads (default: THREADS).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results, in chunk order.
    """
    chunks = np.array_split(df, n_core)
    # The context manager joins and releases the pool's threads when done;
    # the original created a bare executor that was never shut down (thread
    # leak), and also declared an unused `futs` list.
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_core) as pool:
        return pd.concat(pool.map(func, chunks))
"\n", | |
"\n", | |
# the workhorse function that actually does the api call
# edit this to fit the specific query
def serp(row, timeout=60):
    """Query Scale SERP for LinkedIn results matching row['name'].

    Parameters
    ----------
    row : pandas.Series
        Must contain a 'name' field used as the search term.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 60).  The original
        call had no timeout, so a stalled connection could block a worker
        thread forever.

    Returns
    -------
    str or None
        The JSON response re-serialized to a string on HTTP 200, else None.
    """
    # set up the request parameters
    query_str = row['name']
    print('executing {}'.format(query_str))
    params = {
        'api_key': 'your key',  # TODO: replace with a real Scale SERP API key
        'q': 'site:linkedin.com {}'.format(query_str),
        'gl': 'us',
        'hl': 'en',
        'location': 'United States',
        'google_domain': 'google.com'
    }

    # make the http GET request to Scale SERP
    api_result = requests.get('https://api.scaleserp.com/search', params, timeout=timeout)

    # # print the JSON response from Scale SERP
    # print(json.dumps(api_result.json(), indent=4))
    if api_result.status_code == 200:
        return json.dumps(api_result.json())
    else:
        return None
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# read in df and process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# data = pd.read_excel('Github Users.xlsx')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# adapter that makes the per-row workhorse compatible with parallel apply,
# which hands each worker a DataFrame chunk rather than a single row
def apply_serp(df):
    """Run serp() on every row of *df*, storing the raw JSON in 'r_results'."""
    responses = df.apply(serp, axis=1)
    df['r_results'] = responses
    return df

# final_target_response = parallel_apply(final_target, apply_serp, THREADS)
# final_target_response.to_excel('git_linkedin_search_raw.xlsx')
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parsing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def is_linkedin_slug(s):
    """Return True when *s* looks like a bare LinkedIn profile slug.

    A slug starts with a lowercase letter, digit, or hyphen, followed by
    2-80 characters drawn from lowercase letters, digits, '&', '-', '.'.
    None is never a slug.
    """
    if s is None:
        return False
    return re.match(r"""^[\-a-z0-9][a-z0-9\&\-\.]{2,80}$""", s) is not None
"\n", | |
"\n", | |
def extract_linkedin_person_slug(url):
    """Pull the person-profile slug out of a LinkedIn URL.

    Accepts an already-bare slug (returned unchanged) or a linkedin.com
    '/in/' or '/pub/' URL, optionally with http(s) scheme and a 2-3 letter
    subdomain.  Directory pages ('pub/dir') and anything else yield None.
    """
    if url is None:
        return None

    # already a bare slug — nothing to extract
    if is_linkedin_slug(url):
        return url

    match = re.match(
        r"""^((http(s)?:\/\/))?([a-z]{2,3}\.)?linkedin\.com\/(in|pub)\/(?P<slug>[a-z0-9][\w0-9\&\-\.]{2,80})([\/\?].*?)?$""",
        url,
    )
    if match is None or 'pub/dir' in url:
        return None
    return match.group("slug")
"\n", | |
def levenshtein(s1, s2):
    """Edit distance between *s1* and *s2* (unit-cost insert/delete/substitute)."""
    if s1 == s2:
        return 0

    rows, cols = len(s1), len(s2)

    # dist[i][j] = cost of turning the first i chars of s1 into the first j of s2
    dist = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        dist[i][0] = i  # delete all i characters to reach ""
    for j in range(1, cols + 1):
        dist[0][j] = j  # insert all j characters starting from ""

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            # no substitution cost when the characters at these positions agree
            same = s1[i - 1] == s2[j - 1]
            dist[i][j] = min(
                dist[i - 1][j] + 1,                       # deletion
                dist[i][j - 1] + 1,                       # insertion
                dist[i - 1][j - 1] + (0 if same else 1),  # substitution
            )
    return dist[rows][cols]
"\n", | |
def normalized_levenshtein(s1, s2, lev=None):
    """Levenshtein distance scaled into [0, 1] by the longer string's length.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare.
    lev : int, optional
        Precomputed levenshtein(s1, s2); pass it to skip recomputation.

    Returns
    -------
    float
        distance / max(len(s1), len(s2)); 0.0 when both strings are empty
        (the original raised ZeroDivisionError in that case).
    """
    max_len = float(max(len(s1), len(s2)))
    if max_len == 0:
        return 0.0  # two empty strings are identical
    # 'is not None' so an explicitly supplied distance of 0 is honored;
    # the original's truthiness test (`if lev:`) silently recomputed it.
    if lev is not None:
        return lev / max_len
    return levenshtein(s1, s2) / max_len
"\n", | |
def parse_serp(df):
    """Parse raw SERP JSON in df['r_results'] into structured columns, in place.

    For each row, picks the organic result whose title (text before ' - ')
    is closest to row['name'] by length-normalized Levenshtein distance,
    skipping results whose link is not a LinkedIn person profile.  Writes
    r_name, name_distance, r_title, r_link, slug, r_snippet, and rich-<j>
    columns onto df and returns the same frame.

    NOTE(review): only ValueError is caught, so a result dict missing the
    'title' or 'link' key during the scan would raise KeyError — presumably
    the API always supplies both; confirm against the SERP response schema.
    """
    for i, row in df.iterrows():
        if pd.notnull(row['r_results']):
            try:
                parsed_response = json.loads(row['r_results'])
                if 'organic_results' in parsed_response:
                    # best (lowest) normalized distance seen so far, and its
                    # index; 1 acts as the initial "worst" score, and index 0
                    # is the fallback if nothing scores below it
                    min_dist = 1
                    min_dist_ind = 0
                    results = parsed_response['organic_results']
                    for k in range(len(results)):
                        # titles look like "Name - Headline - ..."; take the name part
                        name = results[k]['title'].split(' - ')[0].strip()
                        dist = levenshtein(row['name'], name) / len(row['name'])
                        if extract_linkedin_person_slug(results[k]['link']) is None:
                            # not a person-profile link — skip this result
                            pass
                        elif dist == 0:
                            # exact name match: take it and stop scanning
                            min_dist_ind = k
                            break
                        elif dist < min_dist:
                            min_dist = dist
                            min_dist_ind = k

                    # best-matching result (or results[0] if none qualified)
                    first_result = results[min_dist_ind]
                    if 'title' in first_result:
                        name = first_result['title'].split(' - ')[0].strip()
                        df.loc[i, 'r_name'] = name
                        df.loc[i, 'name_distance'] = levenshtein(row['name'], name) / len(row['name'])
                        df.loc[i, 'r_title'] = first_result['title']
                    if 'link' in first_result:
                        df.loc[i, 'r_link'] = first_result['link']
                        df.loc[i, 'slug'] = extract_linkedin_person_slug(first_result['link'])
                    if 'snippet' in first_result:
                        df.loc[i, 'r_snippet'] = first_result['snippet']
                    if 'rich_snippet' in first_result:
                        # flatten rich-snippet extensions into rich-0, rich-1, ...
                        for j in range(len(first_result['rich_snippet']['top']['extensions'])):
                            df.loc[i, 'rich-{}'.format(j)] = first_result['rich_snippet']['top']['extensions'][j]
            except ValueError as e:
                # json.loads raises ValueError (JSONDecodeError) on malformed payloads
                print('parsing {} failed with {}'.format(row['r_results'], e))
    return df
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# final_parsed_results = parse_serp(final_target_response)\n", | |
"# final_parsed_results.to_excel('github_linkedin_parsed.xlsx')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment