@yifeihuang
Last active December 8, 2020 19:31

Example code for calling SERP API
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import re\n",
"\n",
"import concurrent.futures\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pendulum\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parallel SERP API calls"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# may be used to clean up strings\n",
"def remove_special_char(s):\n",
" if pd.isnull(s):\n",
" return None\n",
" else:\n",
" return re.sub(r\"[^a-zA-Z0-9/. \\']+\", '', s)\n",
" \n",
"\n",
"# number of threads to run in parallel\n",
"THREADS = 16\n",
"\n",
"# function to run parallel API call on pandas dataframe\n",
"def parallel_apply(df, func, n_core = THREADS):\n",
" futs = []\n",
" df_split = np.array_split(df, n_core)\n",
" pool = concurrent.futures.ThreadPoolExecutor(max_workers = n_core)\n",
" return pd.concat(pool.map(func, df_split))\n",
"\n",
"\n",
"# the workhorse function that actually does the api call\n",
"# edit this to fit the specific query\n",
"def serp(row):\n",
" # set up the request parameters\n",
" query_str = row['name']\n",
" print('executing {}'.format(query_str))\n",
" params = {\n",
" 'api_key': 'your key',\n",
" 'q': 'site:linkedin.com {}'.format(query_str),\n",
" 'gl': 'us',\n",
" 'hl': 'en',\n",
" 'location': 'United States',\n",
" 'google_domain': 'google.com'\n",
" }\n",
" \n",
" \n",
" # make the http GET request to Scale SERP\n",
" api_result = requests.get('https://api.scaleserp.com/search', params)\n",
"\n",
" # # print the JSON response from Scale SERP\n",
" # print(json.dumps(api_result.json(), indent=4))\n",
" if api_result.status_code == 200:\n",
" return json.dumps(api_result.json())\n",
" else:\n",
" return None"
]
},
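{
"cell_type": "markdown",
"metadata": {},
"source": [
"With 16 threads hitting the API at once, transient network errors and rate limits become more likely. Below is a minimal sketch of the same call with a timeout and simple retries; `serp_with_retry`, `MAX_RETRIES`, and the backoff schedule are illustrative assumptions, not part of the original pipeline or the Scale SERP API."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"MAX_RETRIES = 3  # assumption for illustration\n",
"\n",
"# sketch: serp() with a request timeout and exponential backoff on failure\n",
"def serp_with_retry(row):\n",
"    params = {\n",
"        'api_key': 'your key',\n",
"        'q': 'site:linkedin.com {}'.format(row['name']),\n",
"        'gl': 'us',\n",
"        'hl': 'en',\n",
"        'location': 'United States',\n",
"        'google_domain': 'google.com'\n",
"    }\n",
"    for attempt in range(MAX_RETRIES):\n",
"        try:\n",
"            api_result = requests.get('https://api.scaleserp.com/search', params, timeout=30)\n",
"            if api_result.status_code == 200:\n",
"                return json.dumps(api_result.json())\n",
"        except requests.RequestException:\n",
"            pass  # fall through to retry\n",
"        time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s\n",
"    return None"
]
},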
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# read in df and process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# data = pd.read_excel('Github Users.xlsx')"
]
},
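{
"cell_type": "markdown",
"metadata": {},
"source": [
"The only column the rest of the notebook assumes is `name`: `serp` builds the query from it and `parse_serp` uses it for the Levenshtein check. A toy frame like the one below (hypothetical names, purely for illustration) can stand in for the spreadsheet when testing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical stand-in for 'Github Users.xlsx'; only a 'name' column is required\n",
"data = pd.DataFrame({'name': ['Ada Lovelace', 'Grace Hopper']})"
]
},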
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make the workhorse function compatible with paralle apply\n",
"def apply_serp(df):\n",
" df['r_results'] = df.apply(serp, axis=1)\n",
" return df\n",
"\n",
"# final_target_response = parallel_apply(final_target, apply_serp, THREADS)\n",
"# final_target_response.to_excel('git_linkedin_search_raw.xlsx')"
]
},
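{
"cell_type": "markdown",
"metadata": {},
"source": [
"To sanity-check the fan-out without spending API credits, the same machinery can be exercised with a stub in place of `serp`. This is a hedged sketch: `fake_serp` and `apply_fake_serp` are made-up names for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# stub that returns an empty SERP payload instead of calling the API\n",
"def fake_serp(row):\n",
"    return json.dumps({'organic_results': []})\n",
"\n",
"def apply_fake_serp(df):\n",
"    df['r_results'] = df.apply(fake_serp, axis=1)\n",
"    return df\n",
"\n",
"demo = pd.DataFrame({'name': ['Ada Lovelace', 'Grace Hopper']})\n",
"parallel_apply(demo, apply_fake_serp, n_core=2)"
]
},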
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parsing"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def is_linkedin_slug(s):\n",
" if s is None:\n",
" return False\n",
" m = re.match(r\"\"\"^[\\-a-z0-9][a-z0-9\\&\\-\\.]{2,80}$\"\"\", s)\n",
" if m:\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"\n",
"def extract_linkedin_person_slug(url):\n",
" if url is None:\n",
" return None\n",
"\n",
"# url = unescape(url)\n",
" if is_linkedin_slug(url): return url\n",
"\n",
" m = re.match(r\"\"\"^((http(s)?:\\/\\/))?([a-z]{2,3}\\.)?linkedin\\.com\\/(in|pub)\\/(?P<slug>[a-z0-9][\\w0-9\\&\\-\\.]{2,80})([\\/\\?].*?)?$\"\"\", url)\n",
" if m is not None and 'pub/dir' not in url:\n",
" return m.group(\"slug\")\n",
" else:\n",
" return None\n",
"\n",
"def levenshtein(s1, s2):\n",
" if s1 == s2:\n",
" return 0\n",
"\n",
" l1 = len(s1)\n",
" l2 = len(s2)\n",
"\n",
" distances = []\n",
" for i in range(0, l1+1):\n",
" distances.append([0 for j in range(0, l2+1)])\n",
" for i in range(1, l1+1): # It will cost i to transform the first i characters of s1 to \"\"\n",
" distances[i][0] = i\n",
" for j in range(1, l2+1): # It will cost j to transform the first j characters of s2 to \"\"\n",
" distances[0][j] = j\n",
"\n",
" for i in range(1, l1+1):\n",
" for j in range(1, l2+1):\n",
" substitution_cost = 0 if s1[i-1] == s2[j-1] else 1 # If the letter at position i (index i-1) in s1 is the same as the letter at position j (index j-1), then there is no substitution cost\n",
" distances[i][j] = min([distances[i-1][j]+1, distances[i][j-1]+1, distances[i-1][j-1]+substitution_cost]) # insertion, deletion, substituion\n",
" return distances[l1][l2]\n",
"\n",
"def normalized_levenshtein(s1, s2, lev=None):\n",
" max_len = max([len(s1), len(s2)])*1.\n",
" if lev:\n",
" return lev/max_len\n",
" return levenshtein(s1,s2)/max_len\n",
"\n",
"def parse_serp(df):\n",
" for i, row in df.iterrows():\n",
" if pd.notnull(row['r_results']):\n",
" try:\n",
" parsed_response = json.loads(row['r_results'])\n",
" if 'organic_results' in parsed_response:\n",
" min_dist = 1\n",
" min_dist_ind = 0\n",
" results = parsed_response['organic_results']\n",
" for k in range(len(results)):\n",
" name = results[k]['title'].split(' - ')[0].strip()\n",
" dist = levenshtein(row['name'], name) / len(row['name'])\n",
" if extract_linkedin_person_slug(results[k]['link']) is None:\n",
" pass\n",
" elif dist == 0:\n",
" min_dist_ind = k\n",
" break\n",
" elif dist < min_dist:\n",
" min_dist = dist\n",
" min_dist_ind = k\n",
"\n",
" first_result = results[min_dist_ind]\n",
" if 'title' in first_result:\n",
" name = first_result['title'].split(' - ')[0].strip()\n",
" df.loc[i, 'r_name'] = name\n",
" df.loc[i, 'name_distance'] = levenshtein(row['name'], name) / len(row['name'])\n",
" df.loc[i, 'r_title'] = first_result['title']\n",
" if 'link' in first_result:\n",
" df.loc[i, 'r_link'] = first_result['link']\n",
" df.loc[i, 'slug'] = extract_linkedin_person_slug(first_result['link'])\n",
" if 'snippet' in first_result:\n",
" df.loc[i, 'r_snippet'] = first_result['snippet']\n",
" if 'rich_snippet' in first_result:\n",
" for j in range(len(first_result['rich_snippet']['top']['extensions'])):\n",
" df.loc[i, 'rich-{}'.format(j)] = first_result['rich_snippet']['top']['extensions'][j]\n",
" except ValueError as e:\n",
" print('parsing {} failed with {}'.format(row['r_results'], e))\n",
" return df\n"
]
},
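{
"cell_type": "markdown",
"metadata": {},
"source": [
"A few spot checks of the helpers above; the expected values follow directly from the regex and the edit-distance definition in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# profile URL -> slug; non-LinkedIn URL -> None\n",
"print(extract_linkedin_person_slug('https://www.linkedin.com/in/jane-doe-123'))  # jane-doe-123\n",
"print(extract_linkedin_person_slug('https://github.com/jane'))  # None\n",
"\n",
"# classic example: kitten -> sitting needs 3 edits\n",
"print(levenshtein('kitten', 'sitting'))  # 3\n",
"print(normalized_levenshtein('kitten', 'sitting'))  # 3/7 ~ 0.43"
]
},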
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# final_parsed_results = parse_serp(final_target_response)\n",
"# final_parsed_results.to_excel('github_linkedin_parsed.xlsx')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}