Last active
December 8, 2020 19:31
-
-
Save yifeihuang/b1dee2c6c76ef10fd5cbbafd4da23521 to your computer and use it in GitHub Desktop.
Example code for calling SERP API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import json\n", | |
"import re\n", | |
"\n", | |
"import concurrent.futures\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import pendulum\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parallel SERP API calls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# string cleanup helper
def remove_special_char(s):
    """Strip every character except letters, digits, '/', '.', space, and apostrophe.

    Returns None for missing values (None/NaN), mirroring pandas null semantics.
    """
    if pd.isnull(s):
        return None
    return re.sub(r"[^a-zA-Z0-9/. \']+", '', s)
"\n", | |
# number of worker threads to run in parallel
THREADS = 16

# function to run a parallel API call over a pandas dataframe
def parallel_apply(df, func, n_core=THREADS):
    """Apply *func* to roughly equal chunks of *df* concurrently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Receives one DataFrame chunk, returns a DataFrame.
    n_core : int, optional
        Number of chunks and worker threads (default: THREADS).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results, in chunk order.
    """
    chunks = np.array_split(df, n_core)
    # The context manager joins and releases the pool's threads when done;
    # the original created a bare executor that was never shut down (thread
    # leak), and also declared an unused `futs` list.
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_core) as pool:
        return pd.concat(pool.map(func, chunks))
"\n", | |
"\n", | |
# the workhorse function that actually does the api call
# edit this to fit the specific query
def serp(row, timeout=60):
    """Query Scale SERP for LinkedIn results matching row['name'].

    Parameters
    ----------
    row : pandas.Series
        Must contain a 'name' field used as the search term.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 60).  The original
        call had no timeout, so a stalled connection could block a worker
        thread forever.

    Returns
    -------
    str or None
        The JSON response re-serialized to a string on HTTP 200, else None.
    """
    # set up the request parameters
    query_str = row['name']
    print('executing {}'.format(query_str))
    params = {
        'api_key': 'your key',  # TODO: replace with a real Scale SERP API key
        'q': 'site:linkedin.com {}'.format(query_str),
        'gl': 'us',
        'hl': 'en',
        'location': 'United States',
        'google_domain': 'google.com'
    }

    # make the http GET request to Scale SERP
    api_result = requests.get('https://api.scaleserp.com/search', params, timeout=timeout)

    # # print the JSON response from Scale SERP
    # print(json.dumps(api_result.json(), indent=4))
    if api_result.status_code == 200:
        return json.dumps(api_result.json())
    else:
        return None
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# read in df and process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# data = pd.read_excel('Github Users.xlsx')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# adapter that makes the per-row workhorse compatible with parallel apply,
# which hands each worker a DataFrame chunk rather than a single row
def apply_serp(df):
    """Run serp() on every row of *df*, storing the raw JSON in 'r_results'."""
    responses = df.apply(serp, axis=1)
    df['r_results'] = responses
    return df

# final_target_response = parallel_apply(final_target, apply_serp, THREADS)
# final_target_response.to_excel('git_linkedin_search_raw.xlsx')
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parsing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def is_linkedin_slug(s):
    """Return True when *s* looks like a bare LinkedIn profile slug.

    A slug starts with a lowercase letter, digit, or hyphen, followed by
    2-80 characters drawn from lowercase letters, digits, '&', '-', '.'.
    None is never a slug.
    """
    if s is None:
        return False
    return re.match(r"""^[\-a-z0-9][a-z0-9\&\-\.]{2,80}$""", s) is not None
"\n", | |
"\n", | |
def extract_linkedin_person_slug(url):
    """Pull the person-profile slug out of a LinkedIn URL.

    Accepts an already-bare slug (returned unchanged) or a linkedin.com
    '/in/' or '/pub/' URL, optionally with http(s) scheme and a 2-3 letter
    subdomain.  Directory pages ('pub/dir') and anything else yield None.
    """
    if url is None:
        return None

    # already a bare slug — nothing to extract
    if is_linkedin_slug(url):
        return url

    match = re.match(
        r"""^((http(s)?:\/\/))?([a-z]{2,3}\.)?linkedin\.com\/(in|pub)\/(?P<slug>[a-z0-9][\w0-9\&\-\.]{2,80})([\/\?].*?)?$""",
        url,
    )
    if match is None or 'pub/dir' in url:
        return None
    return match.group("slug")
"\n", | |
def levenshtein(s1, s2):
    """Edit distance between *s1* and *s2* (unit-cost insert/delete/substitute)."""
    if s1 == s2:
        return 0

    rows, cols = len(s1), len(s2)

    # dist[i][j] = cost of turning the first i chars of s1 into the first j of s2
    dist = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        dist[i][0] = i  # delete all i characters to reach ""
    for j in range(1, cols + 1):
        dist[0][j] = j  # insert all j characters starting from ""

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            # no substitution cost when the characters at these positions agree
            same = s1[i - 1] == s2[j - 1]
            dist[i][j] = min(
                dist[i - 1][j] + 1,                       # deletion
                dist[i][j - 1] + 1,                       # insertion
                dist[i - 1][j - 1] + (0 if same else 1),  # substitution
            )
    return dist[rows][cols]
"\n", | |
def normalized_levenshtein(s1, s2, lev=None):
    """Levenshtein distance scaled into [0, 1] by the longer string's length.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare.
    lev : int, optional
        Precomputed levenshtein(s1, s2); pass it to skip recomputation.

    Returns
    -------
    float
        distance / max(len(s1), len(s2)); 0.0 when both strings are empty
        (the original raised ZeroDivisionError in that case).
    """
    max_len = float(max(len(s1), len(s2)))
    if max_len == 0:
        return 0.0  # two empty strings are identical
    # 'is not None' so an explicitly supplied distance of 0 is honored;
    # the original's truthiness test (`if lev:`) silently recomputed it.
    if lev is not None:
        return lev / max_len
    return levenshtein(s1, s2) / max_len
"\n", | |
def parse_serp(df):
    """Parse raw SERP JSON in df['r_results'] into structured columns, in place.

    For each row, picks the organic result whose title (text before ' - ')
    is closest to row['name'] by length-normalized Levenshtein distance,
    skipping results whose link is not a LinkedIn person profile.  Writes
    r_name, name_distance, r_title, r_link, slug, r_snippet, and rich-<j>
    columns onto df and returns the same frame.

    NOTE(review): only ValueError is caught, so a result dict missing the
    'title' or 'link' key during the scan would raise KeyError — presumably
    the API always supplies both; confirm against the SERP response schema.
    """
    for i, row in df.iterrows():
        if pd.notnull(row['r_results']):
            try:
                parsed_response = json.loads(row['r_results'])
                if 'organic_results' in parsed_response:
                    # best (lowest) normalized distance seen so far, and its
                    # index; 1 acts as the initial "worst" score, and index 0
                    # is the fallback if nothing scores below it
                    min_dist = 1
                    min_dist_ind = 0
                    results = parsed_response['organic_results']
                    for k in range(len(results)):
                        # titles look like "Name - Headline - ..."; take the name part
                        name = results[k]['title'].split(' - ')[0].strip()
                        dist = levenshtein(row['name'], name) / len(row['name'])
                        if extract_linkedin_person_slug(results[k]['link']) is None:
                            # not a person-profile link — skip this result
                            pass
                        elif dist == 0:
                            # exact name match: take it and stop scanning
                            min_dist_ind = k
                            break
                        elif dist < min_dist:
                            min_dist = dist
                            min_dist_ind = k

                    # best-matching result (or results[0] if none qualified)
                    first_result = results[min_dist_ind]
                    if 'title' in first_result:
                        name = first_result['title'].split(' - ')[0].strip()
                        df.loc[i, 'r_name'] = name
                        df.loc[i, 'name_distance'] = levenshtein(row['name'], name) / len(row['name'])
                        df.loc[i, 'r_title'] = first_result['title']
                    if 'link' in first_result:
                        df.loc[i, 'r_link'] = first_result['link']
                        df.loc[i, 'slug'] = extract_linkedin_person_slug(first_result['link'])
                    if 'snippet' in first_result:
                        df.loc[i, 'r_snippet'] = first_result['snippet']
                    if 'rich_snippet' in first_result:
                        # flatten rich-snippet extensions into rich-0, rich-1, ...
                        for j in range(len(first_result['rich_snippet']['top']['extensions'])):
                            df.loc[i, 'rich-{}'.format(j)] = first_result['rich_snippet']['top']['extensions'][j]
            except ValueError as e:
                # json.loads raises ValueError (JSONDecodeError) on malformed payloads
                print('parsing {} failed with {}'.format(row['r_results'], e))
    return df
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# final_parsed_results = parse_serp(final_target_response)\n", | |
"# final_parsed_results.to_excel('github_linkedin_parsed.xlsx')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment