Last active
March 26, 2024 02:01
-
-
Save dimitryzub/b67e3655e663b42fec41ec342d0a1482 to your computer and use it in GitHub Desktop.
Web Scraping Google Scholar Organic, Cite Results to CSV with Python | SerpApi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Video tutorial - https://www.youtube.com/watch?v=IXcycQwpFH0
# https://serpapi.com/google-scholar-api
import json
import os
from urllib.parse import parse_qsl, urlsplit

import pandas as pd
from serpapi import GoogleSearch
def scrape_organic_results():
    """Scrape all pages of Google Scholar organic results via SerpApi.

    Reads the SerpApi key from the ``API_KEY`` environment variable, e.g.:
        API_KEY=<your-api-key> python your-script-file.py

    Returns:
        list[dict]: one dict per organic result with publication, citation,
        version, and file metadata (missing fields are ``None``).
    """
    params = {
        "api_key": os.getenv("API_KEY"),  # your serpapi api key
        "engine": "google_scholar",       # search engine
        "q": "minecraft education benefits statistics review x-ray",  # search query
        "hl": "en",                       # language
    }
    search = GoogleSearch(params)

    organic_results_data = []
    while True:
        results = search.get_dict()
        page_number = results.get("serpapi_pagination", {}).get("current")
        print(f"Extracting publications from page #{page_number}.")

        # was: .get("organic_results", {}) — a dict default is wrong for a list
        for result in results.get("organic_results", []):
            inline_links = result.get("inline_links", {})
            cited_by = inline_links.get("cited_by", {})
            versions = inline_links.get("versions", {})
            # "resources" may be absent or empty; fall back to an empty dict
            # (was: three separate bare `except:` blocks).
            resource = (result.get("resources") or [{}])[0]

            position = result.get("position")
            cited_by_count = cited_by.get("total")
            total_versions = versions.get("total")

            organic_results_data.append({
                "page_number": page_number,
                # SerpApi positions start at 0; report 1-based, but don't
                # crash if the field is missing (was: unconditional `+ 1`).
                "position": position + 1 if position is not None else None,
                "result_type": result.get("type"),
                "title": result.get("title"),
                "link": result.get("link"),
                "result_id": result["result_id"],
                "publication_info_summary": result.get("publication_info", {}).get("summary"),
                "snippet": result.get("snippet"),
                # was: bare `except:` around int(); guard explicitly instead
                "cited_by_count": int(cited_by_count) if cited_by_count is not None else None,
                # was: .get(..., {}) — missing scalars leaked "{}" into the CSV
                "cited_by_link": cited_by.get("link"),
                "cited_by_id": cited_by.get("cites_id"),
                "total_versions": int(total_versions) if total_versions is not None else None,
                "all_versions_link": versions.get("link"),
                "all_versions_id": versions.get("cluster_id"),
                "file_format": resource.get("file_format"),
                "file_title": resource.get("title"),
                "file_link": resource.get("link"),
            })

        # Follow SerpApi's "next" pagination URL until it disappears.
        if "next" in results.get("serpapi_pagination", {}):
            search.params_dict.update(
                dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query))
            )
        else:
            break

    return organic_results_data
def scrape_cite_results():
    """Fetch Google Scholar citation entries for every scraped organic result.

    Runs `scrape_organic_results()` and, for each result ID, queries the
    SerpApi Google Scholar Cite engine.

    Returns:
        list[dict]: one dict per citation with the source result title,
        citation title, and citation snippet.
    """
    all_citations = []
    for organic_result in scrape_organic_results():
        search = GoogleSearch({
            "api_key": os.getenv("API_KEY"),  # your serpapi api key
            "engine": "google_scholar_cite",  # search engine
            "q": organic_result["result_id"]  # citation ID
        })
        cite_page = search.get_dict()
        print(f"Extracting {organic_result.get('result_id')} citation ID.")

        for cite_entry in cite_page.get("citations", []):
            all_citations.append({
                "organic_result_title": organic_result.get("title"),
                "citation_title": cite_entry.get("title"),
                "citation_snippet": cite_entry.get("snippet")
            })
    return all_citations
def save_organic_results_to_csv():
    """Scrape organic results and write them to google_scholar_organic_results.csv."""
    print("waiting for organic results to save..")
    organic_df = pd.DataFrame(data=scrape_organic_results())
    # index=False: don't write pandas' row index as a spurious first column
    # (matches save_cite_results_to_csv).
    organic_df.to_csv("google_scholar_organic_results.csv", encoding="utf-8", index=False)
def save_cite_results_to_csv():
    """Scrape citation results and write them to google_scholar_citation_results.csv."""
    print("waiting for cite results to save..")
    rows = scrape_cite_results()
    pd.DataFrame(data=rows).to_csv(
        "google_scholar_citation_results.csv", encoding="utf-8", index=False
    )
# Guard the entry point so importing this module doesn't trigger scraping.
if __name__ == "__main__":
    save_organic_results_to_csv()
    save_cite_results_to_csv()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ilyazub Thank you 🧡
I've updated the Gist.