Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimitryzub/b67e3655e663b42fec41ec342d0a1482 to your computer and use it in GitHub Desktop.
Save dimitryzub/b67e3655e663b42fec41ec342d0a1482 to your computer and use it in GitHub Desktop.
Web Scraping Google Scholar Organic, Cite Results to CSV with Python | SerpApi
# Video tutorial - https://www.youtube.com/watch?v=IXcycQwpFH0
# https://serpapi.com/google-scholar-api
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
import os, json
def _to_int(value):
    """Return *value* converted to ``int``, or ``None`` if missing/non-numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _parse_organic_result(result, page_number):
    """Flatten one organic-result dict from the API into a CSV-ready row."""
    inline_links = result.get("inline_links", {})
    cited_by = inline_links.get("cited_by", {})
    versions = inline_links.get("versions", {})

    # Only the first attached resource (e.g. a PDF link) is exported; fall
    # back to an empty dict when the result has no resources at all.
    resources = result.get("resources") or [{}]
    first_resource = resources[0]

    position = result.get("position")

    return {
        "page_number": page_number,
        # Stored as API position + 1 (presumably converting a 0-based API
        # position to 1-based -- confirm). Guarded so that a result without
        # a position does not raise TypeError.
        "position": position + 1 if position is not None else None,
        "result_type": result.get("type"),
        "title": result.get("title"),
        "link": result.get("link"),
        # Deliberately a hard lookup: the cite scraper needs this ID.
        "result_id": result["result_id"],
        "publication_info_summary": result.get("publication_info", {}).get("summary"),
        "snippet": result.get("snippet"),
        "cited_by_count": _to_int(cited_by.get("total")),
        "cited_by_link": cited_by.get("link"),
        "cited_by_id": cited_by.get("cites_id"),
        "total_versions": _to_int(versions.get("total")),
        "all_versions_link": versions.get("link"),
        "all_versions_id": versions.get("cluster_id"),
        "file_format": first_resource.get("file_format"),
        "file_title": first_resource.get("title"),
        "file_link": first_resource.get("link"),
    }


def scrape_organic_results():
    """Scrape every page of Google Scholar organic results through SerpApi.

    The SerpApi key is read from the ``API_KEY`` environment variable. If you
    are calling the script from the command line, pass it inline, e.g.
    ``API_KEY=<your-api-key> python your-script-file.py``.

    Pagination is followed through the ``next`` link of the response's
    ``serpapi_pagination`` section until no such link is returned.

    Returns:
        list[dict]: one row per publication with the keys ``page_number``,
        ``position``, ``result_type``, ``title``, ``link``, ``result_id``,
        ``publication_info_summary``, ``snippet``, ``cited_by_count``,
        ``cited_by_link``, ``cited_by_id``, ``total_versions``,
        ``all_versions_link``, ``all_versions_id``, ``file_format``,
        ``file_title`` and ``file_link``. Any field absent from the response
        is ``None``.

    Raises:
        KeyError: if an organic result has no ``result_id``.
    """
    params = {
        "api_key": os.getenv("API_KEY"),  # your serpapi api key
        "engine": "google_scholar",  # search engine
        "q": "minecraft education benefits statistics review x-ray",  # search query
        "hl": "en",  # language
    }
    search = GoogleSearch(params)

    organic_results_data = []

    while True:
        results = search.get_dict()
        pagination = results.get("serpapi_pagination", {})
        page_number = pagination.get("current")

        print(f"Extracting publications from page #{page_number}.")

        for result in results.get("organic_results", []):
            organic_results_data.append(_parse_organic_result(result, page_number))

        if "next" not in pagination:
            break

        # Re-use the same search object: copy the query parameters of the
        # "next" URL into it so the following get_dict() fetches that page.
        next_query = urlsplit(pagination["next"]).query
        search.params_dict.update(dict(parse_qsl(next_query)))

    return organic_results_data
def scrape_cite_results(organic_results=None):
    """Fetch the cite results for each organic result through SerpApi.

    Args:
        organic_results: optional iterable of organic-result dicts, each
            holding a ``result_id`` key (and, optionally, ``title``) in the
            shape returned by ``scrape_organic_results()``. When omitted, the
            organic results are scraped first. Pass results you already have
            to avoid repeating those requests.

    Returns:
        list[dict]: one entry per citation with the keys
        ``organic_result_title``, ``citation_title`` and ``citation_snippet``.

    Raises:
        KeyError: if an organic result has no ``result_id``.
    """
    if organic_results is None:
        organic_results = scrape_organic_results()

    citations = []

    for organic_result in organic_results:
        result_id = organic_result["result_id"]

        params = {
            "api_key": os.getenv("API_KEY"),  # your serpapi api key
            "engine": "google_scholar_cite",  # search engine
            "q": result_id,  # citation ID
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        print(f"Extracting {result_id} citation ID.")

        # Some responses carry no "citations" key; treat that as no citations.
        for citation in results.get("citations", []):
            citations.append({
                "organic_result_title": organic_result.get("title"),
                "citation_title": citation.get("title"),
                "citation_snippet": citation.get("snippet"),
            })

    return citations
def save_organic_results_to_csv():
    """Scrape the organic results and write them to a CSV file.

    The rows returned by ``scrape_organic_results()`` are written, UTF-8
    encoded, to ``google_scholar_organic_results.csv`` in the current working
    directory. The DataFrame index is not written, matching
    ``save_cite_results_to_csv()``; each row already carries ``page_number``
    and ``position``.
    """
    print("waiting for organic results to save..")
    organic_df = pd.DataFrame(data=scrape_organic_results())
    organic_df.to_csv("google_scholar_organic_results.csv", encoding="utf-8", index=False)
def save_cite_results_to_csv():
    """Scrape the cite results and dump them to a CSV file.

    Writes the rows from ``scrape_cite_results()`` to
    ``google_scholar_citation_results.csv`` (UTF-8, no index column) in the
    current working directory.
    """
    print("waiting for cite results to save..")
    citation_rows = scrape_cite_results()
    pd.DataFrame(citation_rows).to_csv(
        "google_scholar_citation_results.csv",
        index=False,
        encoding="utf-8",
    )
# Script entry: these calls run at module level, so they execute whenever the
# file is run (or imported). The organic-results CSV is written first, then
# the citation CSV.
# NOTE(review): each call triggers its own scrape -- save_cite_results_to_csv()
# reaches scrape_organic_results() again through scrape_cite_results(), so the
# organic results are fetched twice per run.
save_organic_results_to_csv()
save_cite_results_to_csv()
@ilyazub
Copy link

ilyazub commented Oct 25, 2022

Sometimes the script fails with KeyError: 'citations', because the cite response does not always contain a "citations" key.

KeyError: 'citations'

Fix:

- for citation in results["citations"]:
+ for citation in results.get("citations", []):

Result:

image

@dimitryzub
Copy link
Author

dimitryzub commented Oct 25, 2022

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment