A Python script to scrape Google Scholar.
#!/usr/bin/env python
# coding: utf-8
# # Scraping Google Scholar
#
# Sources: https://beautiful-soup-4.readthedocs.io/en/latest/ and https://requests.readthedocs.io/en/latest/
#
# User Agents: Available at https://www.useragents.me/.
# You may need to change your user agent from time to time.
# Just copy and paste a current string that reflects your setup!
# In[1]:
"""
Warning: This software is for research and educational purposes only.
Please do not violate any local or international laws. Please do not break terms of
service with other entities. Please read and understand the code before using it.
"""
__author__ = "Aaron Mauro"
__role__ = "researcher"
__institution__ = "Brock University"
__email__ = "amauro@brocku.ca"
__status__ = "prototype/experiment"
__version__ = "0.1"
# In[2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
# In[3]:
# enter search query
query = input("Search Google Scholar: ")
query = query.lower().split()
query = "".join(["source:"+term+"+" for term in query]).rstrip("+") # builds the query format for the url
# In[4]:
user_agent = input("Paste user agent text from above site: ")
#such as: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36
# In[5]:
#define headers to pretend to be a browser
headers = {"User-Agent":f"{user_agent}","referer":"https://www.google.com/"}
# In[6]:
#create an empty soup container to accumulate the parsed pages
all_soup = BeautifulSoup("", "lxml")
# In[7]:
#run the first page, whose URL is different from subsequent pages... to thwart us...
url = f"https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={query}&btnG=&oq=First"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')
all_soup.append(soup)
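#optional check (a minimal sketch): confirm the request succeeded and that
#result blocks were actually parsed; a count of 0 usually means Scholar served
#a CAPTCHA or block page instead of results
print(response.status_code, len(soup.find_all("div", {"class": "gs_ri"})))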
# In[ ]:
#now fetch the following pages; results are paginated by the "start" parameter
#in increments of ten (start=10 is page 2, start=20 is page 3, and so on)
#note that the first page is start=0, which we completed above
#re-run this cell with a new start value for each additional page you want
page = input("Enter the start value (10, 20, 30, ...): ")
url = f"https://scholar.google.com/scholar?start={page}&q={query}&hl=en&as_sdt=0,5"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')
all_soup.append(soup)
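#if you fetch several pages in a row, a short pause between requests reduces
#the chance of being rate limited (the 5-second delay here is an assumption;
#adjust to taste)
import time
time.sleep(5)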
# In[ ]:
all_soup
# In[10]:
#each gs_ri div holds one result's title, author line, snippet, and footer links
sources = all_soup.find_all("div", {"class": "gs_ri"})
# In[13]:
index = 0
collection = {}
for source in sources:
    # the gs_a line looks like "A Author, B Author - Journal, Year - publisher";
    # keep only the names before the first dash
    author = source.find("div", {"class": "gs_a"}).get_text()
    author = author[:author.index("-")].strip().split(",")
    # title, with Scholar's [PDF]/[HTML]/[CITATION] tags stripped
    title = source.find("h3").get_text().replace("[PDF]", "").replace("[HTML]", "").replace("[CITATION][C]", "").strip()
    # some citation-only entries have no snippet, so guard against a missing gs_rs div
    abstract_div = source.find("div", {"class": "gs_rs"})
    abstract = abstract_div.get_text().strip() if abstract_div else ""
    # keep only the digits from the "Cited by ..." footer line
    citation = "".join([ch for ch in source.find("div", {"class": "gs_fl gs_flb"}).get_text() if ch.isdigit()])
    collection.update({index: {"author": author, "title": title, "abstract": abstract, "citation": citation}})
    index += 1
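#optional check: inspect the first entry; the keys should be "author",
#"title", "abstract", and "citation"
print(collection.get(0))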
# In[14]:
df = pd.DataFrame.from_dict(collection, orient="index")
# In[15]:
df.shape
# In[16]:
df.to_csv("./out.csv", encoding='utf-8', index=False)