Created
February 29, 2024 02:14
-
-
Save aaronmauro/b2725befceb06ce08bbaffe155c80aa8 to your computer and use it in GitHub Desktop.
A Python script to scrape Google Scholar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8

# Scraping Google Scholar
# (Exported from a Jupyter notebook — the "# In[n]:" markers below are cell boundaries.)
#
# Sources: https://beautiful-soup-4.readthedocs.io/en/latest/ and https://requests.readthedocs.io/en/latest/
#
# User Agents: Available at https://www.useragents.me/.
# You may need to change your user agent from time to time.
# Just copy and paste to reflect your set up!

# In[1]:

"""
Warning: This software is for research and educational purposes only.
Please do not violate any local or international laws. Please do not break terms of
service with other entities. Please read and understand the code before using it.
"""

# Module metadata.
__author__ = "Aaron Mauro"
__role__ = "researcher"
__institution__ = "Brock University"
__email__ = "amauro@brocku.ca"
__status__ = "prototype/experiment"
__version__ = "0.1"
# In[2]: | |
from bs4 import BeautifulSoup | |
import requests | |
import pandas as pd | |
# In[3]:

# Prompt for a search and rewrite it in Scholar's "source:" syntax,
# e.g. "digital humanities" -> "source:digital+source:humanities".
query = input("Search Google Scholar: ")
terms = query.lower().split()
query = "".join(f"source:{term}+" for term in terms).rstrip("+")  # trailing "+" removed for the URL
# In[4]: | |
# Ask for a browser user-agent string, copied from the site mentioned above.
user_agent = input("Paste user agent text from above site: ")
# such as: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36

# In[5]:

# Browser-like headers so Google Scholar serves the normal HTML page.
headers = {
    "User-Agent": user_agent,
    "referer": "https://www.google.com/",
}
# In[6]:

# Accumulator soup: parsed HTML of every fetched results page is appended here.
# Explicit empty markup + parser avoids bs4's "no parser specified" warning.
all_soup = BeautifulSoup("", "lxml")

# In[7]:

# Fetch the first results page (start=0), whose URL shape differs from subsequent pages.
url = f"https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={query}&btnG=&oq=First"
response = requests.get(url, headers=headers, timeout=30)  # timeout: never hang indefinitely
response.raise_for_status()  # fail loudly on a block/captcha (e.g. HTTP 429) instead of parsing junk
soup = BeautifulSoup(response.content, "lxml")
all_soup.append(soup)
# In[ ]:

# Fetch a subsequent results page: pages are addressed by start=10, 20, 30, ...
# (start=0 is the first page, fetched above).
page = input("Enter page range in increments of ten: ")
# Bug fix: the original URL contained "&?q=" (stray "?"), which corrupted the q= parameter.
url = f"https://scholar.google.com/scholar?start={page}&q={query}+&hl=en&as_sdt=0,5"
response = requests.get(url, headers=headers, timeout=30)  # timeout: never hang indefinitely
response.raise_for_status()  # surface rate-limiting/blocking as an error
soup = BeautifulSoup(response.content, "lxml")
all_soup.append(soup)
# In[ ]:

# Notebook cell output: preview the accumulated soup (a no-op statement as a script).
all_soup

# In[10]:

# Each Scholar hit is wrapped in <div class="gs_ri">; collect them all.
sources = all_soup.find_all("div", {"class": "gs_ri"})
# In[13]:

# Parse every result into
#   collection = {index: {"author": [...], "title": str, "abstract": str, "citation": str}}.
collection = {}
for index, source in enumerate(sources):
    # Byline looks like "A Author, B Author - Journal, Year - Publisher"; keep only the names.
    byline_tag = source.find("div", {"class": "gs_a"})
    byline = byline_tag.get_text() if byline_tag else ""
    dash = byline.find("-")  # find() returns -1 instead of raising when there is no dash
    author = (byline[:dash] if dash != -1 else byline).strip().split(",")

    # Strip Scholar's result-type tags from the title.
    title_tag = source.find("h3")
    title = (
        title_tag.get_text().replace("[PDF]", "").replace("[HTML]", "").replace("[CITATION][C]", "").strip()
        if title_tag
        else ""
    )

    # Some entries (e.g. bare citations) have no snippet; default to an empty abstract
    # rather than crashing on a missing <div class="gs_rs">.
    abstract_tag = source.find("div", {"class": "gs_rs"})
    abstract = abstract_tag.get_text().strip() if abstract_tag else ""

    # Take the count from the "Cited by N" link only. The original joined every digit
    # in the footer, so "Cited by 12" + "All 3 versions" came out as "123".
    citation = ""
    footer = source.find("div", {"class": "gs_fl gs_flb"})
    if footer:
        for link in footer.find_all("a"):
            link_text = link.get_text()
            if "Cited by" in link_text:
                citation = "".join(ch for ch in link_text if ch.isdigit())
                break

    collection[index] = {"author": author, "title": title, "abstract": abstract, "citation": citation}
# In[14]:

# One row per scraped entry; columns: author / title / abstract / citation.
df = pd.DataFrame.from_dict(collection, orient="index")

# In[15]:

df.shape  # notebook cell output: (rows, columns) sanity check

# In[16]:

# Persist the results; index=False drops the integer row index from the CSV.
df.to_csv("./out.csv", encoding='utf-8', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.