Created
February 29, 2024 02:14
-
-
Save aaronmauro/b2725befceb06ce08bbaffe155c80aa8 to your computer and use it in GitHub Desktop.
A Python script to scrape Google Scholar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8

# Scraping Google Scholar
# (Exported from a Jupyter notebook — the "# In[n]:" markers below are cell boundaries.)
#
# Sources: https://beautiful-soup-4.readthedocs.io/en/latest/ and https://requests.readthedocs.io/en/latest/
#
# User Agents: Available at https://www.useragents.me/.
# You may need to change your user agent from time to time.
# Just copy and paste to reflect your set up!

# In[1]:

"""
Warning: This software is for research and educational purposes only.
Please do not violate any local or international laws. Please do not break terms of
service with other entities. Please read and understand the code before using it.
"""

# Module metadata.
__author__ = "Aaron Mauro"
__role__ = "researcher"
__institution__ = "Brock University"
__email__ = "amauro@brocku.ca"
__status__ = "prototype/experiment"
__version__ = "0.1"
# In[2]: | |
from bs4 import BeautifulSoup | |
import requests | |
import pandas as pd | |
# In[3]:

# Prompt for a search and rewrite it in Scholar's "source:" syntax,
# e.g. "digital humanities" -> "source:digital+source:humanities".
query = input("Search Google Scholar: ")
terms = query.lower().split()
query = "".join(f"source:{term}+" for term in terms).rstrip("+")  # trailing "+" removed for the URL
# In[4]: | |
# Ask for a browser user-agent string, copied from the site mentioned above.
user_agent = input("Paste user agent text from above site: ")
# such as: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36

# In[5]:

# Browser-like headers so Google Scholar serves the normal HTML page.
headers = {
    "User-Agent": user_agent,
    "referer": "https://www.google.com/",
}
# In[6]:

# Accumulator soup: parsed HTML of every fetched results page is appended here.
# Explicit empty markup + parser avoids bs4's "no parser specified" warning.
all_soup = BeautifulSoup("", "lxml")

# In[7]:

# Fetch the first results page (start=0), whose URL shape differs from subsequent pages.
url = f"https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={query}&btnG=&oq=First"
response = requests.get(url, headers=headers, timeout=30)  # timeout: never hang indefinitely
response.raise_for_status()  # fail loudly on a block/captcha (e.g. HTTP 429) instead of parsing junk
soup = BeautifulSoup(response.content, "lxml")
all_soup.append(soup)
# In[ ]:

# Fetch a subsequent results page: pages are addressed by start=10, 20, 30, ...
# (start=0 is the first page, fetched above).
page = input("Enter page range in increments of ten: ")
# Bug fix: the original URL contained "&?q=" (stray "?"), which corrupted the q= parameter.
url = f"https://scholar.google.com/scholar?start={page}&q={query}+&hl=en&as_sdt=0,5"
response = requests.get(url, headers=headers, timeout=30)  # timeout: never hang indefinitely
response.raise_for_status()  # surface rate-limiting/blocking as an error
soup = BeautifulSoup(response.content, "lxml")
all_soup.append(soup)
# In[ ]:

# Notebook cell output: preview the accumulated soup (a no-op statement as a script).
all_soup

# In[10]:

# Each Scholar hit is wrapped in <div class="gs_ri">; collect them all.
sources = all_soup.find_all("div", {"class": "gs_ri"})
# In[13]:

# Parse every result into
#   collection = {index: {"author": [...], "title": str, "abstract": str, "citation": str}}.
collection = {}
for index, source in enumerate(sources):
    # Byline looks like "A Author, B Author - Journal, Year - Publisher"; keep only the names.
    byline_tag = source.find("div", {"class": "gs_a"})
    byline = byline_tag.get_text() if byline_tag else ""
    dash = byline.find("-")  # find() returns -1 instead of raising when there is no dash
    author = (byline[:dash] if dash != -1 else byline).strip().split(",")

    # Strip Scholar's result-type tags from the title.
    title_tag = source.find("h3")
    title = (
        title_tag.get_text().replace("[PDF]", "").replace("[HTML]", "").replace("[CITATION][C]", "").strip()
        if title_tag
        else ""
    )

    # Some entries (e.g. bare citations) have no snippet; default to an empty abstract
    # rather than crashing on a missing <div class="gs_rs">.
    abstract_tag = source.find("div", {"class": "gs_rs"})
    abstract = abstract_tag.get_text().strip() if abstract_tag else ""

    # Take the count from the "Cited by N" link only. The original joined every digit
    # in the footer, so "Cited by 12" + "All 3 versions" came out as "123".
    citation = ""
    footer = source.find("div", {"class": "gs_fl gs_flb"})
    if footer:
        for link in footer.find_all("a"):
            link_text = link.get_text()
            if "Cited by" in link_text:
                citation = "".join(ch for ch in link_text if ch.isdigit())
                break

    collection[index] = {"author": author, "title": title, "abstract": abstract, "citation": citation}
# In[14]:

# One row per scraped entry; columns: author / title / abstract / citation.
df = pd.DataFrame.from_dict(collection, orient="index")

# In[15]:

df.shape  # notebook cell output: (rows, columns) sanity check

# In[16]:

# Persist the results; index=False drops the integer row index from the CSV.
df.to_csv("./out.csv", encoding='utf-8', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.