Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape Google Scholar Profile Results with Python
from bs4 import BeautifulSoup
import requests, lxml, os
# Request headers: a desktop-browser User-Agent string is sent with the request.
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Proxy mapping handed to requests; the value is None when HTTP_PROXY is unset.
# NOTE(review): only the 'http' scheme is mapped here while the URL requested
# below is https -- confirm whether an 'https' entry was intended.
proxies = {
    'http': os.getenv('HTTP_PROXY')
}

# Fetch the organization page. An explicit timeout is passed because requests
# waits indefinitely by default, so an unresponsive server or proxy would
# otherwise hang the script forever.
html = requests.get(
    'https://scholar.google.com/citations?view_op=view_org&hl=en&org=9834965952280547731',
    headers=headers,
    proxies=proxies,
    timeout=30,
).text

# Parse the returned markup with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
def _text_or_none(container, selector):
    """Return the text of the first element matching *selector*, or None.

    ``select_one`` returns None when nothing matches, so reading ``.text``
    on its result directly raises AttributeError for profiles that lack
    the field.
    """
    node = container.select_one(selector)
    return node.text if node is not None else None


# Selecting container where all data located
for result in soup.select('.gs_ai_chpr'):
    name_anchor = result.select_one('.gs_ai_name a')
    if name_anchor is None:
        # No name anchor means there is neither a name nor a profile link
        # to report for this entry, so skip it instead of crashing.
        continue
    name = name_anchor.text
    link = name_anchor['href']

    # https://stackoverflow.com/a/6633693/15164646
    # Take what follows 'user=' and stop at the next '&', so any query
    # parameters after the user id are not included in it.
    author_id = link.partition('user=')[2].partition('&')[0]

    # These fields may be missing on a profile; missing ones become None.
    affiliations = _text_or_none(result, '.gs_ai_aff')
    email = _text_or_none(result, '.gs_ai_eml')
    interests = _text_or_none(result, '.gs_ai_one_int')

    # "Cited by 107516" -> split on whitespace -> ['Cited', 'by', '107516']
    # -> index 2 is the number. An absent or empty cell yields None instead
    # of raising IndexError.
    cited_by_parts = (_text_or_none(result, '.gs_ai_cby') or '').split()
    cited_by = cited_by_parts[2] if len(cited_by_parts) > 2 else None

    print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')

# Part of the output:
'''
Jeong-Won Lee
https://scholar.google.com/citations?hl=en&user=D41VK7AAAAAJ
D41VK7AAAAAJ
Samsung Medical Center
Verified email at samsung.com
Gynecologic oncology
107516
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment