Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Scrape Google Scholar Profile Results with Python
from bs4 import BeautifulSoup
import requests, lxml, os
# Scrape author cards from a Google Scholar profiles listing page and print
# the name, profile link, author id, affiliation, email line, first listed
# interest and citation count of every author found.

# Send a browser-like User-Agent string with the request.
headers = {
    'User-Agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
}

# Proxy for plain-HTTP traffic, taken from the HTTP_PROXY environment
# variable (None when the variable is not set).
proxies = {
    'http': os.getenv('HTTP_PROXY'),
}

# NOTE(review): the target URL is empty in this copy of the script; set it to
# the Google Scholar page that lists the author profiles before running.
scholar_url = ''

# A timeout keeps the script from hanging forever on an unresponsive server.
html = requests.get(scholar_url, headers=headers, proxies=proxies, timeout=30).text
soup = BeautifulSoup(html, 'lxml')

# Selecting container where all data located
for result in soup.select('.gs_ai_chpr'):
    name_anchor = result.select_one('.gs_ai_name a')
    name = name_anchor.text
    link = name_anchor['href']

    # The author id is the value of the "user=" query parameter in the
    # profile link; cut it off at the next "&" in case more parameters follow.
    author_id = link.partition('user=')[2].partition('&')[0]

    affiliations = result.select_one('.gs_ai_aff').text
    email = result.select_one('.gs_ai_eml').text

    # Not every card has an interests element: select_one() returns None then.
    interests_tag = result.select_one('.gs_ai_one_int')
    interests = interests_tag.text if interests_tag is not None else None

    # "Cited by 107390" = getting text string -> splitting by a space ->
    # ['Cited', 'by', '107390'] and taking [2] index which is the number.
    # The text can be shorter than three words, so guard the index.
    cited_by_parts = result.select_one('.gs_ai_cby').text.split(' ')
    cited_by = cited_by_parts[2] if len(cited_by_parts) > 2 else None

    # Emit one field per line, followed by a blank line between authors.
    print(name, link, author_id, affiliations, email, interests, cited_by, sep='\n')
    print()
# Part of the output:
# Jeong-Won Lee
# Samsung Medical Center
# Verified email at
# Gynecologic oncology
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment