# Scrape Google Scholar Profile Results with Python
from bs4 import BeautifulSoup
import requests, lxml, os
# Browser-like User-Agent sent with the request. The original dict literal had
# lost its header name and closing brace, which made the file unparseable.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
}

# Optional HTTP proxy read from the environment (the value is None when the
# HTTP_PROXY variable is not set).
proxies = {
    'http': os.getenv('HTTP_PROXY'),
}

# NOTE(review): the request URL is an empty string in the source, so the
# request cannot succeed as-is. Set this to the Google Scholar profiles page
# that should be scraped.
PROFILES_URL = ''


def extract_author_id(link):
    """Return everything after the first ``user=`` in *link*.

    Returns an empty string when *link* contains no ``user=`` marker.
    """
    _, _, after_keyword = link.partition('user=')
    return after_keyword


def parse_cited_by(text):
    """Return the citation count from a "Cited by N" string.

    "Cited by 107390" is split on single spaces into
    ['Cited', 'by', '107390'] and the element at index 2 is returned (as a
    string). Returns None when the text has fewer than three parts, e.g. when
    it is empty.
    """
    parts = text.split(' ')
    return parts[2] if len(parts) > 2 else None


def text_or_none(result, selector):
    """Return the text of the first match for *selector* in *result*, or None.

    ``select_one`` returns None when nothing matches, so the lookup is checked
    explicitly instead of letting ``.text`` raise AttributeError.
    """
    node = result.select_one(selector)
    return node.text if node is not None else None


def main():
    """Download the profiles page and print the fields of each listed author."""
    # A timeout keeps the script from hanging forever on an unresponsive server.
    html = requests.get(
        PROFILES_URL, headers=headers, proxies=proxies, timeout=30
    ).text
    soup = BeautifulSoup(html, 'lxml')

    # Select every container that holds one author's data.
    for result in soup.select('.gs_ai_chpr'):
        name_anchor = result.select_one('.gs_ai_name a')
        name = name_anchor.text
        link = name_anchor['href']
        author_id = extract_author_id(link)

        # These elements may be absent or empty for some profiles; fall back
        # to None rather than aborting the whole run.
        affiliations = text_or_none(result, '.gs_ai_aff')
        email = text_or_none(result, '.gs_ai_eml')
        interests = text_or_none(result, '.gs_ai_one_int')
        cited_by = parse_cited_by(text_or_none(result, '.gs_ai_cby') or '')

        # NOTE(review): the original output statement was not present in the
        # source; one field per line followed by a blank line is assumed.
        print(name, link, author_id, affiliations, email, interests, cited_by,
              sep='\n', end='\n\n')


if __name__ == '__main__':
    main()
# Part of the output:
# Jeong-Won Lee
# Samsung Medical Center
# Verified email at
# Gynecologic oncology
