Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimitryzub/7f47a6bfd45f56cb6406024539843302 to your computer and use it in GitHub Desktop.
Scrape Google Scholar Profile-Author Results with Python
from bs4 import BeautifulSoup
import requests, lxml, os, json
# Request headers: a real desktop browser user-agent string reduces the
# chance of Google Scholar returning a blocked/CAPTCHA page.
headers = {
    'User-agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

# Optional HTTP proxy pulled from the environment; when HTTP_PROXY is unset
# the value is None and requests simply makes a direct connection.
proxies = {
    'http': os.getenv('HTTP_PROXY'),
}
def bs4_scrape_profile_results():
    """Scrape one page of Google Scholar profile results for a fixed
    organization and return the author IDs found on it.

    Returns:
        list[str]: the ``user=`` ID extracted from each profile link.
    """
    html = requests.get(
        'https://scholar.google.com/citations?view_op=view_org&hl=en&org=9834965952280547731',
        headers=headers,
        proxies=proxies,
    ).text
    soup = BeautifulSoup(html, 'lxml')

    author_ids = []
    for result in soup.select('.gs_ai_chpr'):
        name = result.select_one('.gs_ai_name a').text
        link = result.select_one('.gs_ai_name a')['href']
        # The href looks like '/citations?hl=en&user=<ID>'; the author ID is
        # everything after 'user='. partition()[2] yields '' if the marker is
        # absent. https://stackoverflow.com/a/6633693/15164646
        author_id = link.partition('user=')[2]
        affiliations = result.select_one('.gs_ai_aff').text
        email = result.select_one('.gs_ai_eml').text
        # Interests are optional on a profile card; select_one() returns None
        # when the node is absent, so test explicitly instead of a bare except.
        interests_node = result.select_one('.gs_ai_one_int')
        interests = interests_node.text if interests_node is not None else None
        # 'Cited by N' -> keep only the number.
        cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]
        author_ids.append(author_id)

    print(author_ids)
    return author_ids
def bs4_scrape_author_result(profiles):
    """Scrape and print each author's Google Scholar profile page.

    For every author ID this prints the author info, their articles, the
    cited-by / public-access table, co-authors, and the citations-per-year
    graph data.

    Args:
        profiles: iterable of Google Scholar author IDs (``user=`` values),
            e.g. the list returned by ``bs4_scrape_profile_results``.
    """
    for author_id in profiles:  # renamed from 'id' to avoid shadowing the builtin
        html = requests.get(
            f'https://scholar.google.com/citations?hl=en&user={author_id}',
            headers=headers,
            proxies=proxies,
        ).text
        soup = BeautifulSoup(html, 'lxml')

        # --- Author info ---
        name = soup.select_one('#gsc_prf_in').text
        affiliation = soup.select_one('#gsc_prf_in+ .gsc_prf_il').text
        # Email and interests are optional; select_one() returns None when the
        # node is absent, so check explicitly instead of a bare except.
        email_node = soup.select_one('#gsc_prf_ivh')
        email = email_node.text if email_node is not None else None
        interests_node = soup.select_one('#gsc_prf_int')
        interests = interests_node.text if interests_node is not None else None
        print('Author info:')
        print(f'{name}\n{affiliation}\n{email}\n{interests}\n')

        # --- Article info ---
        for article_info in soup.select('#gsc_a_b .gsc_a_t'):
            title = article_info.select_one('.gsc_a_at').text
            title_link = article_info.select_one('.gsc_a_at')['data-href']
            authors = article_info.select_one('.gsc_a_at+ .gs_gray').text
            publications = article_info.select_one('.gs_gray+ .gs_gray').text
            print('Article info:')
            # BUG FIX: the original used '\Article ...' which is a literal
            # backslash, not a newline escape; the separator must be '\n'.
            print(f'Title: {title}\nTitle link: https://scholar.google.com{title_link}\n'
                  f'Article Author(s): {authors}\nArticle Publication(s): {publications}\n')

        # --- Cited-by and public-access info ---
        for cited_by_public_access in soup.select('.gsc_rsb'):
            citations_all = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std').text
            citations_since2016 = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std').text
            h_index_all = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std').text
            h_index_2016 = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std').text
            i10_index_all = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std').text
            i10_index_2016 = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std').text
            articles_num = cited_by_public_access.select_one('.gsc_rsb_m_a:nth-child(1) span').text.split(' ')[0]
            articles_link = cited_by_public_access.select_one('#gsc_lwp_mndt_lnk')['href']
            print('Citation info:')  # fixed 'Citiation' typo in the output label
            print(f'{citations_all}\n{citations_since2016}\n{h_index_all}\n{h_index_2016}\n'
                  f'{i10_index_all}\n{i10_index_2016}\n{articles_num}\n'
                  f'https://scholar.google.com{articles_link}\n')

        # --- Co-authors ---
        # The original wrapped this loop in a bare try/except, so one malformed
        # entry silently dropped ALL remaining co-authors. Skip only the bad entry.
        for container in soup.select('.gsc_rsb_aa'):
            author_link_node = container.select_one('#gsc_rsb_co a')
            affiliation_node = container.select_one('.gsc_rsb_a_ext')
            if author_link_node is None or affiliation_node is None:
                continue
            print('Co-Author(s):')
            print(f'{author_link_node.text}\n{affiliation_node.text}\n'
                  f'https://scholar.google.com{author_link_node["href"]}\n')

        # --- Graph results (citations per year) ---
        years = [graph_year.text for graph_year in soup.select('.gsc_g_t')]
        citations = [graph_citation.text for graph_citation in soup.select('.gsc_g_a')]
        data = []
        for year, citation in zip(years, citations):
            print(f'{year} {citation}\n')
            data.append({
                'year': year,
                'citation': citation,
            })
        # print(json.dumps(data, indent=2))
# Entry point: collect author IDs from the organization results page, then
# scrape and print each author's full profile.
profiles = bs4_scrape_profile_results()
bs4_scrape_author_result(profiles)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment