Skip to content

Instantly share code, notes, and snippets.

@dimitryzub
Last active April 18, 2023 22:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimitryzub/30e5b4f446094251789ac13bc7c60fa3 to your computer and use it in GitHub Desktop.
Save dimitryzub/30e5b4f446094251789ac13bc7c60fa3 to your computer and use it in GitHub Desktop.
Scrape Google Scholar Organic Results with Python
from bs4 import BeautifulSoup
import requests, lxml, os, json
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
proxies = {
'http': os.getenv('HTTP_PROXY') # or just type proxy here without os.getenv()
}
html = requests.get('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=samsung&oq=', headers=headers, proxies=proxies).text
soup = BeautifulSoup(html, 'lxml')
# Scrape just PDF links
for pdf_link in soup.select('.gs_or_ggsm a'):
pdf_file_link = pdf_link['href']
print(pdf_file_link)
# JSON data will be collected here
data = []
# Container where all needed data is located
for result in soup.select('.gs_ri'):
title = result.select_one('.gs_rt').text
title_link = result.select_one('.gs_rt a')['href']
publication_info = result.select_one('.gs_a').text
snippet = result.select_one('.gs_rs').text
cited_by = result.select_one('#gs_res_ccl_mid .gs_nph+ a')['href']
related_articles = result.select_one('a:nth-child(4)')['href']
try:
all_article_versions = result.select_one('a~ a+ .gs_nph')['href']
except:
all_article_versions = None
data.append({
'title': title,
'title_link': title_link,
'publication_info': publication_info,
'snippet': snippet,
'cited_by': f'https://scholar.google.com{cited_by}',
'related_articles': f'https://scholar.google.com{related_articles}',
'all_article_versions': f'https://scholar.google.com{all_article_versions}',
})
print(json.dumps(data, indent = 2, ensure_ascii = False))
# Part of the JSON Output:
'''
[
{
"title": "“What? I thought Samsung was Japanese”: accurate or not, perceived country of origin matters",
"title_link": "https://www.emerald.com/insight/content/doi/10.1108/02651331111167589/full/html",
"publication_info": "P Magnusson, SA Westjohn… - International Marketing …, 2011 - emerald.com",
"snippet": "Purpose–Extensive research has shown that country‐of‐origin (COO) information significantly affects product evaluations and buying behavior. Yet recently, a competing perspective has emerged suggesting that COO effects have been inflated in prior research …",
"cited_by": "https://scholar.google.com/scholar?cites=341074171610121811&as_sdt=2005&sciodt=0,5&hl=en",
"related_articles": "https://scholar.google.com/scholar?q=related:U8bh6Ca9uwQJ:scholar.google.com/&scioq=samsung&hl=en&as_sdt=0,5",
"all_article_versions": "https://scholar.google.com/scholar?cluster=341074171610121811&hl=en&as_sdt=0,5"
}
]
'''
# Part of PDF Links Output:
'''
https://www.researchgate.net/profile/Peter_Magnusson/publication/232614407_What_I_thought_Samsung_was_Japanese_Accurate_or_not_perceived_country_of_origin_matters/links/09e4150881184a6ad2000000/What-I-thought-Samsung-was-Japanese-Accurate-or-not-perceived-country-of-origin-matters.pdf
https://www.researchgate.net/profile/Hong_Mo_Yang/publication/235291000_Supply_chain_management_six_sigma_A_management_innovation_methodology_at_the_Samsung_Group/links/56e03d0708aec4b3333d0445.pdf
https://www.academia.edu/download/54053930/The_Strategic_Localization_of_Transnatio20170803-32468-4ntcqr.pdf
https://mathsci2.appstate.edu/~wmcb/Class/5340/ClassNotes141/EdelmanAwards/Interfaces2002-S.pdf
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment