Last active
April 18, 2023 22:31
-
-
Save dimitryzub/30e5b4f446094251789ac13bc7c60fa3 to your computer and use it in GitHub Desktop.
Scrape Google Scholar Organic Results with Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
from urllib.parse import urljoin

import lxml  # not referenced directly; BeautifulSoup's 'lxml' parser needs the package installed
import requests
from bs4 import BeautifulSoup
# Origin that Scholar's relative links (cited by / related / versions) are resolved against.
SCHOLAR_BASE_URL = 'https://scholar.google.com'

# Search page scraped by this script (query: "samsung").
SEARCH_URL = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=samsung&oq='

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# NOTE(review): requests picks a proxy by URL scheme and only 'http' is mapped
# here, while SEARCH_URL is https -- add an 'https' entry if the proxy is meant
# to cover this request.
proxies = {
    'http': os.getenv('HTTP_PROXY')  # or just type proxy here without os.getenv()
}


def absolute_url(href):
    """Return ``href`` resolved against the Scholar origin.

    Returns ``None`` when ``href`` is missing or empty, so an absent link is
    reported as JSON ``null`` rather than a string ending in ``None``.
    """
    if not href:
        return None
    return urljoin(SCHOLAR_BASE_URL, href)


def first_text(node, selector):
    """Return the text of the first element under ``node`` matching ``selector``, or ``None``."""
    match = node.select_one(selector)
    return match.text if match is not None else None


def first_href(node, selector):
    """Return the ``href`` of the first element under ``node`` matching ``selector``, or ``None``."""
    match = node.select_one(selector)
    return match.get('href') if match is not None else None


def extract_pdf_links(soup):
    """Return the ``href`` of every anchor inside a ``.gs_or_ggsm`` element."""
    hrefs = (anchor.get('href') for anchor in soup.select('.gs_or_ggsm a'))
    return [href for href in hrefs if href]


def parse_results(soup):
    """Collect one dict per ``.gs_ri`` result container found in ``soup``.

    Any field whose element is not present in a result is set to ``None``
    instead of aborting the whole run.
    """
    data = []
    # Container where all needed data is located
    for result in soup.select('.gs_ri'):
        data.append({
            'title': first_text(result, '.gs_rt'),
            'title_link': first_href(result, '.gs_rt a'),
            'publication_info': first_text(result, '.gs_a'),
            'snippet': first_text(result, '.gs_rs'),
            'cited_by': absolute_url(first_href(result, '#gs_res_ccl_mid .gs_nph+ a')),
            'related_articles': absolute_url(first_href(result, 'a:nth-child(4)')),
            'all_article_versions': absolute_url(first_href(result, 'a~ a+ .gs_nph')),
        })
    return data


def main():
    """Fetch the search page, print the PDF links, then print the results as JSON."""
    response = requests.get(SEARCH_URL, headers=headers, proxies=proxies, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Scrape just PDF links
    for pdf_file_link in extract_pdf_links(soup):
        print(pdf_file_link)

    # JSON data is collected by parse_results()
    print(json.dumps(parse_results(soup), indent=2, ensure_ascii=False))


if __name__ == '__main__':
    main()

# Part of the JSON Output:
'''
[
  {
    "title": "“What? I thought Samsung was Japanese”: accurate or not, perceived country of origin matters",
    "title_link": "https://www.emerald.com/insight/content/doi/10.1108/02651331111167589/full/html",
    "publication_info": "P Magnusson, SA Westjohn… - International Marketing …, 2011 - emerald.com",
    "snippet": "Purpose–Extensive research has shown that country‐of‐origin (COO) information significantly affects product evaluations and buying behavior. Yet recently, a competing perspective has emerged suggesting that COO effects have been inflated in prior research …",
    "cited_by": "https://scholar.google.com/scholar?cites=341074171610121811&as_sdt=2005&sciodt=0,5&hl=en",
    "related_articles": "https://scholar.google.com/scholar?q=related:U8bh6Ca9uwQJ:scholar.google.com/&scioq=samsung&hl=en&as_sdt=0,5",
    "all_article_versions": "https://scholar.google.com/scholar?cluster=341074171610121811&hl=en&as_sdt=0,5"
  }
]
'''
# Part of PDF Links Output:
'''
https://www.researchgate.net/profile/Peter_Magnusson/publication/232614407_What_I_thought_Samsung_was_Japanese_Accurate_or_not_perceived_country_of_origin_matters/links/09e4150881184a6ad2000000/What-I-thought-Samsung-was-Japanese-Accurate-or-not-perceived-country-of-origin-matters.pdf
https://www.researchgate.net/profile/Hong_Mo_Yang/publication/235291000_Supply_chain_management_six_sigma_A_management_innovation_methodology_at_the_Samsung_Group/links/56e03d0708aec4b3333d0445.pdf
https://www.academia.edu/download/54053930/The_Strategic_Localization_of_Transnatio20170803-32468-4ntcqr.pdf
https://mathsci2.appstate.edu/~wmcb/Class/5340/ClassNotes141/EdelmanAwards/Interfaces2002-S.pdf
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment