Created
January 7, 2022 03:25
-
-
Save Lucs1590/48993772e6fb16c704d3f97f2c5f1629 to your computer and use it in GitHub Desktop.
Script to get articles from Google Scholar, browsing through all the result pages of the searched topic.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys
import time

import requests
from lxml import html
# Parallel accumulators filled while paging through the results; they are
# zipped together row by row when the CSV is written.
title_list = []
author_list = []
def get_authors(element):
    """Return the author part of a result's byline element.

    Args:
        element: A node exposing ``text_content()``; the script passes the
            ``div.gs_a`` elements parsed by lxml.

    Returns:
        The byline text that precedes the first non-breaking space, with
        surrounding whitespace removed. If the text contains no
        non-breaking space the whole text is returned.
    """
    # The names are read from the full byline text instead of joining the
    # text of the child elements: joining children silently dropped every
    # name that was not wrapped in its own child element and produced a
    # different separator (',' vs. the page's own ', ').
    # NOTE(review): presumably only some authors are wrapped in links on the
    # page, which is when the child-only join lost names -- confirm against
    # live markup.
    non_break_space = '\xa0'
    return element.text_content().split(non_break_space)[0].strip()
# Walk the result pages by offset (the step of 10 matches one page of
# results). The offset starts at 0 so the first page is not skipped.
for page in range(0, 720, 10):
    try:
        page_content = requests.get(
            f'https://scholar.google.com/scholar?start={page}&q=(odonto+OR+dentistry)+AND+deep+learning+AND+(semantic+segmentation+AND+instance+segmentation+OR+panoptic+segmentation)+AND+CNN&hl=pt-BR&as_sdt=2007&as_ylo=2019&as_yhi=2022',
            timeout=30,
        )
        # An error status (e.g. when the server refuses further requests)
        # would otherwise be parsed as an empty result page.
        page_content.raise_for_status()
    except requests.RequestException as error:
        # Stop paging but still write out everything collected so far.
        print(f'Stopping at offset {page}: {error}', file=sys.stderr)
        break

    tree = html.fromstring(page_content.content)

    # Read title and byline from the same result container. Selecting them
    # with two independent page-wide queries and zipping the lists pairs
    # titles with the wrong authors as soon as one result lacks either part.
    for result in tree.xpath('//div[@class="gs_ri"]'):
        titles = result.xpath('./h3/a')
        authors = result.xpath('.//div[@class="gs_a"]')
        if not titles or not authors:
            # Results without a linked title were never collected as titles;
            # skip the whole entry so both lists stay aligned.
            continue
        title_list.append(titles[0].text_content())
        author_list.append(get_authors(authors[0]))

    # Pause between requests to avoid hammering the server.
    time.sleep(5)

# newline='' is what the csv module requires to avoid blank rows on Windows;
# an explicit encoding keeps non-ASCII titles and names writable everywhere.
with open('articles.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(zip(title_list, author_list))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment