Created
July 16, 2017 18:48
-
-
Save JoaoGFarias/b120c05c98a79cd3a6360f52ca3e9bdd to your computer and use it in GitHub Desktop.
ScriptLattes Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.common.exceptions import NoSuchElementException | |
import csv | |
import re | |
import string | |
import yaml | |
def save_new_authors(authors, file_title='new_authors_list.txt'):
    """Write the author names to ``file_title``, one per line, in sorted order.

    Args:
        authors: Iterable of author names. It is left unmodified.
        file_title: Path of the text file to create or overwrite.
    """
    # sorted() builds a new list, so the caller's collection keeps its own
    # order and any iterable (list, set, ...) is accepted.
    with open(file_title, mode='w') as file_handler:
        file_handler.writelines(
            "{}\n".format(item) for item in sorted(authors)
        )
def save_authors(authors, file_title='new_authors.yml'):
    """Serialize ``authors`` as YAML into ``file_title``.

    The file is opened in ``'w+'`` mode, so any previous content is
    replaced.
    """
    with open(file_title, mode='w+') as yaml_file:
        yaml.dump(authors, stream=yaml_file)
def save_papers(data, new_iteration_file='new_iteration.yml'):
    """Serialize ``data`` as YAML into ``new_iteration_file``.

    The file is opened in ``'w+'`` mode, so any previous content is
    replaced.
    """
    with open(new_iteration_file, mode='w+') as yaml_file:
        yaml.dump(data, stream=yaml_file)
def clean_string(text):
    """Normalize ``text`` so that titles can be compared.

    Every character in ``string.punctuation`` is removed, each run of
    spaces is collapsed into a single space, and the result is
    lower-cased. Leading and trailing spaces are not stripped.
    """
    # A deletion-only translation table drops all punctuation in one pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(drop_punctuation)
    single_spaced = re.sub(' +', ' ', without_punctuation)
    return single_spaced.lower()
def should_paper_pass(title, paper_list):
    """Ask the user whether the paper with this title should be kept.

    Args:
        title: Raw paper title; it is normalized with ``clean_string``
            before being compared and displayed.
        paper_list: Collection of already-accepted normalized titles.

    Returns:
        ``False`` without prompting when the normalized title is already
        in ``paper_list``; otherwise ``True`` only if the user answers
        ``y`` (case-insensitive, surrounding whitespace ignored).
    """
    normalized_title = clean_string(title)
    if normalized_title in paper_list:
        return False
    response = input('Paper title: ' + normalized_title + '\nResponse (y/n):')
    # Tolerate "Y" or a stray space so an intended "yes" is not silently
    # treated as a rejection.
    return response.strip().lower() == 'y'
def get_authors(file='all_authors.txt'):
    """Return the set of author names stored in ``file``.

    The file is parsed as CSV with ``;`` as delimiter and ``|`` as quote
    character; the first field of every non-empty row is taken as the
    author name.

    Args:
        file: Path of the authors file.

    Returns:
        A set of names, or an empty set when the file does not exist.
    """
    try:
        with open(file, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=';', quotechar='|')
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            return {row[0] for row in reader if row}
    except FileNotFoundError:
        return set()
def _read_work(work):
    """Return ``(title, doi, authors_text)`` for one table row.

    Returns ``None`` when the row lacks the title, the DOI link or the
    authors element, so callers can skip rows that are not papers.
    """
    try:
        title = work.find_element_by_css_selector('td:nth-of-type(2) b').text
        doi = work.find_element_by_css_selector(
            'td:nth-of-type(2) a').get_attribute("href")
        authors = work.find_element_by_css_selector(
            'td:nth-of-type(2) div').text
    except NoSuchElementException:
        return None
    return title, doi, authors


def loop_papers(driver, links):
    """Collect papers and their authors from the given report pages.

    Every ``tr`` of every page in ``links`` is inspected. Rows without a
    title, DOI link or authors element are skipped, as are titles that
    were already accepted. For each remaining row the user is asked
    (through ``should_paper_pass``) whether to keep the paper.

    The accepted papers are written with ``save_papers``, the authors not
    present in ``all_authors.txt`` with ``save_new_authors``, and those
    same authors together with their papers with ``save_authors``.

    Args:
        driver: Selenium webdriver used to load the pages.
        links: Iterable of page URLs.

    Returns:
        The ``driver`` that was passed in.
    """
    all_authors = get_authors('all_authors.txt')
    seen_titles = set()
    new_papers = {'papers': []}
    found_authors = set()

    for link in links:
        driver.get(link)
        for work in driver.find_elements_by_css_selector('tr'):
            details = _read_work(work)
            if details is None:
                continue
            title, doi, authors = details

            # Escaping repeated works
            cleaned_title = clean_string(title)
            if cleaned_title in seen_titles:
                continue
            if not should_paper_pass(title, seen_titles):
                continue

            seen_titles.add(cleaned_title)
            # Drop blank entries produced by trailing or doubled ';'.
            work_authors = [
                name.strip() for name in authors.split(';') if name.strip()
            ]
            new_papers['papers'].append({
                'status': "",
                'title': cleaned_title,
                'doi': doi,
                'authors': work_authors,
            })
            found_authors.update(work_authors)

    # Excluding already analyzed authors from the list of new authors.
    # Sorting keeps the generated files stable between runs.
    new_authors = sorted(found_authors.difference(all_authors))
    authors_list = {
        'authors': [{'name': name, 'papers': []} for name in new_authors]
    }
    save_new_authors(new_authors)

    for paper in new_papers['papers']:
        paper_authors = set(paper['authors'])
        for author in authors_list['authors']:
            if author['name'] in paper_authors:
                author['papers'].append(
                    {'title': paper['title'], 'doi': paper['doi']})

    save_papers(new_papers)
    save_authors(authors_list)
    return driver
def start(links, path_to_phantomJs_executable):
    """Run the paper collection over ``links`` with a PhantomJS driver.

    Args:
        links: Iterable of report page URLs passed on to ``loop_papers``.
        path_to_phantomJs_executable: Path of the PhantomJS executable.
    """
    driver = webdriver.PhantomJS(executable_path=path_to_phantomJs_executable)
    try:
        driver = loop_papers(driver, links)
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # PhantomJS process is left behind.
        driver.quit()
# Fill in the links for the ScriptLattes report pages where the papers are displayed
links = ['']
# Fill in the path for the PhantomJS executable on your machine
path_to_phantomJs_executable = ''

# Only start scraping when this file is executed as a script, so that
# importing it does not launch the browser.
if __name__ == '__main__':
    start(links, path_to_phantomJs_executable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ScriptLattes curricula: Download ZIP: