Skip to content

Instantly share code, notes, and snippets.

@JoaoGFarias
Created July 16, 2017 18:48
Show Gist options
  • Save JoaoGFarias/b120c05c98a79cd3a6360f52ca3e9bdd to your computer and use it in GitHub Desktop.
Save JoaoGFarias/b120c05c98a79cd3a6360f52ca3e9bdd to your computer and use it in GitHub Desktop.
ScriptLattes Analysis
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import csv
import re
import string
import yaml
def save_new_authors(authors, file_title='new_authors_list.txt'):
    """Write the author names to a text file, one per line, in sorted order.

    Args:
        authors: Iterable of author names. It is not modified; a sorted
            copy is written, so lists, sets and generators are all accepted.
        file_title: Path of the file to (over)write.
    """
    # sorted() instead of authors.sort(): do not reorder the caller's list
    # as a hidden side effect of saving it.
    with open(file_title, mode='w') as file_handler:
        file_handler.writelines(
            "{}\n".format(item) for item in sorted(authors)
        )
def save_authors(authors, file_title='new_authors.yml'):
    """Dump ``authors`` as YAML into ``file_title``.

    The target file is truncated (or created) before writing.

    Args:
        authors: Object handed to ``yaml.dump`` unchanged.
        file_title: Path of the YAML file to write.
    """
    with open(file_title, mode='w+') as yaml_out:
        yaml.dump(authors, stream=yaml_out)
def save_papers(data, new_iteration_file='new_iteration.yml'):
    """Dump ``data`` as YAML into ``new_iteration_file``.

    The target file is truncated (or created) before writing.

    Args:
        data: Object handed to ``yaml.dump`` unchanged.
        new_iteration_file: Path of the YAML file to write.
    """
    with open(new_iteration_file, mode='w+') as yaml_out:
        yaml.dump(data, stream=yaml_out)
def clean_string(text):
    """Normalise ``text`` so that equal titles compare equal.

    ASCII punctuation (``string.punctuation``) is removed, every run of
    whitespace is collapsed to a single space, leading/trailing whitespace
    is dropped and the result is lower-cased.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    without_punctuation = text.translate(
        str.maketrans('', '', string.punctuation)
    )
    # split()/join collapses *any* whitespace and trims the ends. Removing
    # punctuation can leave a dangling space ("Foo ." -> "Foo "), which
    # would otherwise make the same title look like two different ones.
    return ' '.join(without_punctuation.split()).lower()
def should_paper_pass(title, paper_list):
    """Ask the user whether a paper should be kept.

    Args:
        title: Raw paper title; it is normalised with ``clean_string``
            before being compared and displayed.
        paper_list: Collection of already accepted, normalised titles.

    Returns:
        ``False`` without prompting when the normalised title is already in
        ``paper_list``; otherwise ``True`` only if the user answers "y"
        (case-insensitive, surrounding whitespace ignored).
    """
    title = clean_string(title)
    if title in paper_list:
        return False
    response = input('Paper title: ' + title + '\nResponse (y/n):')
    # Tolerate "Y" or a stray space so a slip of the finger does not
    # silently reject the paper.
    return response.strip().lower() == 'y'
def get_authors(file='all_authors.txt'):
    """Load the set of known author names from a ';'-delimited file.

    Only the first field of every row is used.

    Args:
        file: Path of the file to read.

    Returns:
        A set with the first field of each non-empty row, or an empty set
        when the file does not exist.
    """
    try:
        with open(file, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=';', quotechar='|')
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            return {row[0] for row in reader if row}
    except FileNotFoundError:
        return set()
def loop_papers(driver, links):
    """Scrape papers from the given report pages and save the results.

    Every ``<tr>`` of each page is inspected; the second cell is expected to
    hold the title in a ``<b>``, a link in an ``<a>`` and the ';'-separated
    author names in a ``<div>``. Rows missing any of these are skipped. For
    each title not seen before, the user is asked (``should_paper_pass``)
    whether to keep it.

    Side effects: reads ``all_authors.txt`` through ``get_authors`` and
    writes the results through ``save_new_authors``, ``save_papers`` and
    ``save_authors`` using their default file names.

    NOTE(review): relies on the ``find_element(s)_by_css_selector`` helpers
    of the Selenium version this script was written for; confirm they exist
    in the installed Selenium before running.

    Args:
        driver: Selenium WebDriver used to load the pages.
        links: Iterable of page URLs to visit.

    Returns:
        The same ``driver`` that was passed in.
    """
    all_authors = get_authors('all_authors.txt')
    new_authors = set()
    seen_titles = set()  # normalised titles already accepted
    new_papers = {'papers': []}

    for link in links:
        driver.get(link)
        for work in driver.find_elements_by_css_selector('tr'):
            # A row that lacks the title, the link or the author cell is not
            # a usable paper entry: skip it rather than abort the whole run.
            try:
                title = work.find_element_by_css_selector(
                    'td:nth-of-type(2) b').text
                doi = work.find_element_by_css_selector(
                    'td:nth-of-type(2) a').get_attribute("href")
                authors = work.find_element_by_css_selector(
                    'td:nth-of-type(2) div').text
            except NoSuchElementException:
                continue

            normalized_title = clean_string(title)
            # Escaping repeated works
            if normalized_title in seen_titles:
                continue
            if not should_paper_pass(title, seen_titles):
                continue

            seen_titles.add(normalized_title)
            # Drop empty names produced by a trailing or doubled ';'.
            work_authors = [
                name.strip() for name in authors.split(';') if name.strip()
            ]
            new_papers['papers'].append({
                'status': "",
                'title': normalized_title,
                'doi': doi,
                'authors': work_authors,
            })
            new_authors.update(work_authors)

    # Excluding already analyzed authors from the list of new authors
    new_authors = new_authors.difference(all_authors)
    authors_list = {
        'authors': [{'name': author, 'papers': []} for author in new_authors]
    }
    save_new_authors(list(new_authors))

    # Attach each paper to every new author that signed it; the dict lookup
    # avoids rescanning the whole author list for every paper.
    papers_by_author = {
        entry['name']: entry['papers'] for entry in authors_list['authors']
    }
    for paper in new_papers['papers']:
        # set(): a name repeated within one paper is attached only once.
        for name in set(paper['authors']):
            if name in papers_by_author:
                papers_by_author[name].append(
                    {'title': paper['title'], 'doi': paper['doi']})

    save_papers(new_papers)
    save_authors(authors_list)
    return driver
def start(links, path_to_phantomJs_executable):
    """Launch PhantomJS, process every link and always shut the browser down.

    Args:
        links: Iterable of report page URLs, forwarded to ``loop_papers``.
        path_to_phantomJs_executable: Filesystem path of the PhantomJS binary.
    """
    driver = webdriver.PhantomJS(executable_path=path_to_phantomJs_executable)
    try:
        driver = loop_papers(driver, links)
    finally:
        # Quit even when scraping fails, so no PhantomJS process is orphaned.
        driver.quit()
# Fill in the links for the ScriptLattes report pages where the papers are displayed
links = ['']
# Fill in the path for the PhantomJS executable on your machine
path_to_phantomJs_executable = ''

# Run only when executed as a script, so importing this module (e.g. to reuse
# the helpers) does not launch a browser.
if __name__ == '__main__':
    start(links, path_to_phantomJs_executable)
@JoaoGFarias
Copy link
Author

JoaoGFarias commented Jul 16, 2017

ScriptLattes curricula: Download ZIP:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment