# JoaoGFarias/select_works.py

## select_works.py
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import csv
import re
import string
import yaml

def save_new_authors(authors, file_title='new_authors_list.txt'):
    """Write the author names to a text file, one per line, in sorted order.

    Args:
        authors: Iterable of author names. It is left unmodified.
        file_title: Path of the file to create or overwrite.
    """
    # sorted() builds a new list, so the caller's sequence keeps its order and
    # any iterable (for example a set) is accepted, not only lists.
    ordered = sorted(authors)
    with open(file_title, mode='w+') as file_handler:
        file_handler.writelines("{}\n".format(item) for item in ordered)

def save_authors(authors, file_title='new_authors.yml'):
    """Serialize the authors structure to a YAML file.

    Args:
        authors: Data handed unchanged to ``yaml.dump``.
        file_title: Destination path, opened with mode ``w+``.
    """
    with open(file_title, mode='w+') as yaml_out:
        yaml.dump(authors, stream=yaml_out)

def save_papers(data, new_iteration_file='new_iteration.yml'):
    """Serialize the papers structure to a YAML file.

    Args:
        data: Data handed unchanged to ``yaml.dump``.
        new_iteration_file: Destination path, opened with mode ``w+``.
    """
    with open(new_iteration_file, mode='w+') as yaml_out:
        yaml.dump(data, stream=yaml_out)


def clean_string(text):
    """Return *text* normalized for title comparison.

    Characters in ``string.punctuation`` are removed, runs of spaces are
    collapsed to a single space, and the result is lower-cased.
    """
    punctuation = frozenset(string.punctuation)
    kept = [char for char in text if char not in punctuation]
    single_spaced = re.sub(' +', ' ', ''.join(kept))
    return single_spaced.lower()

def should_paper_pass(title, paper_list):
    """Ask the user whether a paper should be included.

    Args:
        title: Raw paper title; it is normalized with ``clean_string``.
        paper_list: Collection of normalized titles that were already accepted.

    Returns:
        False, without prompting, when the normalized title is already in
        ``paper_list``; otherwise True only if the user answers "y".
    """
    # Normalize once and reuse the result for both the lookup and the prompt.
    title = clean_string(title)

    if title in paper_list:
        return False

    response = input('Paper title: ' + title + '\nResponse (y/n):')
    # Tolerate stray whitespace and an upper-case "Y" so a slip at the
    # keyboard does not silently reject the paper.
    return response.strip().lower() == 'y'

def get_authors(file='all_authors.txt'):
    """Load the set of author names stored in a ``;``-separated file.

    The file is parsed with ``csv.reader`` using ``;`` as delimiter and ``|``
    as quote character; only the first field of each row is kept.

    Args:
        file: Path of the authors file.

    Returns:
        A set with the first field of every non-empty row, or an empty set
        when the file does not exist.
    """
    try:
        with open(file, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=';', quotechar='|')
            # csv.reader yields [] for blank lines; skip them so that row[0]
            # cannot raise IndexError.
            return {row[0] for row in reader if row}
    except FileNotFoundError:
        return set()

def loop_papers(driver, links):
    """Visit each report page, ask the user about every paper and save results.

    For every ``tr`` element of every page, the title, the link and the author
    text are read from the second cell. Rows without a title or without a link
    are skipped, as are titles already accepted during this run. The user is
    asked about the remaining papers through ``should_paper_pass``.

    Accepted papers are written with ``save_papers``. Authors of accepted
    papers that are not returned by ``get_authors('all_authors.txt')`` are
    written with ``save_new_authors`` and, together with their papers, with
    ``save_authors``.

    Args:
        driver: Selenium WebDriver used to load and query the pages.
        links: Iterable of page URLs.

    Returns:
        The ``driver`` that was passed in.
    """
    all_authors = get_authors('all_authors.txt')
    new_authors = []
    # Normalized titles accepted so far; a set keeps the duplicate check O(1).
    paper_list = set()
    new_papers = {'papers': []}
    for link in links:
        driver.get(link)
        for work in driver.find_elements_by_css_selector('tr'):
            try:
                title = work.find_element_by_css_selector('td:nth-of-type(2) b').text
            except NoSuchElementException:
                # A row without a title is not a paper row; skip it instead of
                # aborting the run and losing the answers given so far.
                continue

            clean_title = clean_string(title)

            # Escaping repeated works
            if clean_title in paper_list:
                continue

            try:
                doi = work.find_element_by_css_selector('td:nth-of-type(2) a').get_attribute("href")
            except NoSuchElementException:
                continue

            authors = work.find_element_by_css_selector('td:nth-of-type(2) div').text

            if should_paper_pass(title, paper_list):
                paper_list.add(clean_title)
                # Drop empty names produced by a trailing or doubled ';'.
                work_authors = [name.strip() for name in authors.split(';') if name.strip()]
                new_papers['papers'].append({
                    'status': "",
                    'title': clean_title,
                    'doi': doi,
                    'authors': work_authors})
                new_authors.extend(work_authors)

    # Excluding already analyzed authors from the list of new authors
    new_authors = set(new_authors).difference(all_authors)

    # Sorted so the generated YAML is stable between runs (set iteration order
    # is arbitrary); the dict gives O(1) access to each author's entry.
    authors_by_name = {name: {'name': name, 'papers': []} for name in sorted(new_authors)}
    authors_list = {'authors': list(authors_by_name.values())}

    save_new_authors(list(new_authors))

    for paper in new_papers['papers']:
        # dict.fromkeys de-duplicates names while keeping their order, so a
        # paper is attached to each of its authors at most once.
        for name in dict.fromkeys(paper['authors']):
            entry = authors_by_name.get(name)
            if entry is not None:
                entry['papers'].append({'title': paper['title'], 'doi': paper['doi']})

    save_papers(new_papers)
    save_authors(authors_list)
    return driver

def start(links, path_to_phantomJs_executable):
    """Launch PhantomJS, process the report pages and shut the browser down.

    Args:
        links: Page URLs forwarded to ``loop_papers``.
        path_to_phantomJs_executable: Path passed to ``webdriver.PhantomJS``
            as ``executable_path``.
    """
    driver = webdriver.PhantomJS(executable_path=path_to_phantomJs_executable)
    try:
        loop_papers(driver, links)
    finally:
        # Always terminate the browser process, even if scraping fails midway.
        driver.quit()

# Configuration: fill in the URLs of the ScriptLattes report pages where the
# papers are displayed. Each entry is loaded with driver.get() in loop_papers.
links = ['']

# Configuration: fill in the path to the PhantomJS executable on your machine;
# it is passed to webdriver.PhantomJS as executable_path.
path_to_phantomJs_executable = ''

# NOTE: there is no `if __name__ == "__main__"` guard, so this call also runs
# when the module is imported.
start(links, path_to_phantomJs_executable)
	from selenium import webdriver
	from selenium.common.exceptions import NoSuchElementException
	import csv
	import re
	import string
	import yaml

	def save_new_authors(authors, file_title='new_authors_list.txt'):
	    """Write the author names to a text file, one per line, in sorted order.

	    Args:
	        authors: Iterable of author names. It is left unmodified.
	        file_title: Path of the file to create or overwrite.
	    """
	    # sorted() builds a new list, so the caller's sequence keeps its order
	    # and any iterable (for example a set) is accepted, not only lists.
	    ordered = sorted(authors)
	    with open(file_title, mode='w+') as file_handler:
	        file_handler.writelines("{}\n".format(item) for item in ordered)

	def save_authors(authors, file_title='new_authors.yml'):
	    """Serialize the authors structure to a YAML file.

	    Args:
	        authors: Data handed unchanged to ``yaml.dump``.
	        file_title: Destination path, opened with mode ``w+``.
	    """
	    with open(file_title, mode='w+') as file_handler:
	        yaml.dump(authors, file_handler)

	def save_papers(data, new_iteration_file='new_iteration.yml'):
	    """Serialize the papers structure to a YAML file.

	    Args:
	        data: Data handed unchanged to ``yaml.dump``.
	        new_iteration_file: Destination path, opened with mode ``w+``.
	    """
	    with open(new_iteration_file, mode='w+') as file_handler:
	        yaml.dump(data, file_handler)


	def clean_string(text):
	    """Return *text* normalized for title comparison.

	    Characters in ``string.punctuation`` are removed, runs of spaces are
	    collapsed to a single space, and the result is lower-cased.
	    """
	    exclude = set(string.punctuation)
	    without_punctuation = ''.join(ch for ch in text if ch not in exclude)
	    return re.sub(' +', ' ', without_punctuation).lower()

	def should_paper_pass(title, paper_list):
	    """Ask the user whether a paper should be included.

	    Args:
	        title: Raw paper title; it is normalized with ``clean_string``.
	        paper_list: Collection of normalized titles already accepted.

	    Returns:
	        False, without prompting, when the normalized title is already in
	        ``paper_list``; otherwise True only if the user answers "y".
	    """
	    # Normalize once and reuse the result for the lookup and the prompt.
	    title = clean_string(title)

	    if title in paper_list:
	        return False

	    response = input('Paper title: ' + title + '\nResponse (y/n):')
	    # Tolerate stray whitespace and an upper-case "Y" so a slip at the
	    # keyboard does not silently reject the paper.
	    return response.strip().lower() == 'y'

	def get_authors(file='all_authors.txt'):
	    """Load the set of author names stored in a ``;``-separated file.

	    The file is parsed with ``csv.reader`` using ``;`` as delimiter and
	    ``|`` as quote character; only the first field of each row is kept.

	    Args:
	        file: Path of the authors file.

	    Returns:
	        A set with the first field of every non-empty row, or an empty
	        set when the file does not exist.
	    """
	    try:
	        with open(file, newline='') as csvfile:
	            # quotechar must be exactly one character for csv.reader.
	            reader = csv.reader(csvfile, delimiter=';', quotechar='|')
	            # csv.reader yields [] for blank lines; skip them so that
	            # row[0] cannot raise IndexError.
	            return {row[0] for row in reader if row}
	    except FileNotFoundError:
	        return set()

	def loop_papers(driver, links):
	    """Visit each report page, ask the user about every paper and save results.

	    For every ``tr`` element of every page, the title, the link and the
	    author text are read from the second cell. Rows without a title or
	    without a link are skipped, as are titles already accepted during this
	    run. The user is asked about the rest through ``should_paper_pass``.

	    Accepted papers are written with ``save_papers``. Authors of accepted
	    papers that are not returned by ``get_authors('all_authors.txt')`` are
	    written with ``save_new_authors`` and, together with their papers,
	    with ``save_authors``.

	    Args:
	        driver: Selenium WebDriver used to load and query the pages.
	        links: Iterable of page URLs.

	    Returns:
	        The ``driver`` that was passed in.
	    """
	    all_authors = get_authors('all_authors.txt')
	    new_authors = []
	    # Normalized titles accepted so far; a set keeps the check O(1).
	    paper_list = set()
	    new_papers = {'papers': []}
	    for link in links:
	        driver.get(link)
	        for work in driver.find_elements_by_css_selector('tr'):
	            try:
	                title = work.find_element_by_css_selector('td:nth-of-type(2) b').text
	            except NoSuchElementException:
	                # A row without a title is not a paper row; skip it instead
	                # of aborting the run and losing the answers given so far.
	                continue

	            clean_title = clean_string(title)

	            # Escaping repeated works
	            if clean_title in paper_list:
	                continue

	            try:
	                doi = work.find_element_by_css_selector('td:nth-of-type(2) a').get_attribute("href")
	            except NoSuchElementException:
	                continue

	            authors = work.find_element_by_css_selector('td:nth-of-type(2) div').text

	            if should_paper_pass(title, paper_list):
	                paper_list.add(clean_title)
	                # Drop empty names produced by a trailing or doubled ';'.
	                work_authors = [name.strip() for name in authors.split(';') if name.strip()]
	                new_papers['papers'].append({
	                    'status': "",
	                    'title': clean_title,
	                    'doi': doi,
	                    'authors': work_authors})
	                new_authors.extend(work_authors)

	    # Excluding already analyzed authors from the list of new authors
	    new_authors = set(new_authors).difference(all_authors)

	    # Sorted so the generated YAML is stable between runs (set iteration
	    # order is arbitrary); the dict gives O(1) access to each entry.
	    authors_by_name = {name: {'name': name, 'papers': []} for name in sorted(new_authors)}
	    authors_list = {'authors': list(authors_by_name.values())}

	    save_new_authors(list(new_authors))

	    for paper in new_papers['papers']:
	        # dict.fromkeys de-duplicates names while keeping their order, so
	        # a paper is attached to each of its authors at most once.
	        for name in dict.fromkeys(paper['authors']):
	            entry = authors_by_name.get(name)
	            if entry is not None:
	                entry['papers'].append({'title': paper['title'], 'doi': paper['doi']})

	    save_papers(new_papers)
	    save_authors(authors_list)
	    return driver

	def start(links, path_to_phantomJs_executable):
	    """Launch PhantomJS, process the report pages and shut the browser down.

	    Args:
	        links: Page URLs forwarded to ``loop_papers``.
	        path_to_phantomJs_executable: Path passed to
	            ``webdriver.PhantomJS`` as ``executable_path``.
	    """
	    driver = webdriver.PhantomJS(executable_path=path_to_phantomJs_executable)
	    try:
	        loop_papers(driver, links)
	    finally:
	        # Always terminate the browser process, even if scraping fails.
	        driver.quit()

	# Configuration: fill in the URLs of the ScriptLattes report pages where the
	# papers are displayed. Each entry is loaded with driver.get() in loop_papers.
	links = ['']

	# Configuration: fill in the path to the PhantomJS executable on your machine;
	# it is passed to webdriver.PhantomJS as executable_path.
	path_to_phantomJs_executable = ''

	# NOTE: there is no `if __name__ == "__main__"` guard, so this call also runs
	# when the module is imported.
	start(links, path_to_phantomJs_executable)