Created
August 29, 2012 19:21
-
-
Save bernardobarreto/3517519 to your computer and use it in GitHub Desktop.
a crawler to help abed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io

from splinter import Browser
class Abed(object):
    """Crawler that collects given names from dicionariodenomesproprios.com.br.

    Calling :meth:`extract` walks the paginated name listing for one sex and
    writes every name found to ``nomes_<sex>.txt`` in the current working
    directory, one UTF-8 encoded name per line.
    """

    def save_name(self, name):
        """Append *name* followed by a newline to the currently open file.

        *name* may be text or UTF-8 encoded bytes; the file handle opened by
        :meth:`open` performs the UTF-8 encoding on write.
        """
        if isinstance(name, bytes):
            name = name.decode('utf-8')
        self.open_file.write(name + u'\n')

    def prepare(self):
        """Start the splinter browser used for crawling."""
        self.browser = Browser()

    def kill_browser(self):
        """Shut down the browser started by :meth:`prepare`."""
        self.browser.quit()

    def open(self):
        """Open (truncating) the output file for the current ``self.sex``."""
        # io.open with an explicit encoding behaves the same on Python 2 and
        # Python 3 and yields the same UTF-8 bytes on disk as encoding each
        # name by hand before writing.
        self.open_file = io.open(
            'nomes_%s.txt' % self.sex, 'w', encoding='utf-8')

    def close(self):
        """Close the output file opened by :meth:`open`."""
        self.open_file.close()

    def how_many_pages(self):
        """Return the hard-coded page count for the current ``self.sex``.

        ``'femininos'`` maps to 129; any other value maps to 178.
        """
        return 129 if self.sex == 'femininos' else 178

    def extract(self, sex='masculinos'):
        """Crawl the name listing for *sex* and save every name to disk.

        *sex* is interpolated into both the listing URL and the output file
        name. The output file and the browser are always released, even when
        a page visit or element lookup raises.
        """
        self.sex = sex
        self.open()
        try:
            self.prepare()
            try:
                url = ('http://www.dicionariodenomesproprios.com.br/nomes-%s/'
                       % self.sex)
                pages_num = self.how_many_pages()
                # NOTE(review): range() excludes its upper bound, so the page
                # numbered ``pages_num`` itself is never visited -- confirm
                # whether how_many_pages() is meant to be an exclusive bound.
                for page in range(1, pages_num):
                    self.browser.visit(url + str(page))
                    for elem in self.browser.find_by_css('.box-list dt'):
                        self.save_name(elem.text)
            finally:
                self.kill_browser()
        finally:
            self.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment