Skip to content

Instantly share code, notes, and snippets.

@tkovs
Last active March 26, 2017 22:01
Show Gist options
  • Save tkovs/58a93c4c5fc06968e8d41d247a229683 to your computer and use it in GitHub Desktop.
import scrapy
class ranking(scrapy.Spider):
    """Spider that logs in to yugiohrpgonline.com.br and then requests the
    weekly clan ranking page.

    The original paste had lost all indentation (a SyntaxError as posted);
    this restores the intended structure.
    """

    name = 'ranking'
    allowed_domains = ['yugiohrpgonline.com.br']
    start_urls = ['https://www.yugiohrpgonline.com.br/clan/SET/ranking/week/']

    def parse(self, response):
        """Handle the initial response (a login page) by submitting the
        login form and continuing in :meth:`after_login`."""
        formdata = {
            'login': 'user',
            'password': 'pass',
            # NOTE(review): an empty reCAPTCHA token is very likely why the
            # login below fails — the server presumably rejects it; verify.
            'g-recaptcha-response': ''
        }
        # Logs "Yu-Gi-Oh RPG Online - Efetuar Login" (the login page title).
        self.logger.warning(response.xpath('//title/text()').extract_first())
        return scrapy.FormRequest.from_response(
            response,
            formnumber=0,
            formdata=formdata,
            callback=self.after_login
        )

    def after_login(self, response):
        """Log the post-login page title.

        Expected "Yu-Gi-Oh RPG Online - Ranking SET", but the author reports
        it logs the same login-page title as before — i.e. the login did not
        succeed (see the reCAPTCHA note in ``parse``).
        """
        self.logger.warning(response.xpath('//title/text()').extract_first())
        return
@kaihami
Copy link

kaihami commented Mar 26, 2017

from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class MySpider(InitSpider):
    """Login-first spider: authenticate via ``login_page`` before crawling
    ``start_urls``.

    NOTE(review): ``scrapy.contrib.spiders.init`` / ``scrapy.contrib.*``
    import paths are deprecated in modern Scrapy (moved under ``scrapy.*``);
    kept here to match the file's existing imports.
    """

    name = 'ranking'
    allowed_domains = ['yugiohrpgonline.com.br']
    login_page = 'http://www.domain.com/login'
    start_urls = ['https://www.yugiohrpgonline.com.br/clan/SET/ranking/week/']

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
                    formdata={'login': 'user', 'password': 'pass'},
                    callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        # response.body is bytes under Python 3, so the marker must be a
        # bytes literal (a plain str would raise TypeError on `in`).
        if b"bla bla bla" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            # Now the crawling can begin..
            self.initialized()
        else:
            self.log("Bad times :(")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse_item(self, response):
        """Scrape data from a crawled page (placeholder)."""
        # The original had an empty body (a SyntaxError); `pass` keeps the
        # example syntactically valid until real extraction code is added.
        pass

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment