Skip to content

Instantly share code, notes, and snippets.

@seagatesoft
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seagatesoft/26ebc1b5f97887aa549c to your computer and use it in GitHub Desktop.
Save seagatesoft/26ebc1b5f97887aa549c to your computer and use it in GitHub Desktop.
A simplified example of a JavaScript challenge page and how the spider handles it
<!-- Simplified challenge page: when the body loads, challenge() computes an
     answer in JavaScript, writes it into the hidden form field and submits
     the form to /answer-challenge via POST. -->
<html>
<head>
<meta http-equiv="Pragma" content="no-cache"/>
<meta http-equiv="Expires" content="-1"/>
<meta http-equiv="CacheControl" content="no-cache"/>
<script type="text/javascript">
// Computes the challenge answer and auto-submits it; invoked from <body onload>.
function challenge() {
var x = 17;
var y = 25;
// The answer is the product of the two operands plus a constant.
var challenge_answer = (x * y) + 9;
// Put the answer into the first element of the first form (the hidden input).
document.forms[0].elements[0].value = challenge_answer;
document.forms[0].submit();
}
</script>
<noscript>
Please enable JavaScript to view the page content.
</noscript>
</head>
<body onload="challenge()">
<form method="POST" action="/answer-challenge">
<input type="hidden" name="challenge_answer" value="0"/>
</form>
</body>
</html>
import re
import urllib
import urlparse
from scrapy.http import Request
from scrapy.spider import Spider
class ProjectSpider(Spider):
    """Spider that answers a JavaScript arithmetic challenge before scraping.

    Some responses are not the requested page but a challenge page whose
    ``<body onload="challenge()">`` runs a script computing ``(x * y) + c``
    and POSTs the result through a hidden form.  This spider detects such
    pages, computes the same answer in Python and submits the form itself.
    """

    name = 'project-website.com'
    # Scrapy refuses URLs without a scheme ("Missing scheme in request url").
    start_urls = ['http://project-website.com']

    # Patterns for the numeric operands embedded in the challenge script;
    # compiled once at class creation instead of on every challenge page.
    _X_RE = re.compile(r'var x = (\d+);')
    _Y_RE = re.compile(r'var y = (\d+);')
    _ADDITION_RE = re.compile(
        r'var challenge_answer = \(x \* y\) \+ (\d+)')

    def parse(self, response):
        """Dispatch a response: solve the challenge or process the page.

        Returns the request that submits the challenge answer when
        ``response`` is a challenge page, otherwise ``None``.
        """
        is_challenge_page = response.xpath(
            '/html/body[@onload="challenge()"]')
        if is_challenge_page:
            return self.parse_challenge_page(response)
        # process requested page here
        return None

    def parse_challenge_page(self, response):
        """Build the POST request that submits the challenge answer.

        Reads the form action, the hidden input name and the script operands
        from ``response``, recomputes ``(x * y) + addition`` and returns a
        form-encoded POST ``Request`` whose response is handled by ``parse``.

        Raises:
            IndexError: if the form action, script or input name is missing.
            ValueError: if the script does not contain an expected operand.
        """
        form_action = response.xpath('/html/body/form/@action').extract()[0]
        form_url = urlparse.urljoin(response.url, form_action)
        input_name = response.xpath(
            '/html/body/form/input/@name').extract()[0]
        script = response.xpath('/html/head/script/text()').extract()[0]

        x = self._first_int(self._X_RE, script)
        y = self._first_int(self._Y_RE, script)
        addition = self._first_int(self._ADDITION_RE, script)
        challenge_answer = (x * y) + addition

        # urlencode stringifies and escapes both key and value; the original
        # urllib.quote() call failed because the answer is an int.
        body = urllib.urlencode({input_name: challenge_answer})
        return Request(
            form_url,
            method='POST',
            # Without this header the server has no way to know the body is
            # a form submission.
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            body=body,
            callback=self.parse,
        )

    @staticmethod
    def _first_int(pattern, text):
        """Return the first captured group of ``pattern`` in ``text`` as int."""
        match = pattern.search(text)
        if match is None:
            raise ValueError(
                'challenge script does not match %r' % pattern.pattern)
        return int(match.group(1))
# Scrapy's COOKIES_ENABLED setting: False turns off the cookies middleware.
# NOTE(review): presumably this line belongs in the project's settings.py,
# not in the spider module — confirm.
COOKIES_ENABLED = False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment