-
-
Save denten/d898a48bcf2a496c7c07 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ipdb import set_trace as _BREAK | |
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.selector import HtmlXPathSelector | |
from scrapy.http import Request | |
from mobileread.items import PageItem, ThreadItem | |
class ForumSpider(CrawlSpider):
    """Spider that scrapes per-post metadata from MobileRead forum threads.

    Starting from a forum listing page, it follows the links found inside
    the ``threadlink condensed`` divs and extracts one ``ThreadItem`` per
    post (user name, thread id, post id) from every page it reaches.
    """

    name = 'mobileread'
    allowed_domains = ['www.mobileread.com']
    # testing a page with 9 threads
    start_urls = ['http://www.mobileread.com/forums/forumdisplay.php?f=230&order=desc&page=2']

    # Only links located inside the "threadlink condensed" divs of the
    # listing page are followed; each fetched page goes to parse_threads.
    # %timeit to time xpath selection--this was the fastest
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths="//div[@class='threadlink condensed']"), callback='parse_threads'),
    )

    def parse_threads(self, response):
        """Extract one ``ThreadItem`` per post found in ``response``.

        Posts without a user name link, and posts whose user name contains
        ``'Advertisement'``, are skipped.

        Returns a list of ``ThreadItem`` objects with ``user_name``,
        ``thread_id`` and ``post_id`` populated. ``thread_id`` is ``None``
        when the page has no ``searchthreadid`` input.
        """
        thread = HtmlXPathSelector(response)

        # The thread id is a page-level value, so look it up once instead
        # of once per post.
        thread_ids = thread.select("//input[@name='searchthreadid']/@value").extract()
        thread_id = thread_ids[0].strip() if thread_ids else None

        # Iterate over the post tables themselves (not their children) so
        # that each post yields exactly one item.
        posts = thread.select("//div[@id='posts']//table[contains(@id,'post')]")

        plist = []
        for post in posts:
            # The selecting predicate guarantees the id contains "post",
            # so the split always produces at least two parts.
            post_id = post.select("@id").extract()[0].split("post")[1]

            # Relative path (leading '.') keeps the lookup inside this
            # post; the post id is interpolated into the expression rather
            # than left as literal text inside the XPath string.
            names = post.select(
                ".//div[contains(@id, '%s')]/a[@class='bigusername']/text()" % post_id
            ).extract()
            if not names:
                # No user name link in this post; nothing to record.
                continue

            user_name = names[0].strip()
            # skip adverts
            if 'Advertisement' in user_name:
                continue

            table = ThreadItem()
            table['user_name'] = user_name
            table['thread_id'] = thread_id
            table['post_id'] = post_id
            plist.append(table)

        return plist
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment