Scraping the Partisan Divide: My Scrapy Spider
# forum/spiders/forum_spider.py
# Crawl plan: a fixed range of thread IDs -> every page of each thread -> every post on each page
from scrapy import Spider, Request
from forum.items import ForumItem


class ForumSpider(Spider):
    name = "forum_spider"
    allowed_domains = ['liberalforum.net']
    start_urls = ['https://liberalforum.net/viewtopic.php?t=37200']

    # request the first page of each thread:
    def parse(self, response):
        # desired start = thread 14493, posted when Trump announced his candidacy
        numThreads = ['https://liberalforum.net/viewtopic.php?t={}'.format(x) for x in range(14493, 37211)]
        for thread in numThreads:
            yield Request(url=thread, callback=self.parse_within_thread)
    # request every page of the current thread (20 posts per page):
    def parse_within_thread(self, response):
        # the pagination control only exists when a thread spans multiple pages,
        # so a missing node means a single-page thread
        maxPage = response.xpath('//*[@id="page-body"]/div[1]/div[1]/div[3]/div[2]/a/strong[2]/text()').extract_first()
        threadMaxPage = 1 if maxPage is None else int(maxPage)
        # build the paginated URLs for this thread only (start=0, 20, 40, ...)
        numPages = ['{}&start={}'.format(response.url, y) for y in range(0, 20 * threadMaxPage, 20)]
        for page in numPages:
            yield Request(url=page, callback=self.parse_posts_within_thread_page)
    # parse the components of each post on the current thread page
    def parse_posts_within_thread_page(self, response):
        # thread-level fields are identical for every post on the page
        threadReplies = response.xpath('//*[@id="page-body"]/div[1]/div[1]/div[6]/div/div/span[1]/text()').extract_first()
        threadTitle = response.xpath('//*[@id="page-body"]/div[1]/h2/a/span/text()').extract_first()
        posts = response.xpath('//div[@class="vtinner"]')
        for post in posts:
            userName = post.xpath('div[3]/div/span/a/text()').extract_first()
            userPosts = post.xpath('div[3]/div[2]/a/text()').extract_first()
            # these two fields carry stray whitespace/newlines in the raw HTML, so strip them
            userPolitics = post.xpath('div[3]/div[2]/a[2]/text()').extract_first()
            if userPolitics is not None:
                userPolitics = userPolitics.strip()
            userMoney = post.xpath('div[3]/div[2]/a[3]/span/text()').extract_first()
            postDateTime = post.xpath('./div[2]/div[2]/span/a/text()').extract_first()
            # a post body spans several text nodes; join and strip them into one string
            postText = ' '.join(t.strip() for t in post.xpath('div[4]/div[@class="content"]/text()').extract())
            item = ForumItem()
            item['threadReplies'] = threadReplies
            item['threadTitle'] = threadTitle
            item['userName'] = userName
            item['userPosts'] = userPosts
            item['userPolitics'] = userPolitics
            item['userMoney'] = userMoney
            item['postDateTime'] = postDateTime
            item['postText'] = postText
            yield item
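To launch the crawl, scrapy crawl forum_spider from the project root is enough. A minimal scripted alternative, assuming the standard project layout so get_project_settings() can find forum/settings.py (run_forum.py is a name introduced here for illustration, not part of the gist):

# run_forum.py -- hypothetical helper script
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads forum/settings.py
process.crawl('forum_spider')                     # looks the spider up by its name attribute
process.start()                                   # blocks until the crawl finishes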
# -*- coding: utf-8 -*-
# forum/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ForumItem(scrapy.Item):
    threadReplies = scrapy.Field()
    threadTitle = scrapy.Field()
    userName = scrapy.Field()
    userPosts = scrapy.Field()
    userPolitics = scrapy.Field()
    userMoney = scrapy.Field()
    postDateTime = scrapy.Field()
    postText = scrapy.Field()
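An alternative to cleaning fields inside the spider is an ItemLoader with input/output processors. A minimal sketch, assuming the Scrapy 1.x vintage these files target (where the processors live under scrapy.loader.processors); ForumItemLoader is a name introduced here for illustration:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from forum.items import ForumItem


class ForumItemLoader(ItemLoader):
    default_item_class = ForumItem
    default_output_processor = TakeFirst()   # most fields are single values
    userPolitics_in = MapCompose(str.strip)  # drop the stray whitespace/newlines
    postText_in = MapCompose(str.strip)
    postText_out = Join(' ')                 # a post body spans several text nodes

In the spider, loader = ForumItemLoader(selector=post) followed by loader.add_xpath('userPolitics', 'div[3]/div[2]/a[2]/text()') and yield loader.load_item() would replace the manual item assembly.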
# -*- coding: utf-8 -*-
# forum/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import CsvItemExporter


class ForumPipeline(object):

    def __init__(self):
        self.filename = 'forum.csv'

    def open_spider(self, spider):
        # CsvItemExporter expects a binary file handle
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
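A hand-rolled CSV pipeline is one option; Scrapy's built-in feed exports do the same job. Either run scrapy crawl forum_spider -o forum.csv, or (in Scrapy 1.x) set the feed in settings.py:

# equivalent built-in feed export, Scrapy 1.x settings
FEED_FORMAT = 'csv'
FEED_URI = 'forum.csv'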
# -*- coding: utf-8 -*-
# forum/settings.py
# Scrapy settings for forum project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'forum'
SPIDER_MODULES = ['forum.spiders']
NEWSPIDER_MODULE = 'forum.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'forum (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'forum.middlewares.ForumSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'forum.middlewares.ForumDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'forum.pipelines.ForumPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
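Given that the crawl spans roughly 22,700 thread IDs, uncommenting the AutoThrottle and HTTP-cache blocks above is worth considering; a sketch of one polite configuration, using only the standard Scrapy settings shown commented out above:

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
HTTPCACHE_ENABLED = True  # repeat runs during development are served from the local cache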