@dkhurana1306 — created August 26, 2016
import scrapy
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"

    # GoFundMe category listing pages to crawl; commented-out entries were excluded.
    start_urls = [
        #"https://www.gofundme.com/Competing-in-Rio/",
        "https://www.gofundme.com/Medical-Illness-Healing/",
        "https://www.gofundme.com/Volunteer-Service/",
        "https://www.gofundme.com/Education-Schools-Learning/",
        "https://www.gofundme.com/Accidents-Personal-Crisis/",
        "https://www.gofundme.com/Funerals-Memorials-Tributes/",
        "https://www.gofundme.com/Sports-Teams-Clubs/",
        "https://www.gofundme.com/Animals-Pets/",
        "https://www.gofundme.com/Non-Profits-Charities/",
        #"https://www.gofundme.com/Weddings-Honeymoons/"
    ]

    # Also queue pages 2..(npages + 1) of each category.
    npages = 10
    for i in range(npages):
        #start_urls.append("https://www.gofundme.com/Competing-in-Rio?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Medical-Illness-Healing?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Volunteer-Service?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Education-Schools-Learning?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Accidents-Personal-Crisis?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Funerals-Memorials-Tributes?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Sports-Teams-Clubs?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Animals-Pets?page=" + str(i + 2))
        start_urls.append("https://www.gofundme.com/Non-Profits-Charities?page=" + str(i + 2))
        #start_urls.append("https://www.gofundme.com/Weddings-Honeymoons?page=" + str(i + 2))

    def parse(self, response):
        # Each category listing page links to individual campaign pages;
        # follow every link after the first and parse each campaign.
        for href in response.xpath('//html/body/div[5]/div[2]/div/a/@href')[1:]:
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # Extract the campaign details from a single GoFundMe page.
        item = DmozItem()
        item['title'] = response.xpath('//html/body/div[2]/div[1]/h1/text()').extract()
        item['target'] = response.xpath('//html/body/div[2]/div[2]/div[2]/h2/span/text()').extract()
        item['current'] = response.xpath('//html/body/div[2]/div[1]/div[4]/h2/strong/text()').extract()
        item['people'] = response.xpath('//html/body/div[2]/div[1]/div[4]/div[2]/span[1]/text()').extract()
        item['date'] = response.xpath('//html/body/div[2]/div[2]/div[3]/text()').extract()
        #item['name'] = response.xpath('//*[@id="story"]/div[1]/div[2]/text()').extract()
        item['location'] = response.xpath('//html/body/div[2]/div[2]/div[5]/div[2]/a[2]/text()').extract()
        item['shares'] = response.xpath('//html/body/div[2]/div[1]/div[5]/div[1]/div[3]/div/strong/text()').extract()
        #item['story'] = response.xpath('//*[@id="story"]/div[3]/text()').extract()
        item['category'] = response.xpath('//html/body/div[2]/div[2]/div[5]/div[2]/a[1]/span/text()').extract()
        print(item)
        yield item
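
The spider imports DmozItem from tutorial.items, which is not included in this gist. A minimal sketch of what that items.py could look like is below, assuming one field per key assigned in parse_dir_contents; the file and its contents are an assumption, not part of the original gist.

# tutorial/items.py -- assumed definition, not part of the original gist.
import scrapy


class DmozItem(scrapy.Item):
    # One field per key populated by DmozSpider.parse_dir_contents.
    title = scrapy.Field()
    target = scrapy.Field()
    current = scrapy.Field()
    people = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    shares = scrapy.Field()
    category = scrapy.Field()
    # Fields for the commented-out extractions, if they are re-enabled.
    name = scrapy.Field()
    story = scrapy.Field()

Assuming a standard Scrapy project named tutorial, the spider can then be run with, for example, scrapy crawl dmoz -o campaigns.json to write the scraped items to a JSON file. Note that the absolute //html/body/... XPaths are tied to the GoFundMe page layout as of 2016 and will stop matching if the site's markup changes.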