Skip to content

Instantly share code, notes, and snippets.

@rdesfo
Created May 5, 2016 15:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rdesfo/44fda923f75fca4b050f73a1b210a20c to your computer and use it in GitHub Desktop.
Save rdesfo/44fda923f75fca4b050f73a1b210a20c to your computer and use it in GitHub Desktop.
scrapy bath restaurants
import scrapy
class TripadvisorSpider(scrapy.Spider):
    """Crawl the TripAdvisor restaurant listing for Bath, Maine.

    ``parse`` walks the paginated listing and schedules one request per
    restaurant link; ``parse_restaurant`` turns each restaurant page into
    an item with the keys ``name``, ``rating``, ``latitude``,
    ``longitude`` and ``url``.
    """

    name = 'tripadvisor'
    # Scrapy's offsite filtering reads ``allowed_domains``; the previous
    # spelling ``allow_domains`` was ignored, so no domain restriction
    # was actually in effect.
    allowed_domains = ['tripadvisor.com']
    # Old (misspelled) attribute kept as an alias in case anything
    # external still reads it.
    allow_domains = allowed_domains
    start_urls = [
        'http://www.tripadvisor.com/Restaurants-g40505-Bath_Maine.html'
    ]
    custom_settings = {
        # Fixes the column order of the exported feed.
        'FEED_EXPORT_FIELDS': ["name", "rating", "latitude", "longitude", "url"]
    }

    def parse(self, response):
        """Yield a request per restaurant on a listing page, then paginate.

        Args:
            response: The Scrapy response for a restaurant listing page.

        Yields:
            ``scrapy.Request`` objects: one per restaurant link (handled
            by ``parse_restaurant``) and, when a "Next" link is present,
            one for the following listing page (handled by ``parse``).
        """
        restaurant_links = response.xpath(
            '//h3[@class="title"]/a/@href'
        ).extract()
        for link in restaurant_links:
            # hrefs may be relative; resolve them against the page URL.
            yield scrapy.Request(
                response.urljoin(link), callback=self.parse_restaurant
            )

        # Follow pagination. extract_first() returns None when there is no
        # "Next" link (e.g. on the last page); in that case there is
        # nothing to follow, so do not build a request from None.
        next_page_url = response.xpath(
            '//a[text()="Next"]/@href'
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(
                response.urljoin(next_page_url), callback=self.parse
            )

    def parse_restaurant(self, response):
        """Extract a single restaurant item from its detail page.

        Args:
            response: The Scrapy response for a restaurant detail page.

        Yields:
            A dict with keys ``name``, ``rating``, ``latitude``,
            ``longitude`` (each the list of strings matched by the
            corresponding XPath, possibly empty) and ``url`` (the
            response URL).
        """
        # name/latitude/longitude are all read from data-* attributes on
        # the same map container element.
        map_container = '//div[@class="mapContainer"]'

        yield {
            'name': response.xpath(map_container + '/@data-name').extract(),
            'rating': response.xpath(
                '//img[@property="ratingValue"]/@content'
            ).extract(),
            'latitude': response.xpath(map_container + '/@data-lat').extract(),
            'longitude': response.xpath(map_container + '/@data-lng').extract(),
            'url': response.url,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment