# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin

import dateparser
from scrapy import Request
from scrapy.spiders import CrawlSpider

from ..items import PlaceItem, ReviewItem
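
# The ``..items`` module is not included in this gist. A minimal sketch of
# what it presumably defines, with the field names inferred from the yields
# below (hypothetical, for reference only):
#
#     import scrapy
#
#     class PlaceItem(scrapy.Item):
#         id = scrapy.Field()
#         name = scrapy.Field()
#         address = scrapy.Field()
#         rating = scrapy.Field()
#
#     class ReviewItem(scrapy.Item):
#         id = scrapy.Field()
#         place_id = scrapy.Field()
#         title = scrapy.Field()
#         body = scrapy.Field()
#         date = scrapy.Field()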


class TripAdvisorSpider(CrawlSpider):
    """Spider for the TripAdvisor website."""

    name = 'tripadvisor'
    allowed_domains = ['www.tripadvisor.fr']
    start_urls = (
        'https://www.tripadvisor.fr/Restaurants-g187147-Paris_Ile_de_France.html',
    )

    @staticmethod
    def build_review_full_link(review_id, response):
        """Build the link to the expanded (full-text) version of a single review."""
        # The location id is embedded in the page url as "-d<digits>"
        pattern = re.compile(r'-d(\d+)')
        request_id = re.search(pattern, response.url).group(1)
        review_url = (
            'https://www.tripadvisor.fr/ExpandedUserReviews-d%(request_id)s'
            '?target=%(review_id)s&reviews=%(review_id)s'
            '&servlet=Attraction_Review&expand=0' % {
                'review_id': review_id,
                'request_id': request_id,
            }
        )
        return review_url
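
    # For example, given a response url containing "-d187147" and a
    # review_id of "123" (values hypothetical), the method returns:
    #     https://www.tripadvisor.fr/ExpandedUserReviews-d187147
    #         ?target=123&reviews=123&servlet=Attraction_Review&expand=0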

    def parse(self, response):
        """Crawl a list of places."""
        # For each pagination link, yield a new request back to this callback
        for page_link in response.xpath('//a[contains(@class, "pageNum")]/@href').extract():
            yield Request(
                urljoin(response.url, page_link),
                self.parse
            )
        # For each place in the list, yield a request for its detail page
        for place_link in response.xpath('//h3[@class="title"]/a/@href').extract():
            yield Request(
                urljoin(response.url, place_link),
                self.parse_place
            )

    def parse_place(self, response):
        """Crawl a single place, with its reviews."""
        # Extract the place; its id is embedded in the url as "-d<digits>"
        pattern = re.compile(r'-d(\d+)')
        place_id = re.search(pattern, response.url).group(1)
        place_name = response.xpath('//h1[@id="HEADING"]/text()').extract()[1].strip()
        try:
            address = response.xpath('//span[@class="format_address"]//text()').extract()[0]
        except IndexError:
            address = None
        try:
            rating = response.xpath('//img[contains(@class,"rating_rr_fill")]/@content').extract()[0]
        except IndexError:
            rating = 0
        yield PlaceItem(
            id=place_id,
            name=place_name,
            address=address,
            rating=rating
        )
        # Yield a new request for each page of reviews
        for review_page_link in response.xpath('//div[@class="pageNumbers"]/a/@href').extract():
            yield Request(
                urljoin(response.url, review_page_link),
                callback=self.parse_review_list,
                meta={
                    'place_id': place_id
                }
            )
        # Parse the reviews on the current (first) page as well. The call must
        # be delegated with ``yield from``: calling the generator and discarding
        # its return value would silently drop the items it yields.
        yield from self.parse_review_list(response, place_id)

    def parse_review_list(self, response, place_id=None):
        """Crawl a page of reviews for a place."""
        place_id = place_id or response.meta['place_id']
        # Extract the reviews. If a review is cropped, yield a new request
        # to fetch its full content.
        for review_item in response.xpath('//div[contains(@class,"reviewSelector")]'):
            review_id = review_item.xpath('./@id').extract()[0].replace('review_', '')
            try:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/@title').extract()[0]
                review_date = dateparser.parse(review_date).date()
            except IndexError:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/text()').extract()[0]
                # TODO: Make the line below work
                review_date = dateparser.parse(review_date)
            review_title = review_item.xpath('.//span[contains(@class,"noQuotes")]/text()').extract()[0]
            try:
                review_body = review_item.xpath('.//p[@class="entry"]').extract()[0].strip()
                yield ReviewItem(
                    id=review_id,
                    place_id=place_id,
                    title=review_title,
                    body=review_body,
                    date=review_date
                )
            except IndexError:
                # The review is cropped: fetch the expanded version instead
                review_url = self.build_review_full_link(review_id, response)
                yield Request(
                    review_url,
                    callback=self.parse_review,
                    meta={
                        'place_id': place_id,
                        'review_id': review_id,
                        'review_title': review_title,
                        'review_date': review_date
                    }
                )

    def parse_review(self, response):
        """Crawl a single review."""
        review_body = response.xpath('//div[@class="entry"]/p/text()').extract()[0].strip()
        yield ReviewItem(
            id=response.meta['review_id'],
            place_id=response.meta['place_id'],
            title=response.meta['review_title'],
            date=response.meta['review_date'],
            body=review_body
        )
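
# The spider can be run with the standard Scrapy CLI from the project root,
# e.g. exporting the scraped items to a JSON feed (this assumes the usual
# Scrapy project layout around this file and its ``items`` module):
#
#     scrapy crawl tripadvisor -o tripadvisor.json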