Skip to content

Instantly share code, notes, and snippets.

@clemfromspace
Created May 18, 2016 22:16
Show Gist options
  • Save clemfromspace/74f322298c132ccfd1182a2608091245 to your computer and use it in GitHub Desktop.
Save clemfromspace/74f322298c132ccfd1182a2608091245 to your computer and use it in GitHub Desktop.
import re
from scrapy import Request
from scrapy.spider import CrawlSpider
from urlparse import urljoin
class TripAdvisorSpider(CrawlSpider):
    """Spider that walks a TripAdvisor.fr restaurant listing and prints reviews.

    Starting from the listing page in ``start_urls``, ``parse`` follows the
    pagination links and the links to individual places.  ``parse_place``
    prints every review whose text is present on the place page and, for the
    others, requests the "expanded review" page handled by ``parse_review``.

    NOTE(review): Scrapy's documentation says ``CrawlSpider`` uses ``parse``
    internally to drive its ``rules``.  No ``rules`` are declared here and
    ``parse`` is overridden, so the class presumably behaves like a plain
    spider -- confirm before adding rules.
    """

    name = 'tripadvisor'
    allowed_domains = ['www.tripadvisor.fr']
    start_urls = (
        'https://www.tripadvisor.fr/Restaurants-g187147-Paris_Ile_de_France.html',
    )

    # Matches the "-d<digits>" segment of a page URL.  A plain raw string is
    # used because the ``ur`` prefix is a syntax error on Python 3; the
    # pattern is compiled once instead of on every call.
    _place_id_pattern = re.compile(r'-d(\d+)')

    def build_review_full_link(self, review_id, response):
        """Return the URL of the expanded page for a single review.

        Args:
            review_id: Review identifier (the element id without its
                ``review_`` prefix).
            response: Response whose ``url`` contains a ``-d<digits>``
                segment; those digits are inserted into the returned URL.

        Returns:
            The expanded-review URL as a string.

        Raises:
            ValueError: If ``response.url`` has no ``-d<digits>`` segment.
        """
        match = self._place_id_pattern.search(response.url)
        if match is None:
            # Fail with an explicit message rather than the AttributeError
            # that calling ``.groups()`` on ``None`` would produce.
            raise ValueError(
                'no "-d<digits>" segment found in URL: %s' % response.url
            )
        request_id = match.group(1)
        review_url = 'https://www.tripadvisor.fr/ExpandedUserReviews-d%(request_id)s' \
            '?target=%(review_id)s&reviews=%(review_id)s&servlet=Attraction_Review&expand=0' % {
                'review_id': review_id,
                'request_id': request_id
            }
        return review_url

    def parse(self, response):
        """Follow pagination links and place links found on a listing page.

        Yields:
            A ``Request`` back to ``parse`` for every ``pageNum`` link and a
            ``Request`` to ``parse_place`` for every place title link.
        """
        for page_link in response.xpath('//a[contains(@class, "pageNum")]/@href').extract():
            yield Request(
                urljoin(response.url, page_link),
                callback=self.parse
            )
        for place_link in response.xpath('//h3[@class="title"]/a/@href').extract():
            yield Request(
                urljoin(response.url, place_link),
                callback=self.parse_place
            )

    def parse_place(self, response):
        """Print the reviews of a place page, requesting expanded ones as needed.

        Reviews with a ``<p class="entry">`` element are printed directly.
        For the others a request to the expanded-review page is yielded.

        Yields:
            A ``Request`` to ``parse_review`` for each review without an
            inline entry.
        """
        for review_elem in response.xpath('//div[contains(@class,"reviewSelector")]'):
            inline_entries = review_elem.xpath('.//p[@class="entry"]').extract()
            if inline_entries:
                print(inline_entries[0])
                continue

            # No inline entry: the review id is needed to build the
            # expanded-review URL.
            element_ids = review_elem.xpath('./@id').extract()
            if not element_ids:
                # Skip this node instead of letting an IndexError abort the
                # processing of every remaining review on the page.
                self.logger.warning(
                    'Review element without an id on %s', response.url
                )
                continue
            review_id = element_ids[0].replace('review_', '')

            try:
                review_url = self.build_review_full_link(review_id, response)
            except ValueError as error:
                self.logger.warning(
                    'Cannot expand review %s: %s', review_id, error
                )
                continue
            yield Request(
                review_url,
                callback=self.parse_review
            )

    def parse_review(self, response):
        """Print the first text node of the review on an expanded-review page."""
        texts = response.xpath('//div[@class="entry"]/p/text()').extract()
        if not texts:
            # An empty page used to raise IndexError; report it instead.
            self.logger.warning('No review text found on %s', response.url)
            return
        print(texts[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment