Skip to content

Instantly share code, notes, and snippets.

@ryr

ryr/crawler.py Secret

Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ryr/6a2d8997057a70be7eb3 to your computer and use it in GitHub Desktop.
Save ryr/6a2d8997057a70be7eb3 to your computer and use it in GitHub Desktop.
import os
import random
import re
from datetime import datetime
from decimal import Decimal

import grequests
from pony.orm import Database, Optional, Required, PrimaryKey, LongUnicode, db_session
from pyquery import PyQuery as pq

# https://gist.github.com/ryr/e61414b94055bd1a9659
from adapter import requests_session, request_rate_limit
# Pool of User-Agent header values; one entry is picked at random for every
# request issued by the crawl at the bottom of this file.
user_agent = [
    # Desktop Chromium / Chrome / Opera (Blink) / Firefox
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36 OPR/20.0.1387.64 (Edition Yx)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36 OPR/20.0.1387.64 (Edition Yx)',
    'Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0',
    # Android browsers
    'Mozilla/5.0 (Linux; U; Android 4.1.2; ru-ru; GT-P3100 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30',
    'Mozilla/5.0 (Linux; U; Android 4.1.2; ru; LG-P715 Build/JZO54K) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 UCBrowser/9.5.0.360 U3/0.8.0 Mobile Safari/533.1',
    'Opera/9.80 (Android; Opera Mini/7.28879/27.1662; U; ru) Presto/2.8.119 Version/11.10 UCBrowser/8.6.1.262/145/33482/',
    'Mozilla/4.0 (compatible;Android;320x480)',
    # Internet Explorer 8
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
]
# Database connection shared by every entity below.
# Each setting can be overridden through an environment variable so that
# credentials do not have to live in the source; the fallbacks are the values
# this script has always used.
# NOTE(review): the fallback is a root password committed to the source -
# rotate it and rely on VIASUN_DB_PASSWD instead.
db = Database(
    'mysql',
    host=os.environ.get('VIASUN_DB_HOST', 'localhost'),
    user=os.environ.get('VIASUN_DB_USER', 'root'),
    passwd=os.environ.get('VIASUN_DB_PASSWD', 'q1w2e3r4'),
    db=os.environ.get('VIASUN_DB_NAME', 'viasun'),
)
class Tophotel(db.Entity):
    """Hotel record mapped onto the ``hotel_tophotel`` table.

    Apart from the primary key every column is optional.
    """
    _table_ = 'hotel_tophotel'

    hotel_id = PrimaryKey(int, auto=True, column='TophotelID')
    content = Optional(LongUnicode, column='Content')

    # Scores are kept as DECIMAL(2, 1).
    # NOTE(review): precision 2 / scale 1 cannot represent 10.0, while
    # add_hotel() stores the parsed value multiplied by two - confirm that a
    # doubled score can never reach 10.
    placement = Optional(Decimal, precision=2, scale=1, column='Placement')
    service = Optional(Decimal, precision=2, scale=1, column='Service')
    food = Optional(Decimal, precision=2, scale=1, column='Food')
    rating = Optional(Decimal, precision=2, scale=1, column='Rating')

    votes = Optional(int, column='Votes')
class Comment(db.Entity):
    """Hotel review mapped onto the ``hotel_comment`` table.

    ``hotel_id`` is a plain indexed integer column (``TophotelID``), not a
    Pony relationship to :class:`Tophotel`.
    """
    _table_ = 'hotel_comment'

    comment_id = PrimaryKey(int, auto=True, column='CommentID')
    hotel_id = Required(int, column='TophotelID', index=True)

    name = Optional(unicode, column='Name')
    created_at = Optional(datetime, column='Created')
    content = Optional(LongUnicode, column='Content')

    # Individual grades are stored as integers (see the ``rates`` table).
    placement = Optional(int, column='Placement')
    service = Optional(int, column='Service')
    food = Optional(int, column='Food')

    # NOTE(review): DECIMAL(2, 1) cannot represent 10.0, yet add_comments()
    # stores the mean of three grades that may each be 10 - confirm.
    rating = Optional(Decimal, precision=2, scale=1, column='Rating')
# Bind the entities declared above to the database; create_tables=True asks
# Pony to create any mapped table that does not exist yet.
db.generate_mapping(create_tables=True)
# Grade text as it appears in a review -> integer score from 10 down to 0.
# A trailing '-' is worth one point less than the plain grade.
rates = dict(zip(
    ('5', '5-', '4', '4-', '3', '3-', '2', '2-', '1', '1-', '0'),
    range(10, -1, -1),
))
@db_session
def add_comments(hotel_id, comments):
    """Store the reviews found on a hotel page that are not in the database yet.

    :param hotel_id: id of the hotel the reviews belong to (anything ``int()``
        accepts).
    :param comments: PyQuery selection holding one node per review.

    Reviews without a recognisable ``/main/viewrate/?id=N`` title link and
    reviews whose id is already stored are skipped.  A missing or unparsable
    timestamp is stored as ``None`` instead of aborting the whole batch.
    """
    for node in comments:
        comment = pq(node)

        # attr['href'] is None when the title link is absent; re.search()
        # would raise TypeError on it, so fall back to an empty string.
        href = comment.find('a.reviews-item-title-link').attr['href'] or ''
        url = re.search(r'/main/viewrate/\?id=(\d+)', href)
        if not url:
            continue
        comment_id = url.group(1)

        # Check for duplicates first so stored reviews are not parsed again.
        if get_comment(comment_id):
            continue

        name = (comment.find('.reviews-item-user-name').text()
                or comment.find('.reviews-item-user-login').text())

        # text() can be empty (or None) when the node is missing.
        stamp = (comment.find('.reviews-item-time').text() or '').strip(' \t\n\r')
        try:
            created_at = datetime.strptime(stamp, "%d.%m.%Y %H:%M")
        except ValueError:
            created_at = None

        content = (comment.find('.reviews-item-text').text() or '').strip(' \t\n\r')

        # The grade cells are read in page order: placement, food, service.
        # Unknown or missing grade text counts as 0.
        marks = comment.find('.reviews-item-rating-td_num')
        placement = rates.get(marks.eq(0).text(), 0)
        food = rates.get(marks.eq(1).text(), 0)
        service = rates.get(marks.eq(2).text(), 0)
        rating = round(float(placement + food + service) / 3, 1)

        Comment(
            hotel_id=int(hotel_id),
            comment_id=int(comment_id),
            name=name,
            created_at=created_at,
            content=content,
            placement=placement,
            service=service,
            food=food,
            rating=rating,
        )
@db_session
def update_comment(comment_id, content):
    """Replace the stored text of the review identified by ``comment_id``.

    The review is looked up with ``Comment[comment_id]``, so the lookup fails
    with Pony's exception when no such row exists.
    """
    Comment[comment_id].content = content
@db_session
def get_comment(comment_id):
    """Return the review with primary key ``comment_id`` via ``Comment.get``."""
    stored = Comment.get(comment_id=comment_id)
    return stored
@db_session
def add_hotel(hotel_id, content, votes, rating, placement, service, food):
    """Insert a hotel unless a row with ``hotel_id`` already exists.

    :param hotel_id: hotel id (anything ``int()`` accepts).
    :param content: description markup stored as-is.
    :param votes: vote count as text or number; ``None``/``''`` is stored as
        ``None``.
    :param rating: overall score; stored multiplied by two.
    :param placement: placement score or ``None``; stored multiplied by two.
    :param service: service score or ``None``; stored multiplied by two.
    :param food: food score or ``None``; stored multiplied by two.

    ``parse_hotel`` passes ``None`` for the three sub-scores when the page
    lists fewer than three of them; those are stored as ``None`` rather than
    failing in ``float(None)``.
    """
    if get_hotel(hotel_id):
        return

    def doubled(value):
        # Missing scores stay NULL; everything else is doubled as before.
        if value is None or value == '':
            return None
        return float(value) * 2

    # A ``unicode`` rating is replaced by 0.
    # NOTE(review): presumably the page shows non-numeric text when a hotel
    # has no rating yet - confirm.
    if isinstance(rating, unicode):
        rating = 0.0

    Tophotel(
        hotel_id=int(hotel_id),
        content=content,
        votes=None if votes is None or votes == '' else int(votes),
        rating=doubled(rating),
        placement=doubled(placement),
        service=doubled(service),
        food=doubled(food),
    )
@db_session
def get_hotel(hotel_id):
    """Return the hotel with primary key ``hotel_id`` via ``Tophotel.get``."""
    stored = Tophotel.get(hotel_id=hotel_id)
    return stored
class ProcessResponse(object):
    """Response hook that parses crawled pages and keeps progress counters.

    ``process_url`` is meant to be registered as a ``response`` hook; it
    counts the response, parses it when the status is 200 and prints one
    progress line per response.
    """

    total = 0      # number of URLs scheduled, set by the constructor
    processed = 0  # pages that reached the persistence helpers
    current = 0    # responses seen so far
    code = ''      # marker printed for the most recent response

    # URL patterns are compiled once instead of on every response.
    _comment_url = re.compile(r'http://tophotels\.ru/main/viewrate/\?id=(\d+)')
    _hotel_url = re.compile(r'http://tophotels\.ru/main/hotel/al(\d+)')

    def __init__(self, total):
        """Remember how many URLs are going to be requested."""
        super(ProcessResponse, self).__init__()
        self.total = total

    def process_url(self, r, *args, **kwargs):
        """Handle one response ``r`` and print a progress line for it.

        ``self.code`` starts as the HTTP status and may be replaced by the
        parser with ``''``, ``'-'`` or ``'--'``.
        """
        self.current += 1
        self.code = r.status_code
        if r.status_code == 200:
            self.parse(r)
        # Single parenthesised argument: prints the same text on Python 2 and 3.
        print('[%s]%s - %s' % (self.status(), self.code, r.url))

    def parse(self, response):
        """Dispatch a successful response to the active parser."""
        # self.parse_hotel(response)
        self.parse_comment(response)

    def parse_comment(self, response):
        """Extract the full review text from a review page and store it.

        Sets ``self.code`` to ``'--'`` and returns when the response URL is
        not a review URL.
        """
        self.code = ''
        url = self._comment_url.search(response.url)
        if not url:
            self.code = '--'
            return
        comment_id = url.group(1)
        q = pq(response.text)
        # One output line per child node of the review text container.
        content = "\r\n".join(q(p).text().strip(' \t\n\r') for p in q('.reviews-item-text').children())
        update_comment(comment_id, content)
        self.processed += 1

    def parse_hotel(self, response):
        """Extract description, scores and reviews from a hotel page.

        Sets ``self.code`` to ``'--'`` when the URL is not a hotel URL and to
        ``'-'`` when the page contains ``#hotel_link_form_place``; nothing is
        stored in either case.
        """
        self.code = ''
        url = self._hotel_url.search(response.url)
        if not url:
            self.code = '--'
            return
        hotel_id = url.group(1)
        q = pq(response.text)

        # NOTE(review): pages carrying this element are skipped - presumably
        # they are placeholders without real hotel data; confirm.
        if len(q('#hotel_link_form_place')):
            self.code = '-'
            return

        content = q('.hotel-text')
        # Drop empty children and the lnb / print / clear_both helper blocks.
        for e in content.children():
            child = pq(e)
            if (not child.text() or child.has_class('lnb')
                    or child.has_class('print') or child.has_class('clear_both')):
                child.remove()
        # html() returns None when '.hotel-text' matched nothing.
        content = (content.html() or '').strip(' \t\n\r')

        votes = q('.dynamics-rating-brown').text()
        rating = q('.dynamics-rating').text().replace(',', '.')

        # Sub-scores are only read when at least three are present.
        placement, service, food = None, None, None
        hotel_rates = q('.dynamics-sub-r-rating')
        if len(hotel_rates) >= 3:
            placement = hotel_rates.eq(0).text().replace(',', '.')
            service = hotel_rates.eq(1).text().replace(',', '.')
            food = hotel_rates.eq(2).text().replace(',', '.')

        self.processed += 1
        add_hotel(hotel_id, content, votes, rating, placement, service, food)

        comments = q(".reviews-item")
        if len(comments):
            add_comments(hotel_id, comments)

    def status(self):
        """Return ``'total|processed|current'`` for the progress line."""
        return '%s|%s|%s' % (self.total, self.processed, self.current)
# Review ids to crawl.  xrange yields plain integers, so each id is formatted
# into the URL directly (an int has no ``comment_id`` attribute).
ids = xrange(12210, 150000)
process = ProcessResponse(len(ids))

# Lazily build one GET per id; every response is passed to
# process.process_url through the ``response`` hook.
rs = (
    grequests.get(
        'http://tophotels.ru/main/viewrate/?id=%s' % item,
        headers={'User-Agent': random.choice(user_agent)},
        allow_redirects=False,
        session=requests_session,
        hooks=dict(response=process.process_url),
    )
    for item in ids
)
responses = grequests.map(rs, size=request_rate_limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment