-
-
Save ryr/6a2d8997057a70be7eb3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import random | |
from datetime import datetime | |
from decimal import Decimal | |
import grequests | |
from pony.orm import Database, Optional, Required, PrimaryKey, LongUnicode, db_session | |
from pyquery import PyQuery as pq | |
# https://gist.github.com/ryr/e61414b94055bd1a9659 | |
from adapter import requests_session, request_rate_limit | |
# Pool of browser User-Agent strings; the crawl picks one at random for each
# request it builds.
user_agent = [
    # Chromium / Chrome / Opera (Blink) on desktop
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36 OPR/20.0.1387.64 (Edition Yx)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36 OPR/20.0.1387.64 (Edition Yx)',
    # Firefox on desktop
    'Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0',
    # Android browsers
    'Mozilla/5.0 (Linux; U; Android 4.1.2; ru-ru; GT-P3100 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30',
    'Mozilla/5.0 (Linux; U; Android 4.1.2; ru; LG-P715 Build/JZO54K) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 UCBrowser/9.5.0.360 U3/0.8.0 Mobile Safari/533.1',
    'Opera/9.80 (Android; Opera Mini/7.28879/27.1662; U; ru) Presto/2.8.119 Version/11.10 UCBrowser/8.6.1.262/145/33482/',
    'Mozilla/4.0 (compatible;Android;320x480)',
    # Internet Explorer 8
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
]
# Pony ORM handle for the local MySQL database that stores the crawl results.
# NOTE(review): the connection credentials are hard-coded in source; consider
# loading them from configuration or the environment instead.
db = Database(
    'mysql',
    host='localhost',
    user='root',
    passwd='q1w2e3r4',
    db='viasun',
)
class Tophotel(db.Entity):
    """Hotel page stored in the ``hotel_tophotel`` table."""

    _table_ = 'hotel_tophotel'

    # Primary key; add_hotel() supplies the id taken from the hotel page URL.
    hotel_id = PrimaryKey(int, auto=True, column='TophotelID')

    # Cleaned HTML of the hotel description.
    content = Optional(LongUnicode, column='Content')

    # Scores are DECIMAL(precision=2, scale=1).
    # NOTE(review): that range tops out at 9.9 while add_hotel() stores the
    # site's scores doubled -- confirm the columns are wide enough.
    placement = Optional(Decimal, 2, 1, column='Placement')
    service = Optional(Decimal, 2, 1, column='Service')
    food = Optional(Decimal, 2, 1, column='Food')
    rating = Optional(Decimal, 2, 1, column='Rating')

    votes = Optional(int, column='Votes')
class Comment(db.Entity):
    """Single hotel review stored in the ``hotel_comment`` table."""

    _table_ = 'hotel_comment'

    # Primary key; add_comments() supplies the id taken from the review link.
    comment_id = PrimaryKey(int, auto=True, column='CommentID')

    # Id of the hotel the review belongs to (plain indexed int, not a
    # relationship to Tophotel).
    hotel_id = Required(int, column='TophotelID', index=True)

    name = Optional(unicode, column='Name')
    created_at = Optional(datetime, column='Created')
    content = Optional(LongUnicode, column='Content')

    # Individual grades converted to integers through the ``rates`` table.
    placement = Optional(int, column='Placement')
    service = Optional(int, column='Service')
    food = Optional(int, column='Food')

    # Mean of the three grades, DECIMAL(precision=2, scale=1).
    rating = Optional(Decimal, 2, 1, column='Rating')
# Bind the entity classes declared above to the database; create_tables=True
# asks Pony to create any tables that do not exist yet.
db.generate_mapping(create_tables=True)
# Converts the textual grades shown on a review ('5', '5-', ..., '0') into
# integers from 0 to 10: a whole grade N maps to 2*N and 'N-' to 2*N - 1.
rates = {
    '0': 0,
    '1-': 1, '1': 2,
    '2-': 3, '2': 4,
    '3-': 5, '3': 6,
    '4-': 7, '4': 8,
    '5-': 9, '5': 10,
}
def _stripped_text(node):
    """Return the whitespace-stripped text of a PyQuery selection ('' if it has none)."""
    return (node.text() or '').strip(' \t\n\r')


@db_session
def add_comments(hotel_id, comments):
    """Store every review in *comments* that is not in the database yet.

    :param hotel_id: id of the hotel the reviews belong to (int or digit string).
    :param comments: PyQuery selection of ``.reviews-item`` elements.

    Reviews without a recognisable ``/main/viewrate/?id=N`` link are skipped,
    as are reviews whose id is already stored. A review whose timestamp cannot
    be parsed is stored with ``created_at`` left empty instead of aborting the
    whole batch.
    """
    hotel_id = int(hotel_id)
    for element in comments:
        comment = pq(element)

        # The link may be missing altogether, in which case attr returns None.
        href = comment.find('a.reviews-item-title-link').attr['href']
        match = re.search(r'/main/viewrate/\?id=(\d+)', href or '')
        if not match:
            continue
        comment_id = int(match.group(1))

        # Check for an existing row before doing any further parsing.
        if get_comment(comment_id):
            continue

        name = (comment.find('.reviews-item-user-name').text()
                or comment.find('.reviews-item-user-login').text()
                or '')
        try:
            created_at = datetime.strptime(
                _stripped_text(comment.find('.reviews-item-time')),
                "%d.%m.%Y %H:%M")
        except ValueError:
            created_at = None
        content = _stripped_text(comment.find('.reviews-item-text'))

        # Unknown or missing grades count as 0.
        # NOTE(review): here the second cell is read as food and the third as
        # service, while parse_hotel reads service before food -- confirm the
        # column order on the page.
        grades = comment.find('.reviews-item-rating-td_num')
        placement = rates.get(grades.eq(0).text(), 0)
        food = rates.get(grades.eq(1).text(), 0)
        service = rates.get(grades.eq(2).text(), 0)
        rating = round(float(placement + food + service) / 3, 1)

        Comment(hotel_id=hotel_id,
                comment_id=comment_id,
                name=name,
                created_at=created_at,
                content=content,
                placement=placement,
                service=service,
                food=food,
                rating=rating,
                )
@db_session
def update_comment(comment_id, content):
    """Replace the stored text of the comment with primary key *comment_id*.

    The comment is fetched with a primary-key lookup (``Comment[...]``), so
    the call fails if no such comment is stored.
    """
    Comment[comment_id].content = content
@db_session
def get_comment(comment_id):
    """Look up a stored comment by primary key; the result is falsy when none exists."""
    stored = Comment.get(comment_id=comment_id)
    return stored
def _doubled_score(value):
    """Return ``float(value) * 2``, or None when *value* is missing or not numeric."""
    try:
        return float(value) * 2
    except (TypeError, ValueError):
        return None


@db_session
def add_hotel(hotel_id, content, votes, rating, placement, service, food):
    """Insert a hotel row unless one with *hotel_id* already exists.

    :param hotel_id: hotel id (int or digit string).
    :param content: cleaned HTML description of the hotel.
    :param votes: number of votes as text or int; stored as NULL when it
        cannot be parsed.
    :param rating: overall score; stored doubled, or as 0 when it is not a
        number.
    :param placement, service, food: sub-scores; stored doubled, or as NULL
        when missing (parse_hotel passes None if the page lacks them) or not
        numeric.
    """
    hotel_id = int(hotel_id)
    if get_hotel(hotel_id):
        return

    try:
        votes = int(votes)
    except (TypeError, ValueError):
        votes = None

    # A non-numeric overall rating keeps the historical fallback of 0.
    overall = _doubled_score(rating)
    if overall is None:
        overall = 0.0

    Tophotel(hotel_id=hotel_id,
             content=content,
             votes=votes,
             rating=overall,
             placement=_doubled_score(placement),
             service=_doubled_score(service),
             food=_doubled_score(food),
             )
@db_session
def get_hotel(hotel_id):
    """Look up a stored hotel by primary key; the result is falsy when none exists."""
    stored = Tophotel.get(hotel_id=hotel_id)
    return stored
class ProcessResponse(object):
    """Response hook that parses crawled pages and keeps progress counters."""

    total = 0      # number of requests scheduled
    processed = 0  # pages parsed and handed to the database layer
    current = 0    # responses received so far
    code = ''      # marker printed for the latest response (HTTP status, '', '-' or '--')

    def __init__(self, total):
        super(ProcessResponse, self).__init__()
        self.total = total

    def process_url(self, r, *args, **kwargs):
        """Count the response *r*, parse it when it is a 200 and print a progress line."""
        self.current += 1
        self.code = r.status_code
        if r.status_code == 200:
            self.parse(r)
        print('[%s]%s - %s' % (self.status(), self.code, r.url))

    def parse(self, response):
        """Dispatch a successful response to the active parser."""
        # Comment pages are the active crawl target; call parse_hotel here
        # instead to crawl hotel pages.
        self.parse_comment(response)

    def parse_comment(self, response):
        """Store the full text of a single review page.

        Sets ``code`` to '--' when the URL is not a review page and to '-'
        when the review is not in the database.
        """
        self.code = ''
        match = re.search(r'http://tophotels\.ru/main/viewrate/\?id=(\d+)', response.url)
        if not match:
            self.code = '--'
            return
        comment_id = match.group(1)

        # update_comment() does a primary-key lookup that fails for ids that
        # were never stored, so skip those instead of raising inside the hook.
        if not get_comment(comment_id):
            self.code = '-'
            return

        q = pq(response.text)
        paragraphs = ((q(p).text() or '').strip(' \t\n\r')
                      for p in q('.reviews-item-text').children())
        content = "\r\n".join(paragraphs)
        update_comment(comment_id, content)
        self.processed += 1

    def parse_hotel(self, response):
        """Store a hotel page (description, scores, votes) and the reviews listed on it.

        Sets ``code`` to '--' when the URL is not a hotel page and to '-'
        when the page contains ``#hotel_link_form_place``.
        """
        self.code = ''
        match = re.search(r'http://tophotels\.ru/main/hotel/al(\d+)', response.url)
        if not match:
            self.code = '--'
            return
        hotel_id = match.group(1)

        q = pq(response.text)
        if len(q('#hotel_link_form_place')):
            self.code = '-'
            return

        # Drop empty children and layout-only blocks from the description.
        description = q('.hotel-text')
        for element in description.children():
            node = pq(element)
            if (not node.text()
                    or node.has_class('lnb')
                    or node.has_class('print')
                    or node.has_class('clear_both')):
                node.remove()
        # html() yields None when the description block is missing or empty.
        content = (description.html() or '').strip(' \t\n\r')

        votes = q('.dynamics-rating-brown').text()
        # The page uses a decimal comma; normalise it for float().
        rating = (q('.dynamics-rating').text() or '').replace(',', '.')

        placement, service, food = None, None, None
        hotel_rates = q('.dynamics-sub-r-rating')
        if len(hotel_rates) >= 3:
            placement = (hotel_rates.eq(0).text() or '').replace(',', '.')
            service = (hotel_rates.eq(1).text() or '').replace(',', '.')
            food = (hotel_rates.eq(2).text() or '').replace(',', '.')

        self.processed += 1
        add_hotel(hotel_id, content, votes, rating, placement, service, food)

        comments = q(".reviews-item")
        if len(comments):
            add_comments(hotel_id, comments)

    def status(self):
        """Return the counters formatted as ``total|processed|current``."""
        return '%s|%s|%s' % (self.total, self.processed, self.current)
# Range of review ids to crawl.
ids = xrange(12210, 150000)
process = ProcessResponse(len(ids))

# One lazy request per review id. ``ids`` yields plain integers, so the id is
# formatted into the URL directly (ints have no ``comment_id`` attribute).
rs = (
    grequests.get(
        'http://tophotels.ru/main/viewrate/?id=%s' % comment_id,
        headers={'User-Agent': random.choice(user_agent)},
        allow_redirects=False,
        session=requests_session,
        hooks=dict(response=process.process_url),
    )
    for comment_id in ids
)

# Send the requests with at most ``request_rate_limit`` in flight; parsing
# happens in the response hook.
responses = grequests.map(rs, size=request_rate_limit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment