Skip to content

Instantly share code, notes, and snippets.

@Barlog951
Forked from MichalCab/aautomats_main_info.md
Created September 29, 2015 16:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Barlog951/b4ec28be3864598062ac to your computer and use it in GitHub Desktop.
Save Barlog951/b4ec28be3864598062ac to your computer and use it in GitHub Desktop.
Booking automata
# -*- coding: utf-8 -*-
import sys
#sys.path.append('/srv/scrapers')
#sys.path.append('/srv/scrapers/simple')
import re
import argparse
import logging
import csv
import requests
import json
import lxml.html
import pycurl
import pytz
import socket
import traceback
import ujson
from pytz import country_timezones
import random
from random import choice
from decimal import Decimal
from itertools import islice
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from requests import Session
from random import choice
from collections import OrderedDict
from grab import Grab
from pprint import pprint as pp
import os
from time import sleep
import time
sys.path.append('/root/Scrapers')
sys.path.append('/srv/Scrapers')
from scraperlib.s_grab import *
# HTTP proxies ("host:port"). BaseAirline.prepare_input() picks one at random
# for the booking session unless the script runs on one of the dev_ips below.
booking_proxies = [
"192.81.214.211:8888",
"37.139.23.93:8888",
"128.199.221.61:8888",
"178.62.50.177:8888",
"188.226.169.149:8888",
"192.81.212.107:8888",
"107.170.165.55:8888",
]
# Local IP addresses for which prepare_input() does not configure a proxy.
dev_ips = [
"146.185.172.28",
"188.166.6.171",
]
# Timezone used for timestamps in saved-page names and in the JSON output:
# the first zone pytz lists for country code "CZ".
tz = pytz.timezone(country_timezones("CZ")[0])
def save_file(filename="test.html", path="/srv/Scrapers/booking/airlines/html/", body=""):
    """Write *body* to a timestamped ``.html`` file under *path*.

    The target name is ``<path><filename>_<HH-MM_dd-mm-YYYY>.html``; every
    ``.html`` already present in that string is stripped before the final
    suffix is appended.

    :param filename: base name of the page (with or without ``.html``).
    :param path: directory prefix, expected to end with ``/``.
    :param body: raw page content to write.
    """
    stamp = datetime.now(tz).strftime("%H-%M_%d-%m-%Y")
    target = ("%s%s_%s" % (path, filename, stamp)).replace(".html", "") + ".html"
    print("saving page as %s ..." % target)
    with open(target, "wb+") as out:
        out.write(body)
class error_payment_failed(Exception):
    """Raised when the payment step of a booking fails.

    :param info: optional free-form detail about the failure.
    """

    def __init__(self, info=""):
        # Pass the message to Exception so str(e) and tracebacks are not blank.
        super(error_payment_failed, self).__init__("payment_failed")
        self.message = "payment_failed"
        self.info = info
class BaseAirline(object):
    """Base class for all airline booking scrapers.

    Provides the shared plumbing: loading the booking request JSON,
    preparing passengers / flights / payment data together with a
    configured ``sGrab`` client, checking the price, saving fetched pages
    and writing the JSON result or error to stderr.

    Subclasses must set :attr:`code` and :attr:`childs_max_age`; the
    subclasses in this file also implement ``book_flight()``.
    """

    #! IATA airline code.
    code = None
    # Age limit in years: passengers born more recently than this many years
    # ago are treated as children by prepare_input().
    childs_max_age = None
    folder = '/srv/results/'
    output = ''
    # Path of the last page written by save_file(); included in JSON output.
    html_url = ""
    # Converted amount accepted by the last check_price() call.
    price = 0.0

    def __init__(self):
        """Initializes airline scraper."""
        assert self.code, "IATA airline code must be defined."
        self._session = Session()

    def _to_html(self, response):
        """Parses given HTTP response into HTML DOM object.

        :param response: HTTP response.
        :type response: :class:`requests.Response`
        """
        return lxml.html.fromstring(response.text)

    def _to_price(self, value, thousand_sep=',', dec_sep='.'):
        """Parses given string into Decimal object holding
        amount of currency. Separator defaults are set according to
        English customs.

        :param value: text containing the amount (other characters are dropped).
        :param thousand_sep: thousands separator to remove.
        :param dec_sep: decimal separator to normalise to ``.``.
        """
        value = value.replace(thousand_sep, '').replace(dec_sep, '.')
        value = re.sub(r'[^\d\.]', '', value)
        return Decimal(value)

    def parse_args(self):
        """Load the booking request named by ``--json_name`` into ``self.json_data``.

        Flight ``departure``/``arrival``, passenger ``birthday`` and the card
        ``exp`` value are converted from strings to :class:`datetime`.
        Prints the help text and exits with status 1 when the script is run
        without any argument.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("--json_name", type=str,
                            help="path to json file")
        if len(sys.argv) == 1:
            parser.print_help()
            sys.exit(1)
        args = parser.parse_args()
        with open(args.json_name, "r") as f:
            self.json_data = json.loads(f.read())
        for flight in self.json_data["flights"]:
            flight['departure'] = datetime.strptime(flight['departure'], "%Y-%m-%d %H:%M")
            flight['arrival'] = datetime.strptime(flight['arrival'], "%Y-%m-%d %H:%M")
        for passenger in self.json_data["passengers"]:
            passenger['birthday'] = datetime.strptime(passenger['birthday'], "%Y-%m-%d")
        self.json_data["exp"] = datetime.strptime(self.json_data["exp"], "%m/%y")

    @staticmethod
    def _local_ip():
        """Return the local address of a UDP socket connected to 8.8.8.8:80."""
        probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            probe.connect(('8.8.8.8', 80))
            return probe.getsockname()[0]
        finally:
            # Close the socket even when connect() fails.
            probe.close()

    def prepare_input(self):
        """Prepare input data for booking scraper (childs, return_flight, userfriendly names).

        :returns: tuple ``(passengers, childs, baggage, card, cvc, expiration,
            max_price, email, phone, holder, departure_flight, return_flight,
            g, childs_b_days)`` where ``g`` is a configured ``sGrab`` client.
        """
        assert self.childs_max_age, "childs_max_age is not set."
        passengers = sorted(self.json_data["passengers"], key=lambda x: x["birthday"])
        child_cutoff = datetime.now() - timedelta(days=365.25 * self.childs_max_age)
        childs = [p for p in passengers if p["birthday"] > child_cutoff]
        childs_b_days = [p["birthday"] for p in childs]
        #TODO new
        #infants = [p for p in childs if p["birthday"] > datetime.now() - timedelta(days = 365.25*self.infants_max_age)]
        #infants_b_days = [p["birthday"] for p in childs if p["birthday"] > datetime.now() - timedelta(days = 365.25*self.infants_max_age)]
        #TODO new
        flights = self.json_data["flights"]
        baggage = sum(int(x["bags"]) for x in self.json_data["passengers"])
        card = self.json_data["cardnumber"]
        cvc = self.json_data["CVV"]
        expiration = self.json_data["exp"]
        max_price = self.json_data["maxprice"]
        email = self.json_data["email"]
        phone = self.json_data["phone"]
        holder = self.json_data["nameoncard"]

        # With two flights, the earlier departure is the outbound one.
        return_flight = None
        if len(flights) > 1:
            if flights[1]["departure"] > flights[0]["departure"]:
                return_flight = flights[1]
                departure_flight = flights[0]
            else:
                return_flight = flights[0]
                departure_flight = flights[1]
        else:
            departure_flight = flights[0]

        g = sGrab()
        # NOTE(review): TLS certificate and host verification are switched off here.
        g.transport.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        g.transport.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        #g.transport.curl.setopt(pycurl.SSLVERSION, 3)
        g.transport.curl.setopt(pycurl.SSL_CIPHER_LIST, 'SSLv3')
        if self._local_ip() not in dev_ips:
            proxy = choice(booking_proxies)
            print("I will use this proxy %s" % proxy)
            self.proxy = proxy
            g.setup(proxy=proxy, proxy_type='http', connect_timeout=5, timeout=5)
        g.setup(hammer_mode=True, hammer_timeouts=((40, 40),))

        # set default adult if only child is in input and it is only check!
        # (compared with ==; the former `is` tests relied on object identity)
        if len(passengers) == len(childs) and int(max_price) == 1:
            childs = []
            passengers = [{
                "bags": baggage,
                "birthday": datetime.now() - timedelta(days=365.25 * 25),
                "cardno": "A9449115",
                "doctype": "P",
                "expiration": "2064-11-19",
                "familyname": "Novak",
                "firstname": "Jan",
                "nationality": "CZ",
                "title": "Mr",
                "visa": None
            }]
        print("PASS %s" % ", ".join(["%s %s" % (p["firstname"], p["familyname"]) for p in passengers]))
        return (passengers, childs, baggage, card, cvc, expiration,
                max_price, email, phone, holder, departure_flight, return_flight, g, childs_b_days)

    def _to_datetime(self, value, format='%Y-%m-%dT%H:%M:%S'):
        """Parses :class:`datetime.datetime`. If both *value* and *format*
        given as iterables of two elements, they're considered
        separately as date and time.

        :raises TypeError: when only one of *value* / *format* is a string.
        """
        value_is_str = isinstance(value, basestring)
        format_is_str = isinstance(format, basestring)
        if not value_is_str and not format_is_str:
            # parse date and time separately, then combine
            return datetime.combine(
                datetime.strptime(value[0], format[0]).date(),
                datetime.strptime(value[1], format[1]).timetz(),
            )
        if value_is_str and format_is_str:
            # parse date and time together
            return datetime.strptime(value, format)
        raise TypeError("Unrecognized combination of arguments.")

    def save_file(self, filename="page.html", path="/srv/Scrapers/booking/airlines/html/", body=""):
        """Save html page to file, with timestamp in filename.

        The page goes to ``<path><code>/`` (created when missing); the name
        also carries the booking id (``bid`` from the request, 0 when
        absent) and a random suffix. The resulting path is remembered in
        ``self.html_url``.
        """
        path += "%s/" % self.code
        if not os.path.exists(path):
            os.makedirs(path)
        tz = pytz.timezone(country_timezones("CZ")[0])
        bid = 0
        try:
            if "bid" in self.json_data:
                bid = self.json_data["bid"]
        except Exception as e:
            # e.g. json_data not loaded yet; keep bid = 0 and carry on
            print(e)
        final_name = "%s%s_%s_%s_%s" % (path, filename, bid, datetime.now(tz).strftime("%H-%M_%d-%m-%Y"), random.random())
        final_name = final_name.replace(".html", "")
        final_name += ".html"
        print("saving page as %s ..." % final_name)
        with open(final_name, "wb+") as f:
            f.write(body)
        self.html_url = final_name

    def output_error(self, msg='not_found', data="", html_url=""):
        """Write a JSON error record to stderr and exit with status 1.

        :param msg: error code put in the ``error`` field.
        :param data: extra detail put in the ``data`` field.
        :param html_url: page to report; defaults to the last page saved by
            :meth:`save_file` (previously this argument was silently ignored).
        """
        html_url = html_url or self.html_url
        sys.stderr.write(ujson.dumps({'error': msg, 'data': data, 'html_url': html_url, 'datetime': datetime.now(tz).strftime("%d-%m-%Y_%H:%M")}))
        sys.exit(1)

    #depreciate
    def output_pricechange(self, new_price=1):
        """Deprecated: write a price-change JSON record to stderr and exit with status 1."""
        sys.stderr.write(json.dumps({"max_price": self.json_data["maxprice"], "fresh_price": float(new_price), "timestamp": datetime.now(tz).strftime("%d-%m-%Y_%H:%M")}))
        sys.exit(1)

    # use this
    def check_price(self, amount, currency):
        """Convert *amount* and abort the booking when it exceeds ``maxprice``.

        The amount is multiplied by the ``value`` returned by the skypicker
        rates endpoint for *currency*. When the result is above
        ``json_data["maxprice"]`` a ``price_change`` JSON record is written
        to stderr and the process exits with status 1; otherwise the result
        is stored in ``self.price`` and ``self.start_time`` is set.
        """
        amount = self._to_price(str(amount))
        print("%s %s" % (amount, currency))
        skypicker_currency_url = "https://cz.skypicker.com/rates/"
        rates = self._session.get(skypicker_currency_url + currency.lower()).json()
        amount = float(rates['value']) * float(amount)
        if amount > float(self.json_data["maxprice"]):
            error_msg = {
                "ids": [f['id'] for f in self.json_data["flights"]],
                "max_price": self.json_data["maxprice"],
                "fresh_price": float(amount),
                "status": "price_change",
                "datetime": datetime.now(tz).strftime("%H-%M_%d-%m-%Y")
            }
            sys.stderr.write(json.dumps(error_msg))
            # sys.exit, not the site-provided exit(), which is not always defined
            sys.exit(1)
        self.start_time = time.time()
        #self.waiting_on_semaphore()
        self.price = amount
        print("%s EUR" % amount)

    def waiting_on_semaphore(self):
        """Poll the booking-status endpoint until the booking may proceed.

        * ``canceled`` raises ``Exception("payment canceled")``;
        * ``ok`` merges the response into ``self.json_data`` (card data);
        * ``pending`` polls again after 5 seconds, or calls
          ``self.book_flight()`` once more than 5 minutes have passed since
          ``self.start_time``;
        * any other status returns without doing anything.
        """
        semaphore_url = "https://cz.skypicker.com/api/v0.1/automatic_booking_process_status"
        while True:
            status_data = self._session.get(semaphore_url + "?bid=%s&iata=%s" % (self.json_data["bid"], self.code)).json()
            # Statuses come from parsed JSON, so they must be compared by
            # value; the former `is` comparisons were never reliably true.
            status = status_data["status"]
            if status == "canceled":
                raise Exception("payment canceled")
            if status == "ok":
                self.json_data.update(status_data) #update card data
                return
            if status != "pending":
                return
            if time.time() - self.start_time > (60 * 5):
                self.book_flight()
                return
            sleep(5)

    def output_res_number(self, reservation_number, additional_info=None):
        """Write the successful-booking JSON record to stderr.

        :param reservation_number: reservation code obtained from the airline.
        :param additional_info: optional dict merged into the record.
        :returns: ``True``.
        """
        response = {"reservation_number": reservation_number, "price": str(self.price), 'html_url': self.html_url, 'datetime': datetime.now(tz).strftime("%d-%m-%Y_%H:%M")}
        if additional_info:
            response.update(additional_info)
        sys.stderr.write(json.dumps(response))
        return True
        #sys.exit(1)

    ## helpers
    #depreciate
    def to_eur(self, curr, amount):
        """Deprecated: multiply *amount* by the skypicker rate for *curr*, rounded to 2 places."""
        rates = self._session.get("https://cz.skypicker.com/rates/" + curr.lower()).json()
        return round(rates['value'] * amount, 2)

    def parse_price(self, string):
        """Extract a float price from free-form text.

        The text is split into digit groups. A single group is the whole
        number. With several groups, a last group that is not exactly three
        characters long is taken as the decimal part; otherwise every group
        is a thousands group.
        """
        prices = re.findall(r"([\d+]+)", string)
        float_places = 0
        full_numbers = 0
        if len(prices) == 1:
            float_places = float(prices[0])
        elif len(prices) > 1:
            if len(prices[-1]) != 3:
                float_places = float("0." + prices[-1])
                full_numbers = int("".join(prices[0:-1]))
            else:
                full_numbers = int("".join(prices))
        return float(float_places) + float(full_numbers)

    #helper for develop
    def compare_dicts(self, original, used):
        """Print every key of *original* that is missing from, or differs in, *used*."""
        for k, v in original.items():
            if k in used:
                if str(v) != str(used[k]):
                    print("KEY: %s ... >%s<(orig) VS >%s<(script)" % (k, v, used[k]))
            else:
                print("KEY %s:%s is not in params" % (k, v))
# Error codes shared between the booking scripts (passed to output_error())
# and the caller that turns them into BookingError instances.
SEARCH_FAILED = "search_failed"
PAYMENT_FAILED = "payment_failed"
PRICE_CHANGED = "price_changed"
GETTING_RES_CODE_FAILED = "getting_res_code_failed"
UNKNOW_CURRENCY = "unknow_currency"
FLIGHT_NOT_FOUND = "flight_not_found"
BOOKING_ON_MAIL = "booking_on_mail"
CANT_BOOK_BAGS = "cant_book_bags"
CANT_BOOK_BABY = "cant_book_baby"
DUPLICATE_NAMES = "duplicate_names"
UNEXPECTED_ERROR = "unexpected_error"
AIRLINE_WEB_DOWN = "airline_web_down"
LOGIN_FAILED = "login_failed"
USE_POLICY = "use_policy"

# Message template per error code; "%s" receives the caller-supplied info.
ERR_CODES = {
    SEARCH_FAILED:"Search failed. %s",
    PAYMENT_FAILED:"Payment failed. %s",
    PRICE_CHANGED:"Price changed. %s",
    GETTING_RES_CODE_FAILED:"Problem with get reservation code. %s",
    UNKNOW_CURRENCY:"Unknow currency. %s",
    FLIGHT_NOT_FOUND:"Flight not found. %s",
    BOOKING_ON_MAIL:"Booking on email. %s",
    CANT_BOOK_BAGS:"Can't book bags. %s",
    CANT_BOOK_BABY:"Can't book baby or child. %s",
    DUPLICATE_NAMES:"Passengers with same name did not pass validation. %s",
    UNEXPECTED_ERROR:"Unexpected error %s",
    AIRLINE_WEB_DOWN:"Arline website seems down %s",
    LOGIN_FAILED:"Can't log in on airline website %s",
    USE_POLICY:"Airline blocking our booking automatas %s"
}

# Base URL under which saved booking pages are published.
html_url_path = "http://www3.skypicker.com:12555/last_page_of_booking/"


class BookingError(Exception):
    """Booking failure identified by one of the ERR_CODES keys.

    :param error_code: key of ERR_CODES; unknown codes fall back to the
        UNEXPECTED_ERROR message.
    :param info: detail interpolated into the message template.
    :param html_url: path of the saved page; only its last path component
        is kept and appended to ``html_url_path``.
    """

    def __init__(self, error_code, info="", html_url=""):
        self.error_code = error_code
        if html_url != "":
            html_url = html_url_path + html_url.split("/")[-1]
        self.html_url = html_url
        try:
            self.message = ERR_CODES[error_code] % info
        except (KeyError, TypeError):
            # Unknown code, or info that cannot be interpolated as given.
            print("%s not found!" % error_code)
            # Format the fallback too, so the message never carries a raw "%s".
            self.message = ERR_CODES[UNEXPECTED_ERROR] % (info,)
        super(BookingError, self).__init__(self.message)

    def __str__(self):
        return "%s: %s" % (self.error_code, self.message)
""" README
Feel free to add new error codes ;)
USAGE in booking scripts:
self.output_error(SEARCH_FAILED)
self.output_error(LOGIN_FAILED)
in core.py it is called like:
if jsn.get("error") in ERR_CODES:
raise BookingError(jsn.get("error"), jsn.get("data"))
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-\n
import ujson
import urllib
import pycurl
import re
import sys
import json
import ast
import lxml
import lxml.html
import traceback
from time import sleep
from pprint import pprint as pp
from datetime import datetime, timedelta
from airlines import *
from airlines_exceptions import *
reload(sys)
sys.setdefaultencoding('utf-8')
__author__ = "your name"
__editor__ = "" #TODO
class Airline(BaseAirline):
    """Skeleton for a new airline booking script; fill in the TODO parts."""

    code = "" #TODO
    childs_max_age = 0 #TODO

    def book_flight(self):
        """Run the booking flow: prepare input, check the price, pay, report.

        Writes the reservation record on success; any exception raised in
        the payment step is reported as PAYMENT_FAILED.
        """
        (passengers, childs, baggage, card, cvc, expiration, max_price, email,
         phone, holder, departure_flight, return_flight, g, childs_b_days) = self.prepare_input()

        #TODO magic
        contact_detail = {
            "title": "MR",
            "firstName": "Oliver",
            "lastName": "Dlouhy",
            "street": "Bakalovo nabrezi 2",
            "zipCode": "63900",
            "city": "Brno",
            "country": "CZ",
            "email": email,
            "repeatemail": email,
            "phoneNumber": "+380" + phone,
        }

        # `price` and `currency` are not defined in this template; the
        # "magic" step above is expected to provide them.
        self.check_price(price, currency)

        g.setup(hammer_mode=True, hammer_timeouts=((300, 300),))
        try:
            #TODO payment
            self.save_file(filename="airline.html", body=g.response.body)
            self.output_res_number("not parsed yet") #TODO ask me
        except Exception:
            self.output_error(msg=PAYMENT_FAILED)


if __name__ == "__main__":
    airline = Airline()
    airline.parse_args()
    airline.book_flight()
#!/usr/bin/env python
# -*- coding: utf-8 -*-\n
import ujson
import urllib
import pycurl
import re
import sys
import json
from lxml import etree
from time import sleep
from pprint import pprint as pp
from datetime import datetime, timedelta
from grab import Grab
from airlines_exceptions import *
from airlines import *
from airlines_exceptions import BookingError
__author__ = 'Ladislav Radoň, lada@skypicker.com'
reload(sys)
sys.setdefaultencoding('utf-8')
class EastarJet(BaseAirline):
    """Booking script for airline code ZE, driven through www.eastarjet.com."""

    code = "ZE"
    childs_max_age = 12

    # Date/time format in which flight times are looked up inside a journeyKey.
    _JKEY_TIME_FORMAT = "%m/%d/%Y %H:%M"

    @classmethod
    def _journey_matches(cls, jkey, flight):
        """Return True when *jkey* contains the flight's airports and both times."""
        return (flight["from"] in jkey and
                flight["to"] in jkey and
                flight["departure"].strftime(cls._JKEY_TIME_FORMAT) in jkey and
                flight["arrival"].strftime(cls._JKEY_TIME_FORMAT) in jkey)

    def book_flight(self):
        """Search, select, fill in passengers and pay for the requested flights.

        On success the reservation code is reported via output_res_number().
        Failures are reported via output_error(), which exits the process:
        FLIGHT_NOT_FOUND when a requested flight is not offered,
        PAYMENT_FAILED when the payment is not confirmed, and
        GETTING_RES_CODE_FAILED when something breaks after the site has
        confirmed the payment.
        """
        (passengers, childs, baggage, card, cvc, expiration, max_price, email,
         phone, holder, departure_flight, return_flight, g, childs_b_days) = self.prepare_input()
        domestic = ["GMP", "KUV", "CJU", "CJJ", "ICN"]  # domestic airports (SOUTH KOREA)

        # get session
        # g.setup(timeout=150)
        g.setup(hammer_mode=True, hammer_timeouts=((200, 200),))
        g.go("http://www.eastarjet.com/book/index.htm")

        # --- search -------------------------------------------------------
        post_data = {}
        if departure_flight["from"] in domestic and departure_flight["to"] in domestic:
            cd_station = "DOM"
            post_data.update({
                "cd_fromcountry": "KR",
                "cd_tocountry": "KR",
            })
        else:
            cd_station = "INT"
        post_data.update({
            "method": "quickStep",
            "cd_station": cd_station,
            "cd_return": 0,
            "cd_fromline": departure_flight["from"],
            "nm_fromline": '',
            "cd_toline": departure_flight["to"],
            "nm_toline": '',
            "dt_from": departure_flight["departure"].strftime("%Y-%m-%d"),
            "no_person_m": len(passengers) - len(childs),
            "no_person_p": len(childs),
            "no_person_b": 0,
        })
        if return_flight:
            post_data.update({
                "cd_return": 1,
                "dt_to": return_flight["departure"].strftime("%Y-%m-%d"),
            })
        pp(post_data)
        g.setup(post=post_data)
        g.go("http://www.eastarjet.com/book/book.htm")
        # self.save_file(filename="ZE_search.html", body=g.response.body)

        ajax_data = {
            "method": "availability",
            "dt_date": post_data["dt_from"],
            "is_departure": "true",
            "fromline": post_data["cd_fromline"],
            "toline": post_data["cd_toline"],
            "nmfromline": "",
            "nmtoline": "",
            "dt_from": post_data["dt_from"],
            "cd_station": post_data["cd_station"],
        }
        if return_flight:
            ajax_data.update({
                "dt_to": return_flight["departure"].strftime("%Y-%m-%d"),
            })
        # pp(ajax_data)
        g.setup(post=ajax_data)
        g.go("http://www.eastarjet.com/book/bookAjax.ajax")
        # print g.response.body

        # --- pick the departure flight -------------------------------------
        root = etree.XML(g.response.body)
        fl_found = False
        for item in root.findall('.//list'):
            jkey = item.find("journeyKey").text
            if self._journey_matches(jkey, departure_flight):
                post_data = {
                    "method": "bookingStep2",
                    "dt_fromstd": item.find('./std').text,
                    "dt_fromsta": item.find('./sta').text,
                    "nm_fromfn": item.find('./flightNumber').text,
                    "nm_fromjkey": jkey,
                    "nm_fromfkey": item.find("./secondaryLowFare/fareSellKey").text,
                    "nm_fromFareName": "secondaryLow",
                    "dt_from": departure_flight["departure"].strftime("%Y-%m-%d"),
                    "nm_fromfare": item.find("./secondaryLowFare/amountView").text,
                    "dt_tostd": '',
                    "dt_tosta": '',
                    "nm_tofn": '',
                    "nm_tojkey": '',
                    "nm_tofkey": '',
                    "nm_toFareName": 'secondaryLow',
                }
                currency = item.find("./secondaryLowFare/currencyCode").text
                fl_found = True
        if not fl_found:
            self.output_error(msg=FLIGHT_NOT_FOUND, data="Departure flight not found")

        # --- pick the return flight ----------------------------------------
        if return_flight:
            fl_found = False
            ajax_data.update({
                "dt_date": return_flight["departure"].strftime("%Y-%m-%d"),
                "is_departure": "false",
            })
            pp(ajax_data)
            g.setup(post=ajax_data)
            g.go("http://www.eastarjet.com/book/bookAjax.ajax")
            root = etree.XML(g.response.body)
            for item in root.findall('.//list'):
                jkey = item.find("journeyKey").text
                print(jkey)
                if self._journey_matches(jkey, return_flight):
                    post_data.update({
                        "dt_tostd": item.find('./std').text,
                        "dt_tosta": item.find('./sta').text,
                        "nm_tofn": item.find('./flightNumber').text,
                        "dt_to": return_flight["departure"].strftime("%Y-%m-%d"),
                        "nm_tofare": item.find("./secondaryLowFare/amountView").text,
                        "nm_tojkey": jkey,
                        "nm_tofkey": item.find("./secondaryLowFare/fareSellKey").text,
                        "nm_toFareName": "secondaryLow",
                    })
                    fl_found = True
            if not fl_found:
                self.output_error(msg=FLIGHT_NOT_FOUND, data="Return flight not found")

        pp(post_data)
        g.setup(post=post_data)
        g.go("http://www.eastarjet.com/book/bookAjax.ajax")
        # print g.response.body
        g.go("http://www.eastarjet.com/book/book.htm?method=bookingStep3")
        # g.go("https://www.eastarjet.com/book/book.htm?method=bookingStep3")
        # self.save_file(filename="ZE_select_after.html", body=g.response.body)

        # --- contact and passenger details ---------------------------------
        post_data = [
            ("method", "bookingStep3"),
            ("nm_lastname", "Dlouhy"),
            ("nm_firstname", "Oliver"),
            ("cd_gender", 0),
            ("nm_customernumber", ""),
            ("nm_jumin", ""),
            ("nm_phone1", "+420" + phone[0]),  # todo check this
            ("nm_phone2", phone[1:5]),
            ("nm_phone3", phone[5:9]),
            ("nm_mailid", email.split("@")[0]),
            ("nm_maildomain", email.split("@")[1]),
            ("se_maildomain", ""),
            ("ck_phone", "on"),
            ("emergency_nm_phone1", "+420" + phone[0]),
            ("emergency_nm_phone2", phone[1:5]),
            ("emergency_nm_phone3", phone[5:9]),
        ]
        for p in passengers:
            is_child = p in childs
            pax_type = "CHD" if is_child else "ADT"
            post_data.extend([
                ("passenger_nm_paxtype", pax_type),
                ("passenger_nm_customernumber", ""),
                ("passenger_nm_lastname", p["familyname"]),
                ("passenger_nm_firstname", p["firstname"]),
                ("passenger_cd_gender", 1 if p["title"] == "Ms" else 0),
                ("passenger_nm_jumin", ""),
                ("passenger_nm_birthday", p["birthday"].strftime("%Y%m%d")),
                ("passenger_cd_paytype", pax_type),
                ("passenger_nm_paytype", "소아" if is_child else "성인"),
            ])
            # passport fields are sent for non-domestic routes only
            if cd_station == "INT":
                post_data.extend([
                    ("passport_country", ""),
                    ("passport_nationality", ""),
                    ("passport_docNo", p["cardno"]),
                    ("passport_expDate", p["expiration"].replace("-", "")),
                    ("passport_issued", p["nationality"]),
                ])
        pp(post_data)
        g.setup(post=post_data)
        g.go("https://www.eastarjet.com/book/bookAjax.ajax")
        g.go("https://www.eastarjet.com/book/book.htm?method=bookingStep4")
        # self.save_file(filename="ZE_payment.html", body=g.response.body)

        # --- payment ---------------------------------------------------------
        form_data = g.form_fields()
        #check price
        self.check_price(form_data['no_amount'], currency)
        if "cardbrand" in form_data:
            form_data.update({"cardbrand": "MC"})
        form_data.update({
            "method": "validatePayment",
            "departureStation": "",
            "ArrivalStation": "",
            "cd_paymenttype": "200",  # Credit card - MC
            "nm_accountnumber1": card[0:4],
            "nm_accountnumber2": card[4:8],
            "nm_accountnumber3": card[8:12],
            "nm_accountnumber4": card[12:16],  #2632136217836 ->> 7836 findindex(card)
            # month without leading zero; str(month) avoids the
            # platform-specific "%-m" strftime directive
            "cd_expiremonth": str(expiration.month),
            "cd_expireyear": expiration.strftime("%Y"),
            "nm_cardholdername": holder,
            "cd_bill": "Y",
        })

        # Set once the site confirms the payment, so that a later failure is
        # not reported as "payment failed" for a card that was charged.
        payment_completed = False
        try:
            pp(form_data)
            g.setup(post=form_data)
            g.setup(hammer_mode=True, hammer_timeouts=((300, 300),))
            g.go('https://www.eastarjet.com/book/bookAjax.ajax')
            print(g.response.body)
            self.save_file(filename="EastarJet_ZE_0.html", body=g.response.body)
            post_data = {
                "method": "bookingStep4",
                "cd_paymenttype": 200,
            }
            g.setup(post=post_data)
            g.go("http://www.eastarjet.com/book/bookAjax.ajax")
            print(g.response.body)
            if "Payment is completed." not in g.response.body:
                self.output_error(msg=PAYMENT_FAILED)
            payment_completed = True
            # reservation code: first run of six upper-case letters/digits
            pnr = re.findall(r'[A-Z0-9]{6}', g.response.body)[0]
            # payment process ...
            self.save_file(filename="EastarJet_ZE_1.html", body=g.response.body)
            g.go("https://www.eastarjet.com/book/book.htm?method=bookingStep5")
            # self.save_file(filename="EastarJet_ZE.html", body=g.response.body)
            self.save_file(filename="EastarJet_ZE_2.html", body=g.response.body)
            # res number
            self.output_res_number(pnr)
        except Exception:
            self.save_file(filename="EastarJet_ZE_payment_err.html", body=g.response.body)
            # format_exc() takes a traceback depth limit, not the exception
            print(traceback.format_exc())
            if payment_completed:
                self.output_error(msg=GETTING_RES_CODE_FAILED)
            self.output_error(msg=PAYMENT_FAILED)


if __name__ == '__main__':
    airline = EastarJet()
    airline.parse_args()
    airline.book_flight()
{
"passengers":[
{
"bags":0,
"firstname":"viktoria",
"title":"Ms",
"cardno":"UA3323123",
"familyname":"stanova",
"doctype":"P",
"birthday":"1990-07-20",
"expiration":"2018-05-08",
"nationality":"SK",
"visa":""
}
],
"CVV":"666",
"maxprice":"10000",
"card_type":"MC",
"phone":"777652838",
"flights":[
{
"arrival":"2015-09-12 12:50",
"to":"CEB",
"from":"ILO",
"id":230136054,
"departure":"2015-09-12 12:10"
}
],
"cardnumber":"5164652232068386",
"airline":"5J",
"exp":"12\/19",
"login":"booking@skypicker.com",
"password":"tramtararatata",
"email":"booking@skypicker.com",
"nameoncard":"skypicker skypicker"
}
# -*- coding: utf-8 -*-
import sys
#sys.path.append('/srv/scrapers')
#sys.path.append('/srv/scrapers/simple')
import re
import argparse
import logging
import csv
import requests
import json
import lxml.html
import pycurl
import pytz
import socket
import traceback
import ujson
from pytz import country_timezones
from random import choice
from decimal import Decimal
from itertools import islice
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from requests import Session
from random import choice
from collections import OrderedDict
from grab import Grab
from pprint import pprint as pp
class sGrabError(Exception):
    """Error raised by sGrab when a monitored request fails a check.

    :param msg: human-readable description of the failure.
    :param action_name: name of the request; passed to the error log.
    :param e: optional underlying exception; when set, ``str()`` of this
        error is ``str(e)`` instead of *msg*.
    """

    def __init__(self, msg, action_name, e=None):
        super(sGrabError, self).__init__(msg)
        self.msg = msg
        # Store the cause before logging: the logging handler below used to
        # rebind ``e`` and overwrite it.
        self.e = e
        try:
            _grab_log_error(msg, action_name)
        except Exception:
            # Logging is best-effort; never let it hide the real error.
            print(traceback.format_exc())
            print("not logged")
        if e:
            # Prints the exception currently being handled, if any.
            print(traceback.format_exc())

    def __str__(self):
        # __str__ must return a string, not the exception object itself.
        if self.e:
            return str(self.e)
        return self.msg
class sGrab(Grab):
    """Grab client that prints, checks and optionally saves each request.

    Extra keyword options, accepted by the constructor, :meth:`setup`,
    :meth:`go` and :meth:`submit` (values given to go/submit apply to that
    call only):

    action_name - Name request for easier debug.
    check - Check conditions ("expected_code", "expected_body_len", "expected_url")
    print_out - Print the request and response info.
    save_html - Save the pages before/after a request that failed.
    """

    #sGrab optional
    _expected_code = 200
    _expected_body_len = 0
    _expected_url = ""
    _action_name = "booking_process"
    _print_out = True
    _check = False
    _save_html = False
    _post = None
    #Grab original
    make_request = True
    #list of params (if Python 3.. not needed)
    additional_params = ["expected_code", "expected_body_len", "expected_url", "action_name", "check", "print_out", "save_html"]

    def __init__(self, *args, **kwargs):
        self.set_additional(kwargs)
        original_kwargs = self.delete_additional_params(kwargs)
        Grab.__init__(self, *args, **original_kwargs)

    def monitor(self, func, *args, **kwargs):
        """Call *func* (the real ``go``/``submit``) with logging and checks.

        sGrab-only options in *kwargs* override the instance settings for
        this call and are restored afterwards, whether or not the call
        succeeds.

        :raises sGrabError: when ``check`` is on and the response does not
            meet the expected code, URL or body length.
        """
        self.save_attributes()
        self.set_additional(kwargs)
        original_kwargs = self.delete_additional_params(kwargs)
        pre_body = None
        try:
            pre_body = self.response.body if self.response else None # set pre body
            if self.make_request and self._print_out:
                pre_url = self.config['url']
                if len(args) == 1:
                    pre_url = args[0]
                print("%s : from : %s" % (self._action_name, pre_url))
            func(*args, **original_kwargs) #call real go or submit
            if self.make_request and self._print_out:
                print("%s : %s : to : %s\n" % (self._action_name, self.response.code, self.response.url))
            if self._check:
                if str(self._expected_code) not in self.response.status:
                    raise sGrabError("Another http error code, expected is %s but get %s" % (self._expected_code, self.response.status), self._action_name)
                if self._expected_url not in self.response.url:
                    raise sGrabError("Expecting redirection to %s but redirected to %s" % (self._expected_url, self.response.url), self._action_name)
                if self._expected_body_len > len(self.response.body):
                    # report the actual size, not the body itself
                    raise sGrabError("Too small response body (%s bytes) expected more than %s bytes" % (len(self.response.body), self._expected_body_len), self._action_name)
        except Exception:
            if self._post:
                pp(self._post)
            if self._save_html:
                if pre_body:
                    self._grab_save_file(filename="airline-before-%s.html" % self._action_name, body=pre_body)
                # the request may have failed before any response existed
                if self.response:
                    self._grab_save_file(filename="airline-after-%s.html" % self._action_name, body=self.response.body)
            # re-raise the original exception with its traceback
            raise
        finally:
            # per-call overrides must not leak into later requests
            self.restore_attributes()

    def setup(self, *args, **kwargs):
        """Like ``Grab.setup`` but also accepts the sGrab-only options; remembers ``post``."""
        if kwargs:
            if kwargs.get("post", None):
                self._post = kwargs.get("post")
            self.set_additional(kwargs)
        original_kwargs = self.delete_additional_params(kwargs)
        super(sGrab, self).setup(*args, **original_kwargs)

    def submit(self, *args, **kwargs):
        self.monitor(super(sGrab, self).submit, *args, **kwargs)

    def go(self, *args, **kwargs):
        self.monitor(super(sGrab, self).go, *args, **kwargs)

    def set_additional(self, kwargs):
        """Copy the sGrab-only options present in *kwargs* onto the instance."""
        #sGrab optional
        for param in self.additional_params:
            attr = "_" + param
            setattr(self, attr, kwargs.get(param, getattr(self, attr)))
        #original from Grab
        self.make_request = kwargs.get("make_request", self.make_request)

    def save_attributes(self):
        """Back up the current sGrab-only option values."""
        for param in self.additional_params:
            setattr(self, "_back_up_" + param, getattr(self, "_" + param))

    def restore_attributes(self):
        """Restore the option values stored by :meth:`save_attributes`."""
        for param in self.additional_params:
            setattr(self, "_" + param, getattr(self, "_back_up_" + param))

    def delete_additional_params(self, kwargs):
        """Return *kwargs* without the sGrab-only options."""
        return dict((key, val) for key, val in kwargs.items()
                    if key not in self.additional_params)

    def _grab_save_file(self, filename="test.html", path="/srv/Scrapers/booking/airlines/html/", body=""):
        """Save html page to file, with timestamp in filename"""
        tz = pytz.timezone(country_timezones("CZ")[0])
        final_name = "%s%s_%s" % (path, filename, datetime.now(tz).strftime("%H-%M_%d-%m-%Y"))
        final_name = final_name.replace(".html", "")
        final_name += ".html"
        print("saving page as %s ..." % final_name)
        print("url %s/last_page_of_booking/%s" % ("www3.skypicker.com:12555", final_name.split("/")[-1]))
        with open(final_name, "wb+") as f:
            f.write(body)
def _grab_log_error(msg, action_name):
    """Append a ``<timestamp>:<action_name>: <msg>`` line to /var/log/s_grab.log.

    The timestamp uses the first timezone pytz lists for country code "CZ".
    """
    log_path = "/var/log/s_grab.log"
    prague = pytz.timezone(country_timezones("CZ")[0])
    with open(log_path, 'a') as log:
        stamp = datetime.now(prague).strftime("%d-%m-%Y %H:%M")
        log.write("%s:%s: %s\n" % (stamp, action_name, msg))

sGrab

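  • expected_code - expected HTTP status code; the check fails if expected_code is not in self.response.status
  • expected_body_len - minimum response body length; the check fails if expected_body_len > len(self.response.body)
  • expected_url - expected part of the final URL; the check fails if expected_url is not in self.response.url
  • action_name - name of the request, printed in the output for easier debugging
  • check - turns on checking of all conditions (expected_code, expected_body_len, expected_url)
  • print_out - print out request and response info
  • save_html - save the "before" and "after" HTML pages when a request fails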

Examples

create object

g = sGrab(expected_code=200, print_out=True, save_html=True)

set up

g.setup(expected_code=200, print_out=True, save_html=True)

turn on check

g.setup(check=True)

requests

g.go("www.ryanair.com/Search", action_name="search", expected_url="Selection")
g.go("www.ryanair.com/not_found", expected_code=404)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment