Created
August 24, 2018 19:35
-
-
Save gaurav-gogia/056e8fd8696e95e9d675541961d3e255 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from sklearn.cluster import DBSCAN | |
from sklearn.preprocessing import StandardScaler | |
def test_clusters(data_series, eps_val, swap_index, swap_value):
    """Re-cluster the fares with one value replaced and plot the result.

    A copy of ``data_series`` has the element at ``swap_index`` overwritten
    with ``swap_value``. Each fare is then paired with its row position,
    both columns are standardised, DBSCAN is fitted, and every cluster is
    drawn in its own colour. The figure title reports the number of
    clusters and ``plt.show()`` is called before returning.

    Args:
        data_series: Sequence of fares. It is copied; the caller's object
            is left unmodified.
        eps_val: Value passed to ``DBSCAN`` as ``eps``.
        swap_index: Position of the fare to overwrite.
        swap_value: Fare written at ``swap_index``.

    Raises:
        IndexError: If ``swap_index`` is out of range for ``data_series``.
    """
    # Work on a copy so the hypothetical fare never leaks into the caller's
    # data (the original assigned straight into the argument).
    trial_fares = list(data_series)
    trial_fares[swap_index] = swap_value

    # reset_index() turns the row position into a second feature column, so
    # clustering happens on (position, fare) pairs.
    ff = pd.DataFrame(trial_fares, columns=['fare']).reset_index()
    x = StandardScaler().fit_transform(ff)

    # min_samples=1 means every point belongs to some cluster.
    labels = DBSCAN(eps=eps_val, min_samples=1).fit(x).labels_
    unique_labels = set(labels)
    colors = plt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels)))

    plt.subplots(figsize=(12, 8))
    for k, c in zip(unique_labels, colors):
        members = x[labels == k]
        plt.plot(members[:, 0], members[:, 1], 'o', markerfacecolor=c,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(len(unique_labels)),
              fontsize=14, y=1.01)
    plt.show()
# Google Flights search URL; the fragment names AMD/DEL, two 2018-09 dates
# and INR (presumably a round trip priced in rupees -- confirm if reused).
url = "https://www.google.co.in/flights#flt=AMD.DEL.2018-09-01*DEL.AMD.2018-09-05;c:INR;e:1;sd:1;t:f"
chrome_driver = "chromedriver"

# XPath templates for the price element of one result row; `{}` is filled
# with the row's position in the list.
# NOTE(review): x1 and x2 point at two different <ol> lists on the page
# (presumably the highlighted flights and the remaining flights) -- these
# absolute paths are tied to the page layout at the time of writing.
x1 = '//*[@id="flt-app"]/div[2]/main[2]/div[9]/div[1]/div[2]/div[5]/div[2]/ol/li[{}]/div/div[1]/div[2]/div[1]/div[1]/div[6]/div[1]/jsl[1]/jsl'
x2 = '//*[@id="flt-app"]/div[2]/main[2]/div[9]/div[1]/div[2]/div[5]/div[4]/div[1]/ol/li[{}]/div/div[1]/div[2]/div[1]/div[1]/div[6]/div[1]/jsl[1]/jsl'
# Class name of the element that is clicked below before prices are read.
more_flights = 'gws-flights-results__dominated-link'

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")
# NOTE(review): Chrome documents this switch as "width,height"; verify that
# the "1920x1080" spelling is actually honoured before relying on the size.
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(chrome_options=chrome_options,
                          executable_path=chrome_driver)
driver.get(url)

os.system('clear')
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('Loading . . . .')
# Fixed pause to give the page time to render before looking up elements.
time.sleep(5)

a = driver.find_element_by_class_name(more_flights)
a.click()

os.system('clear')
print('Getting Prices . . . .')
def get_prices(lim, x):
    """Return the fares shown in result rows 1 through ``lim`` as integers.

    Args:
        lim: Number of rows to read; rows are numbered starting at 1.
        x: XPath template containing one ``{}`` placeholder for the row
            number.

    Returns:
        A list of ``int`` fares in row order.

    The module-level ``driver`` is used for the element lookups.
    """
    def _fare_at(position):
        shown = driver.find_element_by_xpath(x.format(position)).text
        # Drop the first two characters (presumably the currency prefix --
        # confirm against the page) and the thousands separators.
        return int(shown[2:].replace(',', ''))

    return [_fare_at(position) for position in range(1, lim + 1)]
# Read 5 rows through the x1 template and 60 through x2. The browser is
# shut down in `finally` so the headless Chrome process is not left running,
# whether scraping succeeds or raises.
try:
    prices = get_prices(5, x1)
    prices.extend(get_prices(60, x2))
finally:
    driver.quit()
os.system('clear')

# Raw view: fare against its position in the scraped list.
fares = pd.DataFrame(prices, columns=['price'])
fig, ax = plt.subplots(figsize=(10, 6))
plt.scatter(np.arange(len(fares['price'])), fares['price'])

# Cluster on (row position, fare) pairs after standardising both columns.
px = fares['price'].tolist()
ff = pd.DataFrame(px, columns=['fare']).reset_index()
x = StandardScaler().fit_transform(ff)
# min_samples=1 means every point is assigned to some cluster.
db = DBSCAN(eps=.5, min_samples=1).fit(x)
labels = db.labels_
unique_labels = set(labels)
clusters = len(unique_labels)
colors = plt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels)))

# One colour per cluster.
plt.subplots(figsize=(12, 8))
for k, c in zip(unique_labels, colors):
    class_member_mask = (labels == k)
    xy = x[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
             markeredgecolor='k', markersize=14)
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)

# Re-cluster with the fare at position 10 replaced by 7000 and eps=1; this
# call also runs plt.show(), which displays the figures created above.
test_clusters(px, 1, 10, 7000)

# Attach each fare's cluster label (from the eps=.5 fit above).
pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1)
# aggregate by cluster
rf = pf.groupby('cluster')['fare'].agg(['min', 'count'])
# The summary used to be computed and thrown away; print it so it is seen.
print(rf.describe([.10, .25, .5, .75, .9]))
print(rf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment