Skip to content

Instantly share code, notes, and snippets.

@gaurav-gogia
Created August 24, 2018 19:35
Show Gist options
  • Save gaurav-gogia/056e8fd8696e95e9d675541961d3e255 to your computer and use it in GitHub Desktop.
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
def test_clusters(data_series, eps_val, swap_index, swap_value):
    """Plot the DBSCAN clustering of a fare series with one value overridden.

    A copy of ``data_series`` gets ``swap_value`` written at ``swap_index``.
    The (row position, fare) pairs are then standardised, clustered with
    DBSCAN, and drawn with one colour per cluster label. The figure title
    reports the number of distinct labels.

    Args:
        data_series: Indexable collection of fares. It is copied first, so
            the caller's object is left untouched.
        eps_val: Value passed to DBSCAN as ``eps``.
        swap_index: Index of the fare to override.
        swap_value: Fare stored at ``swap_index`` before clustering.
    """
    import copy

    # Work on a copy: assigning into the argument directly would silently
    # alter the caller's data as a side effect of plotting.
    fares = copy.copy(data_series)
    fares[swap_index] = swap_value

    # reset_index() adds the row position as a second feature column, so
    # clustering runs on (position, fare) pairs.
    ff = pd.DataFrame(fares, columns=['fare']).reset_index()
    x = StandardScaler().fit_transform(ff)
    labels = DBSCAN(eps=eps_val, min_samples=1).fit(x).labels_

    unique_labels = set(labels)
    colors = plt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels)))
    plt.subplots(figsize=(12, 8))
    for k, c in zip(unique_labels, colors):
        # Rows of the scaled matrix that belong to cluster k.
        xy = x[labels == k]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(len(unique_labels)),
              fontsize=14, y=1.01)
    plt.show()
# Google Flights search URL. The fragment encodes AMD -> DEL on 2018-09-01 and
# DEL -> AMD on 2018-09-05, with the currency code set to INR.
url = "https://www.google.co.in/flights#flt=AMD.DEL.2018-09-01*DEL.AMD.2018-09-05;c:INR;e:1;sd:1;t:f"
# Name/path of the chromedriver executable handed to selenium.
chrome_driver = "chromedriver"
# XPath templates for the price element of one result row; the `{}`
# placeholder is filled with the 1-based <li> position via str.format().
# x1 and x2 point at two different <ol> result lists on the page.
x1 = '//*[@id="flt-app"]/div[2]/main[2]/div[9]/div[1]/div[2]/div[5]/div[2]/ol/li[{}]/div/div[1]/div[2]/div[1]/div[1]/div[6]/div[1]/jsl[1]/jsl'
x2 = '//*[@id="flt-app"]/div[2]/main[2]/div[9]/div[1]/div[2]/div[5]/div[4]/div[1]/ol/li[{}]/div/div[1]/div[2]/div[1]/div[1]/div[6]/div[1]/jsl[1]/jsl'
# Class name of the element that is clicked once the page has loaded.
more_flights = 'gws-flights-results__dominated-link'

# Run Chrome without a visible window, at a fixed window size.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(chrome_options=chrome_options,
                          executable_path=chrome_driver)
driver.get(url)

os.system('clear')
# Single-argument print(...) produces the same output on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print('Loading . . . .')
# Fixed pause before looking up the link below.
# NOTE(review): presumably this gives the page time to render; confirm.
time.sleep(5)
a = driver.find_element_by_class_name(more_flights)
a.click()

os.system('clear')
print('Getting Prices . . . .')
def get_prices(lim, x):
    """Read the prices of result rows 1..``lim`` from the open page.

    Args:
        lim: Number of rows to read; positions run from 1 to ``lim``
            inclusive.
        x: XPath template with one ``{}`` placeholder for the row position.

    Returns:
        A list of integer prices, in row order.
    """
    def read_price(position):
        # Locate the row's price element through the module-level driver.
        node = driver.find_element_by_xpath(x.format(position))
        # Drop the first two characters of the text and the thousands
        # separators, leaving only what int() is expected to parse.
        return int(node.text[2:].replace(',', ''))

    return [read_price(position) for position in range(1, lim + 1)]
# Scrape 5 prices from the x1 list and 60 more from the x2 list. The browser
# is not used after this point, so it is shut down here even when a lookup
# fails; otherwise the headless Chrome process would be left running.
try:
    prices = get_prices(5, x1)
    prices.extend(get_prices(60, x2))
finally:
    driver.quit()
os.system('clear')

# Scatter plot of the raw prices against their position in the scraped list.
fares = pd.DataFrame(prices, columns=['price'])
fig, ax = plt.subplots(figsize=(10, 6))
plt.scatter(np.arange(len(fares['price'])), fares['price'])

# Cluster the standardised (position, fare) pairs with DBSCAN.
px = list(fares['price'])
ff = pd.DataFrame(px, columns=['fare']).reset_index()
x = StandardScaler().fit_transform(ff)
db = DBSCAN(eps=.5, min_samples=1).fit(x)
labels = db.labels_
unique_labels = set(labels)
clusters = len(unique_labels)

# Draw each cluster in its own colour on a new figure.
colors = plt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels)))
plt.subplots(figsize=(12, 8))
for k, c in zip(unique_labels, colors):
    class_member_mask = (labels == k)
    xy = x[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
             markeredgecolor='k', markersize=14)
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)

# Repeat the clustering plot with eps=1 and the fare at index 10 set to 7000.
test_clusters(px, 1, 10, 7000)

# Attach each fare's cluster label from the eps=.5 run above.
pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1)
# aggregate by cluster
rf = pf.groupby('cluster')['fare'].agg(['min', 'count'])
# The describe() result used to be discarded, so the percentile summary never
# reached the terminal; print it explicitly.
print(rf.describe([.10, .25, .5, .75, .9]))
print(rf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment