Skip to content

Instantly share code, notes, and snippets.

@chribsen
Created March 21, 2016 13:32
Show Gist options
  • Save chribsen/e373c83a081f6ae8b3f4 to your computer and use it in GitHub Desktop.
Save chribsen/e373c83a081f6ae8b3f4 to your computer and use it in GitHub Desktop.
Computes spatial clusters using haversine distance and plots the output on a map. Meant for use on RF data set. Parameters haven't been tuned yet.
from __future__ import division
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from data import points
import psycopg2
import matplotlib.pyplot as plt
import sys
from collections import defaultdict
import math, json
import folium
EARTH_CIRCUMFERENCE = 6378137 # earth circumference in meters
colors = ['green', 'red', 'yellow', 'blue', 'black', 'white', 'gray', 'pink', 'cloud']
# Distance formula has been obtained from this thread:
# http://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
from math import radians, cos, sin, asin, sqrt
def haversine(lat_lon1, lat_lon2):
"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees)
"""
# Return huge error if the tuple contains more than 2 values
# This is necessary due to an error in the output
try:
# convert decimal degrees to radians
lat1, lon1 = tuple(lat_lon1)
lat2, lon2 = tuple(lat_lon2)
except ValueError as e:
print('failed...')
return 999 # return a huge distance, so it's not considered as a part of the cluster
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# This thread
#http://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points
R = 6371 #radius of the earth in km
x = (lon2 - lon1) * cos( 0.5*(lat2+lat1) )
y = lat2 - lat1
d = R * sqrt( x*x + y*y )
return d
conn = psycopg2.connect(database="thesis_v2", user="postgres", password="password", host="localhost")
cur = conn.cursor()
cur.execute("SELECT id FROM users")
user_ids = cur.fetchall()
for x, user_id in enumerate(user_ids):
cur.execute("SELECT ST_X(lat_lon::geometry),ST_Y(lat_lon::geometry), id FROM tracker WHERE user_id=%s", (user_id,))
resultset = tuple(cur.fetchall())
if len(resultset) < 400:
continue
X = np.array([list(x[:-1]) for x in resultset])
##############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.02, min_samples=30, metric=haversine).fit(X)
labels = db.labels_
res_dict = defaultdict(lambda: [])
for label, coordinates in zip(labels, X):
if label == -1:
continue
res_dict[str(label)].append(tuple(coordinates))
# Plot it on a map using folium
map_osm = folium.Map(location=[55.622534,12.080729], zoom_start=12)
for i, (k, v) in enumerate(res_dict.items()):
for coord in v:
map_osm.simple_marker(coord,marker_color=colors[int(i)])
avg = np.mean(v, axis=0)
print(avg)
map_osm.circle_marker(location=list(avg), radius=20
,line_color='#3186cc',
fill_color=colors[int(i)])
map_osm.create_map(path=str(user_id) + '.html')
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment