Created
March 21, 2016 13:32
-
-
Save chribsen/e373c83a081f6ae8b3f4 to your computer and use it in GitHub Desktop.
Computes spatial clusters using haversine distance and plots the output on a map. Meant for use on RF data set. Parameters haven't been tuned yet.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import numpy as np | |
from sklearn.cluster import DBSCAN, KMeans | |
from sklearn.preprocessing import StandardScaler | |
from data import points | |
import psycopg2 | |
import matplotlib.pyplot as plt | |
import sys | |
from collections import defaultdict | |
import math, json | |
import folium | |
EARTH_CIRCUMFERENCE = 6378137 # earth circumference in meters | |
colors = ['green', 'red', 'yellow', 'blue', 'black', 'white', 'gray', 'pink', 'cloud'] | |
# Distance formula has been obtained from this thread: | |
# http://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points | |
from math import radians, cos, sin, asin, sqrt | |
def haversine(lat_lon1, lat_lon2): | |
""" | |
Calculate the great circle distance between two points | |
on the earth (specified in decimal degrees) | |
""" | |
# Return huge error if the tuple contains more than 2 values | |
# This is necessary due to an error in the output | |
try: | |
# convert decimal degrees to radians | |
lat1, lon1 = tuple(lat_lon1) | |
lat2, lon2 = tuple(lat_lon2) | |
except ValueError as e: | |
print('failed...') | |
return 999 # return a huge distance, so it's not considered as a part of the cluster | |
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) | |
# This thread | |
#http://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points | |
R = 6371 #radius of the earth in km | |
x = (lon2 - lon1) * cos( 0.5*(lat2+lat1) ) | |
y = lat2 - lat1 | |
d = R * sqrt( x*x + y*y ) | |
return d | |
conn = psycopg2.connect(database="thesis_v2", user="postgres", password="password", host="localhost") | |
cur = conn.cursor() | |
cur.execute("SELECT id FROM users") | |
user_ids = cur.fetchall() | |
for x, user_id in enumerate(user_ids): | |
cur.execute("SELECT ST_X(lat_lon::geometry),ST_Y(lat_lon::geometry), id FROM tracker WHERE user_id=%s", (user_id,)) | |
resultset = tuple(cur.fetchall()) | |
if len(resultset) < 400: | |
continue | |
X = np.array([list(x[:-1]) for x in resultset]) | |
############################################################################## | |
# Compute DBSCAN | |
db = DBSCAN(eps=0.02, min_samples=30, metric=haversine).fit(X) | |
labels = db.labels_ | |
res_dict = defaultdict(lambda: []) | |
for label, coordinates in zip(labels, X): | |
if label == -1: | |
continue | |
res_dict[str(label)].append(tuple(coordinates)) | |
# Plot it on a map using folium | |
map_osm = folium.Map(location=[55.622534,12.080729], zoom_start=12) | |
for i, (k, v) in enumerate(res_dict.items()): | |
for coord in v: | |
map_osm.simple_marker(coord,marker_color=colors[int(i)]) | |
avg = np.mean(v, axis=0) | |
print(avg) | |
map_osm.circle_marker(location=list(avg), radius=20 | |
,line_color='#3186cc', | |
fill_color=colors[int(i)]) | |
map_osm.create_map(path=str(user_id) + '.html') | |
# Number of clusters in labels, ignoring noise if present. | |
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) | |
print('Estimated number of clusters: %d' % n_clusters_) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment