Skip to content

Instantly share code, notes, and snippets.

@chribsen
Created March 27, 2016 12:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chribsen/99a4e1dadbb73ebd5631 to your computer and use it in GitHub Desktop.
Save chribsen/99a4e1dadbb73ebd5631 to your computer and use it in GitHub Desktop.
Computes the Jaccard similarity of the places visited by each user for all user pairs.
import psycopg2
from scipy.spatial.distance import jaccard
# psycopg2 connection string (DSN). The published snippet redacted the real
# value; replace this placeholder before running.
CONN_STRING = "<connstring>"

# Number of processed user pairs between intermediate commits.
COMMIT_INTERVAL = 500


def places_jaccard_similarity(places_a, places_b):
    """Return the Jaccard similarity of two collections of place ids.

    The similarity is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the
    sets of distinct ids in ``places_a`` and ``places_b``. Duplicates are
    ignored and the two inputs may have different lengths.

    Note: ``scipy.spatial.distance.jaccard`` is not suitable here. It
    compares two equal-length boolean vectors element-wise, not two sets
    of ids.

    Args:
        places_a: Iterable of hashable place ids for the first user.
        places_b: Iterable of hashable place ids for the second user.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when both inputs are empty.
    """
    set_a = set(places_a)
    set_b = set(places_b)
    union = set_a | set_b
    if not union:
        # Avoid dividing by zero; no places at all means nothing is shared.
        return 0.0
    # float() keeps this a true division on Python 2 as well.
    return float(len(set_a & set_b)) / len(union)


def main():
    """Compute and store the place-based Jaccard similarity per user pair.

    For every row of ``derived_friend_features`` where both users have at
    least one row in ``derived_places_visited``, this updates
    ``places_jac_similarity`` and inserts the shared place ids into
    ``tmp_places_visited_together``.
    """
    conn_dtu = psycopg2.connect(CONN_STRING)
    try:
        cur_dtu = conn_dtu.cursor()

        # Retrieve places visited for each user in the user pair and
        # aggregate them into an array. Pairs where either user has no
        # visited places (array_agg yields NULL) are skipped.
        cur_dtu.execute("""
            SELECT user_a,
                   (SELECT array_agg(place_id) FROM derived_places_visited
                     WHERE user_id = dff.user_a) AS places_a,
                   user_b,
                   (SELECT array_agg(place_id) FROM derived_places_visited
                     WHERE user_id = dff.user_b) AS places_b
              FROM derived_friend_features AS dff
             WHERE (SELECT array_agg(place_id) FROM derived_places_visited
                     WHERE user_id = dff.user_a) IS NOT NULL
               AND (SELECT array_agg(place_id) FROM derived_places_visited
                     WHERE user_id = dff.user_b) IS NOT NULL
        """)

        # fetchall() materialises the whole result before the loop starts,
        # so reusing the same cursor for the writes below is safe.
        pairs = cur_dtu.fetchall()

        # Iterate over all the user pairs and compute the Jaccard
        # similarity of the places that they have visited.
        for i, (user_a, places_a, user_b, places_b) in enumerate(pairs):
            jac_similarity = places_jaccard_similarity(places_a, places_b)

            # Update the feature.
            cur_dtu.execute(
                """UPDATE derived_friend_features SET places_jac_similarity=%s WHERE user_a=%s AND user_b=%s""",
                (jac_similarity, user_a, user_b),
            )

            # Insert the places visited together: just in case we need
            # this later.
            places_visited_together = list(set(places_a) & set(places_b))
            cur_dtu.execute(
                """INSERT INTO tmp_places_visited_together (user_a, user_b, place_ids) VALUES (%s,%s,%s)""",
                (user_a, user_b, places_visited_together),
            )

            # Commit periodically so a late failure does not lose all work.
            if i % COMMIT_INTERVAL == 0:
                print('Saving...')
                conn_dtu.commit()

        # Persist whatever is left since the last periodic commit.
        conn_dtu.commit()
    finally:
        conn_dtu.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment