Created
November 1, 2017 22:38
-
-
Save jessienewell/af2eecd182e06f16d75ea9b6f22fad90 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from IPython.display import Image, display | |
import networkx as nx | |
import numpy as np | |
import time | |
import datetime | |
#get all users that I am directly following | |
api.getSelfUsersFollowing() | |
result = api.LastJson | |
follow_relationships = [] | |
for user in tqdm(result['users']): | |
followed_user_id = user['pk'] | |
followed_user_name = user['full_name'] | |
follow_relationships.append((user_id, followed_user_id, me, followed_user_name)) | |
df_local = pd.DataFrame(follow_relationships, columns=['src_id','dst_id', 'src_name', 'dst_name']) | |
all_user_ids_local = np.unique(df_local[['src_id', 'dst_id']].values.reshape(1,-1)) | |
#grab all my likes from the past year | |
last_year = datetime.datetime.now() - datetime.timedelta(days=365) | |
now = datetime.datetime.now() | |
last_result_time = now | |
all_likes = [] | |
max_id = 0 | |
while last_result_time > last_year: | |
api.getLikedMedia(maxid=max_id) | |
results = api.LastJson | |
[all_likes.append(item) for item in results['items']] | |
max_id = results['items'][-1]['pk'] | |
last_result_time = pd.to_datetime(results['items'][-1]['taken_at'], unit='s') | |
like_counts = pd.Series([i['user']['pk'] for i in all_likes]).value_counts() | |
#calculate number of times I've liked each users post | |
for i in tqdm(like_counts.index): | |
if i in df_local['dst_id'].values: # only count likes from people I follow (naive but simple) | |
ind = df_local[(df_local['src_id'] == user_id) & (df_local['dst_id'] == i)].index[0] | |
if like_counts[i] is not None: | |
df_local = df_local.set_value(ind,'weight',like_counts[i]) | |
ind = df_local[df_local['weight'].isnull()].index | |
df_local = df_local.set_value(ind,'weight',0.5) | |
#create social graph and calculate pagerank | |
G = nx.from_pandas_dataframe(df_local, 'src_id', 'dst_id') | |
#calculate personalized pagerank | |
perzonalization_dict = dict(zip(G.nodes(), [0]*len(G.nodes()))) | |
perzonalization_dict[user_id] = 1 | |
ppr = nx.pagerank(G, personalization=perzonalization_dict) | |
#this may take a while if you follow a lot of people | |
urls = [] | |
taken_at = [] | |
num_likes = [] | |
num_comments = [] | |
page_rank = [] | |
users = [] | |
weight = [] | |
for user_id in tqdm(all_user_ids_local): | |
api.getUserFeed(user_id) | |
result = api.LastJson | |
if 'items' in result.keys(): | |
for item in result['items']: | |
if 'image_versions2' in item.keys(): #only grabbing pictures (no videos or carousels) | |
# make sure we can grab keys before trying to append | |
url = item['image_versions2']['candidates'][1]['url'] | |
taken = item['taken_at'] | |
try: | |
likes = item['like_count'] | |
except KeyError: | |
likes = 0 | |
try: | |
comments = item['comment_count'] | |
except KeyError: | |
comments = 0 | |
pr = ppr[item['user']['pk']] | |
user = item['user']['full_name'] | |
if user != me: #don't count myself! | |
urls.append(url) | |
taken_at.append(taken) | |
num_likes.append(likes) | |
num_comments.append(comments) | |
page_rank.append(pr) | |
users.append(user) | |
weight.append(df_local[df_local['dst_name'] == user]['weight'].values[0]) | |
#now we can make a dataframe with all of that information | |
scores_df = pd.DataFrame( | |
{'urls': urls, | |
'taken_at': taken_at, | |
'num_likes': num_likes, | |
'num_comments': num_comments, | |
'page_rank': page_rank, | |
'users': users, | |
'weight': weight | |
}) | |
#don't care about anything older than 1 week | |
oldest_time = int((datetime.datetime.now() | |
- datetime.timedelta(weeks = 1)).strftime('%s')) | |
scores_df = scores_df[scores_df['taken_at'] > oldest_time] | |
# /1e5 to help out with some machine precision (numbers get real small otherwise) | |
scores_df['time_score'] = np.exp(-(int(time.time()) - scores_df['taken_at'])/1e5) | |
scores_df['total_score'] = (np.log10(scores_df['num_comments']+2) * np.log10(scores_df['num_likes']+1) | |
* scores_df['page_rank'] * scores_df['time_score'] | |
* np.log(scores_df['weight']+1)) | |
# calculate top ten highest rated posts | |
top_ten = scores_df['total_score'].nlargest(10) | |
top_rows = scores_df.loc[top_ten.index].values | |
top_personal_img = [] | |
top_graph_img = [] | |
#display the feed | |
for row in top_rows: | |
img = Image(row[4], format='jpeg') | |
top_graph_img.append(img) | |
display(img) | |
top_personal_img.append(img) | |
print('taken_at: %s' % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(row[3])) ) | |
print('number of likes: %s' % row[1]) | |
print('number of comments: %s' % row[0]) | |
print('page_rank: %s' % row[2]) | |
print(row[5]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment