Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created August 2, 2014 05:07
Show Gist options
  • Save chengjun/caeff350a0d73ba80760 to your computer and use it in GitHub Desktop.
Save chengjun/caeff350a0d73ba80760 to your computer and use it in GitHub Desktop.
import sys
from collections import defaultdict, Counter
import glob
# Python 2 only: reloading ``sys`` brings back ``setdefaultencoding`` so that
# implicit str/unicode conversions use UTF-8.  Python 3 has no builtin
# ``reload`` (NameError), and the hack is simply skipped there.
try:
    reload(sys)
    sys.setdefaultencoding('utf8')
except NameError:
    pass

path = "D:/chengjun/renren/"
userUniversity = path + "user_university.txt"

# E: user -> list of universities, one entry per input row for that user.
# F: university -> number of input rows naming it.
E = defaultdict(list)
F = defaultdict(int)
errorNum = 0
with open(userUniversity, 'r') as f:
    for line in f:
        try:
            user, university = line.strip().split('\t')
        except ValueError:
            # The row does not have exactly two tab-separated fields:
            # echo it and count it, then carry on with the next row.
            errorNum += 1
            print(line)
            continue
        E[user].append(university)
        F[university] += 1
print(errorNum)

# Names of the (at most) 100 universities with the highest row counts,
# most frequent first.
d = Counter(F)
top100 = [k for k, v in d.most_common(100)]

# Every entry under friends_unique/ is treated as one input file.
ads = glob.glob(path + "friends_unique/" + "*")
# ``ad`` is rebound by the driver loop at the end of the script; the guard
# keeps an empty directory from raising IndexError here.
ad = ads[-1] if ads else None
def universityFlow(ad, user_universities=None, top_universities=None,
                   out_file=None):
    """Append university-to-university records for one friendship file.

    Each line of ``ad`` is expected to hold three tab-separated fields:
    user, friend and time.  User and friend are each mapped to the first
    university recorded for them; when both universities belong to the top
    list, the line ``university1<TAB>university2<TAB>time`` is appended to
    the output file.  Lines with a different number of fields, and lines
    naming a user or friend without a recorded university, are skipped.

    Args:
        ad: Path of the friendship file to read.
        user_universities: Mapping of user -> list of universities.
            Defaults to the module-level ``E``.
        top_universities: Iterable of university names to keep.
            Defaults to the module-level ``top100``.
        out_file: Path of the file to append to.  Defaults to
            ``path + 'friends_univeristy_top100.txt'``.

    Returns:
        None.  The output file is opened in append mode even when no record
        qualifies, so it is created if it does not exist yet.
    """
    if user_universities is None:
        user_universities = E
    if top_universities is None:
        top_universities = top100
    if out_file is None:
        out_file = path + 'friends_univeristy_top100.txt'

    # Build the lookup set once so each membership test is O(1).
    wanted = set(top_universities)

    records = []
    with open(ad, 'r') as f:
        for line in f:
            try:
                user, friend, timestamp = line.strip().split('\t')
            except ValueError:
                # Not exactly three tab-separated fields: skip the line.
                continue
            # .get() instead of [] so that looking up an unknown user does
            # not insert an empty list into a defaultdict.
            user_unis = user_universities.get(user)
            friend_unis = user_universities.get(friend)
            if not user_unis or not friend_unis:
                continue
            u1 = user_unis[0]
            u2 = friend_unis[0]
            if u1 in wanted and u2 in wanted:
                records.append(u1 + '\t' + u2 + '\t' + timestamp + '\n')

    with open(out_file, 'a') as uf:
        uf.writelines(records)
# Process every matched friendship file in turn, echoing each path first so
# progress is visible on the console.
for ad in ads:
    print(ad)
    universityFlow(ad)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment