Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created August 1, 2014 16:09
Show Gist options
  • Save chengjun/f1e82a3de164e1492b4d to your computer and use it in GitHub Desktop.
Save chengjun/f1e82a3de164e1492b4d to your computer and use it in GitHub Desktop.
'''
step3: delete duplicates, sort data and save data
'''
import os
import glob
from collections import defaultdict
# Directory holding the input files; the trailing slash is part of the value.
path = "D:/renren/friends_sorted/"
# Every entry directly inside ``path`` (the pattern is ``path`` followed by
# "*"; because ``path`` already ends with a separator, join adds none).
ads = glob.glob(os.path.join(path, "*"))
def saveData(file_name, trunk):
    """Append records to ``file_name`` as tab-separated lines.

    Args:
        file_name: Path of the output file. It is opened in append mode, so
            existing content is kept and repeated calls accumulate lines.
        trunk: Iterable of indexable records. The first three items of each
            record are joined with tab characters and written as one line.
            Item 0 is converted with ``str``; items 1 and 2 must already be
            strings.
    """
    with open(file_name, 'a') as g:
        # Hand all lines to the buffered writer in one call instead of
        # issuing a separate write per record.
        g.writelines(
            '\t'.join((str(record[0]), record[1], record[2])) + '\n'
            for record in trunk
        )
def sortData(i):
    """Read a tab-separated file and return its unique records, sorted.

    Each usable line has exactly three tab-separated fields
    ``user<TAB>friend<TAB>time`` where ``user`` parses as an integer.
    Lines that do not match (wrong field count, non-integer user, blank
    lines) are skipped.

    Args:
        i: Path of the input file.

    Returns:
        A list of ``[user, friend, time]`` lists with ``user`` as ``int``,
        free of duplicates and sorted by user, then friend, then time.
    """
    records = set()
    with open(i) as f:
        # Stream the file rather than loading every line at once.
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 3:
                continue
            user, friend, timestamp = fields
            try:
                user = int(user)
            except ValueError:
                # Only a non-integer user id is expected here; anything
                # else should surface instead of being silently dropped.
                continue
            # De-duplicate on the parsed record, not the raw line, so that
            # lines differing only in trailing whitespace/newline (e.g. a
            # final line without "\n") collapse into one record.
            records.add((user, friend, timestamp))
    # Sorting the full tuples keeps user as the primary key and makes the
    # order of records sharing a user deterministic.
    return [list(record) for record in sorted(records)]
def transformData(site, out_dir='D:/chengjun/renren/firends_unique/'):  # site = sites[0]
    """Run ``sortData`` on one input file and save the result.

    The output file gets the same base name as ``site`` and is placed under
    ``out_dir``; the records are handed to ``saveData``.

    Args:
        site: Path of the input file. Both ``/`` and ``\\`` are accepted as
            path separators.
        out_dir: Prefix prepended to the output file name; it must end with
            a path separator. Defaults to the originally hard-coded
            directory.
    """
    # NOTE(review): "firends" in the default directory looks like a typo for
    # "friends"; it is left unchanged because it is a runtime path - confirm
    # against the directory name on disk.

    # 1. sort data
    upt = sortData(site)
    # 2. save data
    # Normalise separators before taking the base name: glob on Windows
    # returns mixed "dir/sub\\name" paths, and indexing a split on "\\"
    # breaks when the path has zero or several backslashes.
    ids = os.path.basename(site.replace('\\', '/'))
    file_name = out_dir + ids
    saveData(file_name, upt)
# Process every input file in ``ads``, printing each path as a progress
# indicator before it is transformed.
for ad in ads:
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(ad)
    transformData(ad)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment