Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created August 1, 2014 08:42
Show Gist options
  • Save chengjun/ea7f7e16d4628f3b58d3 to your computer and use it in GitHub Desktop.
'''
# Step2: split the duplicated data into about 2000+ files by user ids
# to prepare for deleting the duplicated ties
'''
from collections import defaultdict
# Root directory holding the raw Renren friendship data.
path = "D:/renren/"
# Raw edge list: one "from<TAB>to<TAB>time" record per line.
# NOTE(review): opened at module level and never closed — fine for a one-shot
# script, but a context manager would be safer; the chunked driver below
# depends on this handle staying open.
bigfile = open(path + "friends_all.txt")
# Byte-size hint passed to readlines() so each chunk is ~100 KB of whole lines.
chunkSize = 100000
def splitData(f, base_dir=None):
    """Bucket edge records by user id and append them to per-bucket files.

    Each line of *f* must be "from<TAB>to<TAB>time".  The two ids are
    sorted so that (a, b) and (b, a) produce the same record and land in
    the same bucket — the prerequisite for the later duplicate removal.
    Records are buffered per output file and flushed in one pass.

    Parameters:
        f: iterable of str — lines of the raw edge file (e.g. one chunk
           from ``bigfile.readlines(chunkSize)``).
        base_dir: str, optional — directory receiving the bucket files.
           Defaults to the module-level ``path + "friends_sorted3/"``,
           preserving the original behavior.

    Output files are named ``str(min_id // 10000)``: 10000 users per file.
    """
    import os
    if base_dir is None:
        base_dir = path + "friends_sorted3/"
    buckets = defaultdict(list)  # output-file path -> list of records
    n = 0
    for line in f:
        From, To, time = line.strip().split('\t')
        n += 1
        if n % 1000 == 0:
            # Lightweight progress indicator (prints the raw string fields).
            print(n, From, To, time)
        # Canonical order: smaller id first, so duplicates collapse later.
        From, To = sorted([int(From), int(To)])
        record = str(From) + '\t' + str(To) + '\t' + time
        # // is floor division: bucket index is min_id // 10000 (the
        # original Python 2 code relied on integer '/').
        file_save = os.path.join(base_dir, str(From // 10000))
        buckets[file_save].append(record)
    for file_save in buckets:
        print(file_save)
        # Append so successive chunks accumulate into the same bucket file.
        with open(file_save, 'a') as p:
            for record in buckets[file_save]:
                p.write(record + "\n")
# Stream the big edge file in ~chunkSize-byte batches of whole lines so the
# entire file never has to be held in memory at once.
while True:
    chunk = bigfile.readlines(chunkSize)
    if not chunk:
        break
    splitData(chunk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment