Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created August 1, 2014 08:42
Show Gist options
  • Save chengjun/ea7f7e16d4628f3b58d3 to your computer and use it in GitHub Desktop.
'''
# Step2: split the duplicated data into about 2000+ files by user ids
# to prepare for deleting the duplicated ties
'''
from collections import defaultdict
# Root directory holding the raw Renren friendship data.
path = "D:/renren/"
# Raw edge list: one "from<TAB>to<TAB>time" record per line.
# NOTE(review): opened at module level and never closed — fine for a one-shot
# script, but a context manager would be safer; the chunked driver below
# depends on this handle staying open.
bigfile = open(path + "friends_all.txt")
# Byte-size hint passed to readlines() so each chunk is ~100 KB of whole lines.
chunkSize = 100000
def splitData(f, base_dir=None):
    """Bucket edge records by user id and append them to per-bucket files.

    Each line of *f* must be "from<TAB>to<TAB>time".  The two ids are
    sorted so that (a, b) and (b, a) produce the same record and land in
    the same bucket — the prerequisite for the later duplicate removal.
    Records are buffered per output file and flushed in one pass.

    Parameters:
        f: iterable of str — lines of the raw edge file (e.g. one chunk
           from ``bigfile.readlines(chunkSize)``).
        base_dir: str, optional — directory receiving the bucket files.
           Defaults to the module-level ``path + "friends_sorted3/"``,
           preserving the original behavior.

    Output files are named ``str(min_id // 10000)``: 10000 users per file.
    """
    import os
    if base_dir is None:
        base_dir = path + "friends_sorted3/"
    buckets = defaultdict(list)  # output-file path -> list of records
    n = 0
    for line in f:
        From, To, time = line.strip().split('\t')
        n += 1
        if n % 1000 == 0:
            # Lightweight progress indicator (prints the raw string fields).
            print(n, From, To, time)
        # Canonical order: smaller id first, so duplicates collapse later.
        From, To = sorted([int(From), int(To)])
        record = str(From) + '\t' + str(To) + '\t' + time
        # // is floor division: bucket index is min_id // 10000 (the
        # original Python 2 code relied on integer '/').
        file_save = os.path.join(base_dir, str(From // 10000))
        buckets[file_save].append(record)
    for file_save in buckets:
        print(file_save)
        # Append so successive chunks accumulate into the same bucket file.
        with open(file_save, 'a') as p:
            for record in buckets[file_save]:
                p.write(record + "\n")
# Stream the big edge file in ~chunkSize-byte batches of whole lines so the
# entire file never has to be held in memory at once.
while True:
    chunk = bigfile.readlines(chunkSize)
    if not chunk:
        break
    splitData(chunk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment