Skip to content

Instantly share code, notes, and snippets.

@airbob
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save airbob/9246248 to your computer and use it in GitHub Desktop.
Save airbob/9246248 to your computer and use it in GitHub Desktop.
python read&processing&write big files

background

Below is the input file format (*.txt):

userID month date hour totalTW totalQs result
21535110 05 01 02 3 2 1
21535110 05 01 03 3 2 1
21535110 05 01 06 1 0 0
21535110 05 02 02 1 0 0
21535110 05 03 05 3 2 0
21535112 05 01 05 1 1 1

In total there are 28,000,000 lines in the file, and I have 6 files of this kind.

Objective

Write a script that processes the input data as follows:
for each user, sum up the data (totalTW, totalQs, result) over all rows that share the same month, the same day of the week, and the same hour.
For example, suppose there are lines like these (the year is 2012):

userID month date hour totalTW totalQs result
21535110 05 01 02 3 2 1
21535110 05 08 02 2 1 0

Then these 2 data points should be summed, since both fall on a Tuesday in May and both have hour 02:

userID month day hour totalTW totalQs result
21535110 05 Tue 02 5 3 1

Problem

The week.py script I added to this gist works; the problem is that it seems too slow.
I ran it on a lab server for ~20 hours and it has so far processed only 2,300,000 lines (about 10%).
Is there any way to optimize this script?

#!/usr/bin/python
import os
import sys
import csv
import re
import string
import time
import datetime
'''
This is a modified version of week.py.
I raised this question at http://v2ex.com/t/102160 and, thanks to the v2ex fellows,
found that the bottleneck was mainly the 3 nested for loops (quite a silly mistake).
With this version of the script, execution time is reduced tremendously, to ~10-20 minutes, which fits my needs for now.
'''
def _write_user_totals(to_file, user, totals, weekday):
    """Write one output row per (month, weekday, hour) slot for one user.

    Args:
        to_file: Writable text file object.
        user: User ID string, written verbatim as the first column.
        totals: Dict mapping ``(month, weekday_index, hour)`` to a
            ``[totalTW, totalQs, result]`` list of sums.
        weekday: Weekday abbreviations ordered Monday first, so that the
            list index matches ``datetime.date.weekday()``.
    """
    empty = (0, 0, 0)
    rows = []
    # Every slot for months 5..12 is emitted, with zeros where the user
    # has no data, so each user contributes the same number of rows.
    for month in range(5, 13):
        for day_index, day in enumerate(weekday):
            for hour in range(24):
                tw, qs, result = totals.get((month, day_index, hour), empty)
                rows.append(
                    "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
                    % (user, month, day, hour, tw, qs, result)
                )
    # One write per user instead of growing a string row by row.
    to_file.write("".join(rows))


def main(inputfilename="./test_refine/test.txt", outputfilename=None,
         year=2012):
    """Sum each user's counters per month, day of week and hour.

    The input is a whitespace-separated text file whose first line is a
    header and whose remaining lines are
    ``userID month date hour totalTW totalQs result``. Rows of the same
    user must be contiguous. The header is copied to the output
    unchanged, followed by tab-separated rows
    ``userID month weekday hour totalTW totalQs result`` for every
    month 5..12, every weekday and every hour of each user.

    The file is streamed line by line, so memory use does not grow with
    the size of the input.

    Args:
        inputfilename: Path of the file to read.
        outputfilename: Path of the file to write. Defaults to
            ``"test_week/"`` plus the base name of ``inputfilename``.
        year: Calendar year used to turn month/date into a weekday.

    Raises:
        IOError/OSError: If either file cannot be opened.
        IndexError: If a data row has fewer than seven fields.
        ValueError: If a numeric field is not an integer or the
            month/date pair is not a valid date in ``year``.
    """
    start_time = time.time()
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    if outputfilename is None:
        outputfilename = "test_week/" + os.path.basename(inputfilename)
    print(outputfilename)

    # ``with`` guarantees both files are flushed and closed on every exit
    # path, including exceptions raised by malformed rows.
    with open(inputfilename, "r") as open_file:
        with open(outputfilename, "w") as to_file:
            # The first line is the header; copy it through as-is.
            to_file.write(open_file.readline())

            current_user = None
            totals = {}
            for line in open_file:
                fields = line.split()  # split once per row
                if not fields:
                    continue  # tolerate blank lines
                user = fields[0]
                if user != current_user:
                    # A new user starts on THIS row: flush the previous
                    # user first so this row is counted only for its owner.
                    if current_user is not None:
                        _write_user_totals(to_file, current_user, totals,
                                           weekday)
                    current_user = user
                    totals = {}
                month = int(fields[1])
                hour = int(fields[3])
                # date.weekday() is 0 for Monday, matching ``weekday`` and
                # independent of the process locale.
                day_index = datetime.date(year, month,
                                          int(fields[2])).weekday()
                slot = totals.setdefault((month, day_index, hour), [0, 0, 0])
                slot[0] += int(fields[4])
                slot[1] += int(fields[5])
                slot[2] += int(fields[6])

            # The last user has no following row to trigger a flush.
            if current_user is not None:
                _write_user_totals(to_file, current_user, totals, weekday)

    print("%s seconds" % (time.time() - start_time))


if __name__ == "__main__":
    main()
#!/usr/bin/python
import os
import sys
import csv
import re
import string
import time
import datetime
'''
Original (slow) week.py: sums each user's data per weekday of each month.
'''
def main(inputfilename="input.txt", outputfilename="output.txt", year=2012):
    """Sum each user's counters per month, day of week and hour.

    Reads a whitespace-separated text file whose first line is a header
    and whose remaining lines are
    ``userID month date hour totalTW totalQs result`` (rows of one user
    must be contiguous). Writes the header unchanged, then for each user
    one tab-separated row
    ``userID month weekday hour totalTW totalQs result`` for every month
    5..12, every weekday (Mon..Sun) and every hour 0..23, with zeros for
    slots that have no data.

    Each input row is parsed exactly once and the file is streamed, so
    the run time is linear in the number of rows.

    Args:
        inputfilename: Path of the file to read.
        outputfilename: Path of the file to write.
        year: Calendar year used to turn month/date into a weekday.

    Raises:
        IOError/OSError: If either file cannot be opened.
        IndexError: If a data row has fewer than seven fields.
        ValueError: If a numeric field is not an integer or the
            month/date pair is not a valid date in ``year``.
    """
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    row_format = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
    no_data = (0, 0, 0)
    print(outputfilename)

    def emit(to_file, user, sums):
        # Write all 8 * 7 * 24 slots of one user with a single write call.
        to_file.write("".join(
            row_format % ((user, month, day, hour)
                          + tuple(sums.get((month, index, hour), no_data)))
            for month in range(5, 13)
            for index, day in enumerate(weekday)
            for hour in range(24)
        ))

    # ``with`` closes both files on every exit path, including errors.
    with open(inputfilename, "r") as open_file:
        with open(outputfilename, "w") as to_file:
            # The first line is the header; copy it through as-is.
            to_file.write(open_file.readline())

            active_user = None
            sums = {}
            for line in open_file:
                parts = line.split()  # split once per row
                if not parts:
                    continue  # tolerate blank lines
                if parts[0] != active_user:
                    # The row that starts a new user belongs to that new
                    # user, so flush the previous one before counting it.
                    if active_user is not None:
                        emit(to_file, active_user, sums)
                    active_user = parts[0]
                    sums = {}
                month, date, hour, tw, qs, result = [
                    int(value) for value in parts[1:7]
                ]
                if len(parts) < 7:
                    raise IndexError("row has fewer than 7 fields: %r" % line)
                # date.weekday() is 0 for Monday, matching ``weekday`` and
                # independent of the process locale.
                key = (month, datetime.date(year, month, date).weekday(), hour)
                bucket = sums.setdefault(key, [0, 0, 0])
                bucket[0] += tw
                bucket[1] += qs
                bucket[2] += result

            # The last user has no following row to trigger a flush.
            if active_user is not None:
                emit(to_file, active_user, sums)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment