airbob/readme.md

## readme.md

      
    Raw
  

              readme.md
            
          
    background

Below is input File format(*.txt):


userID
month
date
hour
totalTW
totalQs
result


21535110
05
01
02
3
2
1


21535110
05
01
03
3
2
1


21535110
05
01
06
1
0
0


21535110
05
02
02
1
0
0


21535110
05
03
05
3
2
0


21535112
05
01
05
1
1
1


totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
object

write script to process the input data, to: 

for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. 

lets say:
there are lines like this(year is 2012): 


userID
month
date
hour
totalTW
totalQs
result


21535110
05
01
02
3
2
1


21535110
05
08
02
2
1
0


then this 2 data points should sum since they both belong to tue of May and hour is 02 


userID
month
day
hour
totalTW
totalQs
result


21535110
05
Tue
02
5
3
1


Problem

the week.py script I added in this gist is working, the problem is, it seems too slow.  

I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 (about 10% ! only) 

Is there any way to optimize this script?  


## week-modified.py
#!/usr/bin/python
import os
import sys
import csv
import re
import string
import time
import datetime
'''
This is modified version of week.py
I raised this question in http://v2ex.com/t/102160 and
thanks to v2ex fellows, the bottleneck is mainly due to the 3 for loops (which is quite a dummy mistake)
with this version of the script, execution time has been reduced tremendously to ~10-20 mins, which fits my need for now.
'''

def main():
    start_time = time.time()
    weekday = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
    inputfilename = "./test_refine/test.txt"
    outputfilename = "test_week/" + inputfilename[14:]
    print outputfilename
    open_file = open(inputfilename, 'r')
    contents = open_file.readlines()
    to_file = open(outputfilename, 'w')
    i = 0
    totalLines = len(contents)
    totalLines = int(totalLines)
    while i < totalLines:
        outputCONTENT = ""
        print i
        if ( i == totalLines-1):
            print time.time()-start_time , "seconds"
            return
        if (i>0):
            lineString = contents[i]
            user = lineString.split()[0]
            j = i
            nextFlag = 1
            while (nextFlag == 1 and ( j < totalLines )):
                tempString = contents[j]
                user2 = tempString.split()[0]
                if (user != user2):
                    nextFlag = 0
                j = j + 1
            markIndex = j
            ## do the main check
            totalTW = {}
            totalQS = {}
            totalResult =  {}
            for z in range(i,markIndex):
                tempString = contents[z]
                tweetmonth = tempString.split()[1]
                tweetday = tempString.split()[2]
                tweethour = tempString.split()[3]
                tweetTW = tempString.split()[4]
                tweetQS = tempString.split()[5]
                tweetResult = tempString.split()[6]
                tweetdate = "%s-%s-%s"%("2012",tweetmonth,tweetday)
                dayOfWeek =  datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
                key = "%s%s%s" % ( tweetmonth, dayOfWeek, tweethour)
                if key in totalTW:
                    totalTW[key]+=int(tweetTW)
                    totalQS[key]+=int(tweetQS)
                    totalResult[key]+=int(tweetResult)
                else:
                    totalTW[key]=int(tweetTW)
                    totalQS[key]=int(tweetQS)
                    totalResult[key]=int(tweetResult)
            for month in range(5,13):
                for day in weekday:
                    for hour in range(0,24):
                        key = "%02d%s%02d" % ( month, day, hour)
                        if key in totalTW:
                            lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,totalTW[key],totalQS[key],totalResult[key])
                            outputCONTENT = outputCONTENT + lineoutput
                        else:
                            lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,0,0,0)
                            outputCONTENT = outputCONTENT + lineoutput

            ## to_file.write(lineoutput)
            i = markIndex-1
        else:
            ## to_file.write(contents[0])
            outputCONTENT = outputCONTENT + contents[0]
            i = i + 1

        to_file.write(outputCONTENT)
    to_file.close()
    open_file.close()

if __name__ == "__main__":
    main()

## week.py
#!/usr/bin/python
import os
import sys
import csv
import re
import string
import time
import datetime
'''
weekday of each month
'''

def main():
    weekday = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
    ## read stats file and filter not existing users
    inputfilename = "input.txt"
    outputfilename = "output.txt"
    print outputfilename
    open_file = open(inputfilename, 'r')
    contents = open_file.readlines()
    to_file = open(outputfilename, 'w')
    i = 0
    totalLines = len(contents)
    totalLines = int(totalLines)
    print "going to while loop"
    while i < totalLines:
        outputCONTENT = ""
        print i
        if ( i == totalLines-1):
            return
        if (i>0):
            lineString = contents[i]
            user = lineString.split()[0]
            j = i
            nextFlag = 1
            while (nextFlag == 1 and ( j < totalLines )):
                tempString = contents[j]
                user2 = tempString.split()[0]
                if (user != user2):
                    nextFlag = 0
                j = j + 1
            markIndex = j
            for month in range(5,13):
                for day in weekday:
                    for hour in range ( 0, 24):
                        ## print "%s-%s-%s-%s" % (user,month, day , hour)
                        totalTW = 0
                        totalQS = 0
                        totalResult =  0
                        for z in range(i,markIndex):
                            tempString = contents[z]
                            tweetmonth = tempString.split()[1]
                            tweetday = tempString.split()[2]
                            tweethour = tempString.split()[3]
                            tweetTW = tempString.split()[4]
                            tweetQS = tempString.split()[5]
                            tweetResult = tempString.split()[6]
                            tweetdate = "%s-%s-%s"%("2012",tweetmonth,tweetday)
                            dayOfWeek =  datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
                            if ( day in dayOfWeek and hour == int(tweethour) and month ==int(tweetmonth) ):
                                totalTW += int(tweetTW)
                                totalQS += int(tweetQS)
                                totalResult += int(tweetResult)
                        lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,totalTW,totalQS,totalResult)
                        ## to_file.write(lineoutput)
                        outputCONTENT = outputCONTENT + lineoutput
            i = markIndex-1
        else:
            ## to_file.write(contents[0])
            outputCONTENT = outputCONTENT + contents[0]
            i = i + 1

        to_file.write(outputCONTENT)
    to_file.close()
    open_file.close()

if __name__ == "__main__":
    main()
userID	month	date	hour	totalTW	totalQs	result
21535110	05	01	02	3	2	1
21535110	05	01	03	3	2	1
21535110	05	01	06	1	0	0
21535110	05	02	02	1	0	0
21535110	05	03	05	3	2	0
21535112	05	01	05	1	1	1
	#!/usr/bin/python
	import os
	import sys
	import csv
	import re
	import string
	import time
	import datetime
	'''
	This is modified version of week.py
	I raised this question in http://v2ex.com/t/102160 and
	thanks to v2ex fellows, the bottleneck is mainly due to the 3 for loops (which is quite a dummy mistake)
	with this version of the script, execution time has been reduced tremendously to ~10-20 mins, which fits my need for now.
	'''

	def main():
	start_time = time.time()
	weekday = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
	inputfilename = "./test_refine/test.txt"
	outputfilename = "test_week/" + inputfilename[14:]
	print outputfilename
	open_file = open(inputfilename, 'r')
	contents = open_file.readlines()
	to_file = open(outputfilename, 'w')
	i = 0
	totalLines = len(contents)
	totalLines = int(totalLines)
	while i < totalLines:
	outputCONTENT = ""
	print i
	if ( i == totalLines-1):
	print time.time()-start_time , "seconds"
	return
	if (i>0):
	lineString = contents[i]
	user = lineString.split()[0]
	j = i
	nextFlag = 1
	while (nextFlag == 1 and ( j < totalLines )):
	tempString = contents[j]
	user2 = tempString.split()[0]
	if (user != user2):
	nextFlag = 0
	j = j + 1
	markIndex = j
	## do the main check
	totalTW = {}
	totalQS = {}
	totalResult = {}
	for z in range(i,markIndex):
	tempString = contents[z]
	tweetmonth = tempString.split()[1]
	tweetday = tempString.split()[2]
	tweethour = tempString.split()[3]
	tweetTW = tempString.split()[4]
	tweetQS = tempString.split()[5]
	tweetResult = tempString.split()[6]
	tweetdate = "%s-%s-%s"%("2012",tweetmonth,tweetday)
	dayOfWeek = datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
	key = "%s%s%s" % ( tweetmonth, dayOfWeek, tweethour)
	if key in totalTW:
	totalTW[key]+=int(tweetTW)
	totalQS[key]+=int(tweetQS)
	totalResult[key]+=int(tweetResult)
	else:
	totalTW[key]=int(tweetTW)
	totalQS[key]=int(tweetQS)
	totalResult[key]=int(tweetResult)
	for month in range(5,13):
	for day in weekday:
	for hour in range(0,24):
	key = "%02d%s%02d" % ( month, day, hour)
	if key in totalTW:
	lineoutput = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,totalTW[key],totalQS[key],totalResult[key])
	outputCONTENT = outputCONTENT + lineoutput
	else:
	lineoutput = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,0,0,0)
	outputCONTENT = outputCONTENT + lineoutput

	## to_file.write(lineoutput)
	i = markIndex-1
	else:
	## to_file.write(contents[0])
	outputCONTENT = outputCONTENT + contents[0]
	i = i + 1

	to_file.write(outputCONTENT)
	to_file.close()
	open_file.close()

	if __name__ == "__main__":
	main()