Skip to content

Instantly share code, notes, and snippets.

@voxels
Created December 11, 2013 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save voxels/7910957 to your computer and use it in GitHub Desktop.
Save voxels/7910957 to your computer and use it in GitHub Desktop.
Tweet Cleaner
import re
import datetime, time
import os
# Tweet Cleaner: reads pipe-delimited records from INPUT_PATH, merges records
# that were broken across several physical lines, normalises every record to
# a fixed set of columns, tags retweets, appends a UTC epoch timestamp and
# writes the result as tab-separated text to OUTPUT_PATH.

INPUT_PATH = "output.txt"
OUTPUT_PATH = "iPhone5s5c_firstClean.txt"

# A well-formed record has exactly this many '|'-separated fields.
FIELD_COUNT = 11
# Number of fields before / after the free-text field (index 6).
LEADING_FIELDS = 6
TRAILING_FIELDS = 4

# e.g. "Wed Dec 11 14:04:00 +0000 2013" -> 30 characters, 6 space-separated parts.
TIMESTAMP_LENGTH = 30
TIMESTAMP_PARTS = 6
# Applied after "+0000" has been removed; whitespace in a strptime format
# matches one or more whitespace characters, so the leftover double space
# is accepted.
TIMESTAMP_FORMAT = "%a %b %d %H:%M:%S %Y"
EPOCH = datetime.datetime(1970, 1, 1)


def report_bad_line(detail):
    """Print the BADLINE marker followed by the offending value."""
    print("BADLINE")
    print(detail)


def group_records(raw_lines):
    """Split raw text lines on '|' and merge continuation lines into records.

    Lines containing '***' are ignored.  A line whose first field ends with
    "2013 " starts a new record; any other line is treated as a continuation
    and its fields are appended to the most recent record.

    A continuation line that appears before any record has nothing to attach
    to (the original script raised IndexError here); it is reported as a
    BADLINE and skipped.

    Returns a list of records, each a list of raw (unstripped) field strings.
    """
    records = []
    for raw_line in raw_lines:
        text = raw_line.strip()
        if '***' in text:
            continue
        fields = text.split('|')
        if fields[0].endswith("2013 "):
            records.append(fields)
        elif records:
            records[-1].extend(fields)
        else:
            report_bad_line(text)
    return records


def normalize_fields(fields):
    """Collapse a record with FIELD_COUNT or more fields to FIELD_COUNT fields.

    The first six and last four fields are kept as they are; everything in
    between is concatenated (without separator) into the single text field.
    For a record that already has exactly FIELD_COUNT fields this is a no-op.
    """
    merged_text = ''.join(fields[LEADING_FIELDS:-TRAILING_FIELDS])
    return fields[:LEADING_FIELDS] + [merged_text] + fields[-TRAILING_FIELDS:]


def timestamp_to_epoch(stamp):
    """Convert a "Wed Dec 11 14:04:00 +0000 2013" style stamp to epoch seconds.

    The "+0000" offset marks the value as UTC, so the epoch is computed
    against the UTC epoch rather than through time.mktime(), which would
    interpret the value in the machine's local time zone.

    Returns the epoch as a float, or None (after reporting a BADLINE) when the
    stamp does not have the expected shape or cannot be parsed.
    """
    if len(stamp) != TIMESTAMP_LENGTH:
        report_bad_line(stamp)
        return None
    parts = stamp.split(" ")
    if len(parts) != TIMESTAMP_PARTS:
        report_bad_line(parts)
        return None
    try:
        parsed = datetime.datetime.strptime(
            stamp.replace("+0000", ""), TIMESTAMP_FORMAT)
    except ValueError:
        # e.g. an offset other than +0000, or a malformed date component.
        report_bad_line(stamp)
        return None
    return (parsed - EPOCH).total_seconds()


def clean_record(index, fields):
    """Build one output row from a raw record, or return None if it is bad.

    The row is: index, every non-empty cleaned field (whitespace-trimmed,
    tabs replaced by spaces), "TRUE"/"FALSE" depending on whether the entry
    at position 7 starts with "RT ", and the epoch timestamp as a string.

    Records with fewer than FIELD_COUNT fields, or with too few non-empty
    fields to contain the text column, are reported as BADLINE and rejected
    instead of raising IndexError or emitting duplicated columns.
    """
    if len(fields) < FIELD_COUNT:
        report_bad_line(fields)
        return None

    row = [index]
    for field in normalize_fields(fields):
        value = field.strip().replace('\t', " ")
        if value:
            row.append(value)

    if len(row) < 8:
        report_bad_line(fields)
        return None

    row.append("TRUE" if row[7].startswith("RT ") else "FALSE")

    epoch = timestamp_to_epoch(row[1])
    if epoch is None:
        return None
    row.append(str(epoch))
    return row


def clean_records(records):
    """Clean every record, dropping bad ones; indices count accepted rows only.

    NOTE(review): the indentation of the original script was lost, so it is
    assumed here that rows failing timestamp validation were neither written
    nor counted -- confirm against the original behaviour if it matters.
    """
    rows = []
    for fields in records:
        row = clean_record(len(rows), fields)
        if row is not None:
            rows.append(row)
    return rows


def write_rows(rows, path):
    """Write rows as tab-terminated fields, one row per line.

    The file is opened with 'w' so it is created when missing and truncated
    when present ('r+' did neither, failing on a missing file and leaving
    stale bytes behind a shorter rewrite).
    """
    with open(path, 'w') as out:
        for row in rows:
            out.write("".join(str(item) + "\t" for item in row) + "\n")


def main():
    """Run the full clean-up from INPUT_PATH to OUTPUT_PATH."""
    with open(INPUT_PATH, 'r') as source:
        records = group_records(source)
    print("CLEANING LINES")
    rows = clean_records(records)
    print("CLEANED")
    write_rows(rows, OUTPUT_PATH)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment