Skip to content

Instantly share code, notes, and snippets.

@wzyboy
Last active August 29, 2015 14:16
Show Gist options
  • Save wzyboy/ccffa6893355c84b3475 to your computer and use it in GitHub Desktop.
Save wzyboy/ccffa6893355c84b3475 to your computer and use it in GitHub Desktop.
A script that helps to import Twitter's official archive.
#!/usr/bin/env python
# Help text printed when the script is started without an input file argument.
USAGE = """
This script helps to import Twitter's official archive.
Usage: ./import.py tweets.csv
"""
# Reference note (a bare string with no runtime effect): the column layout of
# the input CSV and the reduced four-column layout that main() produces.
"""
Original data structure:
['tweet_id',
'in_reply_to_status_id',
'in_reply_to_user_id',
'timestamp',
'source',
'text',
'retweeted_status_id',
'retweeted_status_user_id',
'retweeted_status_timestamp',
'expanded_urls']
Processed data structure:
['tweet_id',
'timestamp',
'text',
'expanded_urls']
"""
import sys
import csv
import time
from datetime import datetime
def main(inputfile):
    """Convert a Twitter archive CSV into a space-separated text dump.

    Reads every row from *inputfile*, drops the header row, and reduces each
    remaining row to ``tweet_id``, ``timestamp``, ``text`` and
    ``expanded_urls``. The records are sorted (tweet id first) and appended,
    one per line, to a file named ``tweets-output-<unix time>`` in the
    current working directory.

    Args:
        inputfile: An iterable of CSV lines (for example an open text file)
            with the tweet id in column 0, the timestamp in column 3, the
            text in column 5 and, optionally, expanded URLs in column 9.

    Raises:
        ValueError: If a tweet id is not an integer or a timestamp does not
            match ``%Y-%m-%d %H:%M:%S %z``.
        IndexError: If a non-empty row has fewer than six columns.
    """
    timeformat = '%Y-%m-%d %H:%M:%S %z'

    # Slicing (rather than ``del rows[0]``) tolerates a completely empty file.
    rows = list(csv.reader(inputfile))[1:]

    records = []
    for row in rows:
        # csv.reader yields [] for blank lines; there is nothing to import.
        if not row:
            continue
        tweet_id = int(row[0])
        # astimezone() without arguments converts to the system local zone.
        stamp = datetime.strptime(row[3], timeformat).astimezone()
        # Escape line breaks so that every tweet stays on one output line.
        text = row[5].replace('\n', '\\n').replace('\r', '\\r')
        # The expanded_urls column may be missing from short rows.
        urls = row[9] if len(row) > 9 else ''
        records.append([tweet_id, stamp.strftime(timeformat), text, urls])
    records.sort()

    # strip() removes the trailing separator left by an empty URL column.
    lines = [' '.join(map(str, record)).strip() for record in records]

    timestamp = str(int(time.time()))
    # The context manager closes the file even if a write fails; an explicit
    # UTF-8 encoding keeps non-ASCII tweet text from depending on the locale.
    with open('tweets-output-%s' % timestamp, 'a', encoding='utf-8') as outputfile:
        for line in lines:
            print(line, file=outputfile)
if __name__ == '__main__':
    # Only the argument lookup can raise IndexError, so keep the try minimal.
    try:
        path = sys.argv[1]
    except IndexError:
        print(USAGE)
        sys.exit(1)
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly; the context manager closes the file.
    # NOTE(review): the archive CSV is assumed to be UTF-8 -- confirm.
    with open(path, 'r', newline='', encoding='utf-8') as inputfile:
        main(inputfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment