Skip to content

Instantly share code, notes, and snippets.

@mtigas mtigas/twitter_ebook.py forked from argv0/gist:3627514
Last active Dec 11, 2015

Embed
What would you like to do?
oh god what have i done
"""
usage: twitter_ebook.py [-h] [--num NUM] archive_dir
Generate some tweets based on the CSV in a Twitter archive download
positional arguments:
archive_dir path to your unzipped Twitter archive
optional arguments:
-h, --help show this help message and exit
--num NUM number of lines to generate
requires python 2.7 (i think)
mac os x howto:
put this .py file in any directory
put your archive "tweets.zip" file in same directory
unzip "tweets.zip" there (so it unzips into a "tweets" subdirectory)
open Terminal, `cd` to the directory you saved this to, run:
python twitter_ebook.py --num 2000 tweets
???
profit
"""
import argparse
import csv
import os
import os.path
import random
import re
import sys
# Command-line interface: one positional archive directory plus an optional
# count of lines to generate (defaults to 5).
parser = argparse.ArgumentParser(
    description=(
        'Generate some tweets based '
        'on the CSV in a Twitter archive download'
    ),
)
parser.add_argument(
    'archive_dir',
    help='path your unzipped Twitter archive',
)
parser.add_argument(
    '--num',
    default=5,
    type=int,
    help='number of lines to generate',
)
# Sentinel "word": pads the initial chain key and is appended after the last
# word of every tweet so the generator knows where a tweet may end.
NONWORD = "\n"
# Chain key in effect before any real word has been seen.
STARTKEY = NONWORD, NONWORD
# Upper bound on the number of words generated for a single output line.
MAXGEN = 1000
# Matches tokens consisting solely of punctuation/symbol characters.
# The hyphen is escaped on purpose: written as "+-=" it forms the character
# range 0x2B-0x3D, which also matches the digits 0-9 and "<", so purely
# numeric tokens such as "16" were being classified as non-words.
NONWORD_RE = re.compile(r'^["\'\.,";:/?!@#$%^&*()_+\-={}\[\]]+$')
# Column order of the archive CSV files (row[7], 'text', is the tweet body):
# ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'timestamp', 'source', 'text', 'expanded_urls']
def skipword(word):
    """Tell whether *word* must be kept out of the Markov chain.

    A word is skipped when it equals "rt" ignoring case, when it contains
    "@", "#", "http://" or "https://", or when it matches NONWORD_RE.

    Args:
        word: a single whitespace-delimited token.

    Returns:
        True if the token should be skipped, False otherwise.
    """
    if word.lower() == "rt":
        return True
    for marker in ("@", "#", "http://", "https://"):
        if marker in word:
            return True
    return NONWORD_RE.match(word) is not None
class MarkovChainer(object):
    """Second-order Markov chain built from the tweet text of a Twitter archive.

    Attributes:
        state: dict mapping a ``(word1, word2)`` tuple of lower-cased words to
            the list of lower-cased words seen right after that pair. NONWORD
            in a list records that a tweet ended after the pair.
        wordcaps: dict mapping a lower-cased word to the list of spellings it
            was seen with, used to restore capitalisation on output.
    """

    # Index of the tweet text within a CSV row.
    TEXT_COLUMN = 7

    def __init__(self, archive_dir):
        """Feed every ``*.csv`` file under ``<archive_dir>/data/csv`` to the chain.

        Loading is best effort: a file that cannot be opened, decoded or
        parsed is reported on stderr and skipped, and rows too short to hold
        a text column are ignored instead of aborting the rest of the file.

        Args:
            archive_dir: path of the unzipped Twitter archive.

        Raises:
            OSError: if the ``data/csv`` directory cannot be listed.
        """
        self.state = dict()
        self.wordcaps = dict()
        csv_dir = os.path.join(archive_dir, 'data', 'csv')
        for filename in os.listdir(csv_dir):
            if not filename.endswith('.csv'):
                continue
            path = os.path.join(csv_dir, filename)
            try:
                with self._open_csv(path) as csvf:
                    rows = csv.reader(csvf)
                    # Skip the header; the default keeps an empty file from
                    # raising StopIteration.
                    next(rows, None)
                    for row in rows:
                        if len(row) > self.TEXT_COLUMN:
                            self.input(row[self.TEXT_COLUMN])
            except (IOError, OSError, csv.Error, UnicodeError) as err:
                sys.stderr.write("skipping %s: %s\n" % (path, err))

    @staticmethod
    def _open_csv(path):
        """Open *path* in the mode the running interpreter's csv module needs."""
        if sys.version_info[0] < 3:
            # The Python 2 csv module reads byte strings from a binary file.
            return open(path, 'rb')
        # NOTE(review): assumes the archive CSVs are UTF-8 encoded - confirm.
        return open(path, newline='', encoding='utf-8')

    def input(self, input):
        """Add the words of one tweet to the chain.

        Args:
            input: the text of a single tweet.
        """
        word1, word2 = STARTKEY
        for token in input.split():
            # NOTE(review): the "?" may be a mangled non-ASCII character in
            # this copy of the script - confirm against the upstream source.
            o_word = token.replace("?", " ").strip()
            lower_word = o_word.lower()
            # A token made only of "?" collapses to an empty string, which
            # would otherwise enter the chain as an empty word.
            if not lower_word or skipword(lower_word):
                continue
            self.state.setdefault((word1, word2), []).append(lower_word)
            self.wordcaps.setdefault(lower_word, []).append(o_word)
            word1, word2 = word2, lower_word
        # Record that a tweet may end after the final pair.
        self.state.setdefault((word1, word2), []).append(NONWORD)

    def output(self):
        """Generate one line of text by walking the chain at random.

        The walk stops at a NONWORD, once the text is longer than 110
        characters, or after MAXGEN words.

        Returns:
            The generated words joined by single spaces, or an empty string
            when nothing has been loaded into the chain.
        """
        words = []
        out_str = ""
        word1, word2 = STARTKEY
        for _ in range(MAXGEN):
            choices = self.state.get((word1, word2))
            if not choices:
                # Only happens when no tweet was ever fed in.
                break
            word3 = random.choice(choices)
            if word3 == NONWORD:
                break
            # Pick one of the capitalisations this word was seen with.
            words.append(random.choice(self.wordcaps.get(word3, [word3])))
            word1, word2 = word2, word3
            out_str = " ".join(words)
            if len(out_str) > 110:
                break
        return out_str
if __name__ == "__main__":
    # Build the chain from the archive given on the command line, then print
    # up to --num generated lines separated by blank lines.
    args = parser.parse_args()
    c = MarkovChainer(args.archive_dir)
    # range() and the parenthesised single-argument print calls below behave
    # identically under the Python 2 print statement and the Python 3 print
    # function, so the script no longer fails to parse on Python 3.
    for x in range(args.num):
        out = c.output().strip()
        if out:
            print(out)
            # Blank separator after every line except the last requested one.
            if (x + 1) < args.num:
                print("")
@mtigas

This comment has been minimized.

Copy link
Owner Author

mtigas commented Jan 23, 2013

q: how do reporters cover crime Now second Q: what is cat What is CAT WHAT is Cat what is HAPPENING I just caught

So just unveiled lorem ipsum generator, except "Somehow It's so Fitting That it fixed the auto-exposure After that.)

Descalso delivers a single! look, I did non-programming journalism

so Nebraska is playing Kansas Tonight

ProPublica is better than 9 !", "where was 16 !", "HOW Can you do business online, keep that number high.

What A fucking custard pie. fuck

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.