-
-
Save mtigas/4561224 to your computer and use it in GitHub Desktop.
oh god what have i done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
usage: twitter_ebook.py [-h] [--num NUM] archive_dir | |
Generate some tweets based on the CSV in a Twitter archive download | |
positional arguments: | |
archive_dir path your unzipped Twitter archive | |
optional arguments: | |
-h, --help show this help message and exit | |
--num NUM number of lines to generate | |
requires python 2.7 (i think) | |
mac os x howto: | |
put this .py file in any directory | |
put your archive "tweets.zip" file in same directory | |
unzip "tweets.zip" there (so it unzips into a "tweets" subdirectory) | |
open Terminal, `cd` to the directory you saved this to, run: | |
python twitter_ebook.py --num 2000 tweets | |
??? | |
profit | |
""" | |
import argparse
import csv
import os
import os.path
import random
import re
import sys
# Command-line interface: one positional argument (the unzipped archive
# directory) and an optional --num giving how many lines to generate.
parser = argparse.ArgumentParser(
    description='Generate some tweets based '
                'on the CSV in a Twitter archive download')
parser.add_argument('archive_dir',
                    help='path your unzipped Twitter archive')
parser.add_argument('--num',
                    type=int, default=5,
                    help='number of lines to generate')
NONWORD = "\n" | |
STARTKEY = NONWORD, NONWORD | |
MAXGEN=1000 | |
# Matches tokens made up solely of punctuation.  The hyphen is escaped so it
# is a literal "-": left unescaped, "+-=" is a character range that also
# covers the digits and "<", which made purely numeric tokens count as
# non-words.
NONWORD_RE = re.compile(r'^["\'.,;:/?!@#$%^&*()_+\-={}\[\]]+$')

# CSV columns, in order:
# ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'timestamp', 'source', 'text', 'expanded_urls']


def skipword(word):
    """Return True if *word* should be left out of the Markov chain.

    A word is skipped when it is the retweet marker "rt" (any case),
    contains "@" or "#", contains "http://" or "https://", or consists
    only of punctuation characters.
    """
    if word.lower() == "rt":
        return True
    if "@" in word or "#" in word:
        return True
    if "http://" in word or "https://" in word:
        return True
    return bool(NONWORD_RE.match(word))
class MarkovChainer(object):
    """Word-level Markov chain trained on the tweets of a Twitter archive.

    ``state`` maps a pair of consecutive lower-cased words to the list of
    words observed after that pair, with ``NONWORD`` recorded where a tweet
    ended.  ``wordcaps`` maps each lower-cased word to every original
    spelling seen, so generated text can reuse capitalisation from the input.
    """

    # Index of the 'text' column in each CSV row.
    TEXT_COLUMN = 7

    def __init__(self, archive_dir):
        """Train the chain on every ``*.csv`` file in ``<archive_dir>/data/csv``.

        Files that cannot be read or parsed are reported on stderr and
        skipped; rows too short to hold a text column are ignored.  An error
        listing the CSV directory itself is not caught.
        """
        self.state = dict()
        self.wordcaps = dict()
        csv_dir = os.path.join(archive_dir, 'data', 'csv')
        for filename in os.listdir(csv_dir):
            if not filename.endswith('.csv'):
                continue
            path = os.path.join(csv_dir, filename)
            try:
                with self._open_csv(path) as csvf:
                    rows = csv.reader(csvf)
                    next(rows, None)  # skip the header; tolerate an empty file
                    for row in rows:
                        if len(row) > self.TEXT_COLUMN:
                            self.input(row[self.TEXT_COLUMN])
            except (IOError, OSError, csv.Error, UnicodeError) as err:
                # Best effort: one bad file must not abort the whole run,
                # but it should not vanish silently either.
                sys.stderr.write("warning: skipping %s: %s\n" % (path, err))

    @staticmethod
    def _open_csv(path):
        """Open *path* in the mode the running interpreter's csv module needs."""
        if sys.version_info[0] >= 3:
            # Python 3's csv module reads text and wants newline=''.
            return open(path, 'r', newline='', encoding='utf-8')
        # Python 2's csv module reads bytes.
        return open(path, 'rb')

    def input(self, input):
        """Add the words of one tweet (the string *input*) to the chain."""
        word1, word2 = STARTKEY
        for word3 in input.split():
            # NOTE(review): the "?" literal may be a stand-in for a character
            # lost when this file was copied -- confirm against the original.
            o_word = word3.replace("?", " ").strip()
            if not o_word:
                # Nothing left after cleaning; an empty "word" would only
                # produce doubled spaces in generated text.
                continue
            lower_word = o_word.lower()
            if not skipword(lower_word):
                self.state.setdefault((word1, word2), list()).append(lower_word)
                word1, word2 = word2, lower_word
                self.wordcaps.setdefault(lower_word, list()).append(o_word)
        # Record that the tweet ended after the last accepted pair.
        self.state.setdefault((word1, word2), list()).append(NONWORD)

    def output(self):
        """Generate one line of text by walking the chain from ``STARTKEY``.

        The walk stops when ``NONWORD`` is drawn, after ``MAXGEN`` words, or
        as soon as the joined text is longer than 110 characters.  Returns
        the words joined by single spaces; returns "" when the chain has no
        entry for the current state (e.g. nothing was loaded).
        """
        words = list()
        out_str = ""
        word1, word2 = STARTKEY
        for _ in range(MAXGEN):
            followers = self.state.get((word1, word2))
            if not followers:
                break
            word3 = random.choice(followers)
            if word3 == NONWORD:
                break
            # Emit one of the spellings actually seen for this word.
            real_word = random.choice(self.wordcaps.get(word3, [word3]))
            words.append(real_word)
            word1, word2 = word2, word3
            out_str = " ".join(words)
            if len(out_str) > 110:
                return out_str
        return out_str
if __name__ == "__main__": | |
args = parser.parse_args() | |
c = MarkovChainer(args.archive_dir) | |
for x in xrange(args.num): | |
out = c.output().strip() | |
if out: | |
print out | |
if (x+1) < args.num: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
q: how do reporters cover crime Now second Q: what is cat What is CAT WHAT is Cat what is HAPPENING I just caught
So just unveiled lorem ipsum generator, except "Somehow It's so Fitting That it fixed the auto-exposure After that.)
Descalso delivers a single! look, I did non-programming journalism
so Nebraska is playing Kansas Tonight
ProPublica is better than 9 !", "where was 16 !", "HOW Can you do business online, keep that number high.
What A fucking custard pie. fuck