Skip to content

Instantly share code, notes, and snippets.

@adam704a
Last active August 29, 2015 14:22
Show Gist options
  • Save adam704a/9f556e831b97674b9b4e to your computer and use it in GitHub Desktop.
Save adam704a/9f556e831b97674b9b4e to your computer and use it in GitHub Desktop.
save tweet text to file
from pymongo import MongoClient
import datetime, time, string
import codecs
import itertools
# Connection: the commented-out line builds a client with default settings
# instead of the credentialed URI.
#c = MongoClient()
c = MongoClient("mongodb://[put your user name here]:[put your password here]@sif.rtp.rti.org")
db = c["mj_tweets"]
collection = db["mj_sample"]

# Query window bounds, given as (year, month, day). Both ends are inclusive
# because the query below uses $gte / $lte.
start_time = datetime.datetime(2014, 11, 1)
end_time = datetime.datetime(2014, 11, 29)

# Terms that clean() treats as disqualifying a tweet.
my_exclusions = {'RT', 'rt', 'sunglass', 'Sunglass', 'SEO', 'S.E.O.', 'searchengineop'}

start = time.time()
print("start")

# Wrap the cursor in list() to pull down the whole result set at once; an
# exceptionally large result set may cause problems. The projection asks for
# the 'text' field only (and suppresses '_id'), which cuts both the bandwidth
# used and the amount of data to process.
created_in_window = {"created_at": {"$lte": end_time, "$gte": start_time}}
text_only = {'text': 1, '_id': 0}
cursor = list(collection.find(created_in_window, text_only))

fetch_seconds = time.time() - start
print("Time to get tweets:" + str(fetch_seconds))
print("loading " + str(len(cursor)) + " tweets")

start_append = time.time()
tweet_list = []
print("start appending")

# Stand-alone punctuation tokens that clean() discards; '#' and '@' are kept
# out of this set.
punc = set(string.punctuation) - {'#', '@'}
def clean(record_text, exclusions=None, punctuation=None):
    """Reduce one tweet document to a cleaned, space-joined string.

    The tweet text is split on whitespace. Tokens containing ``http``
    (hyperlinks) or ``@`` (usernames), and tokens that are exactly one of the
    ``punctuation`` entries, are dropped. The surviving tokens are joined
    with single spaces.

    Args:
        record_text: Mapping holding the raw tweet under the ``'text'`` key.
        exclusions: Terms that disqualify a tweet. Defaults to the
            module-level ``my_exclusions``.
        punctuation: Stand-alone tokens to drop. Defaults to the
            module-level ``punc``.

    Returns:
        The cleaned string, or ``None`` when it contains an excluded term.

    Raises:
        KeyError: If ``record_text`` has no ``'text'`` key.
    """
    if exclusions is None:
        exclusions = my_exclusions
    if punctuation is None:
        punctuation = punc

    # str.split() with no argument already discards every run of whitespace
    # (spaces, tabs, newlines), so tokens need no extra stripping. Stripping
    # with a character set that contains a backslash and the letter "c"
    # would eat real characters from token edges ("music" -> "musi").
    kept_words = [
        word
        for word in record_text['text'].split()
        if 'http' not in word and '@' not in word and word not in punctuation
    ]
    cleaned = " ".join(kept_words)

    # NOTE(review): this is a substring test against the whole cleaned tweet,
    # so a short term such as "rt" also matches inside longer words
    # ("party") -- confirm that is intended.
    if any(term in cleaned for term in exclusions):
        return None
    return cleaned
# Clean every fetched tweet. Collecting into a set drops duplicate texts and
# collapses all excluded tweets (for which clean() returns None) into one entry.
tweet_list = {clean(t) for t in cursor}
# Remove that marker so a literal "None" line never reaches the output file.
tweet_list.discard(None)
# Report the cleaning pass on its own: measure from start_append, not from
# the start of the script.
print("time to append:" + str(time.time() - start_append))
print('write file')
start_write = time.time()
with codecs.open("output.txt", "w", encoding='utf8') as f:
    for tweet in tweet_list:
        # Cleaned tweets are already text; write them unchanged, one per
        # line, and let the codecs writer do the UTF-8 encoding.
        f.write(tweet + '\n')
# Printed after the with-block so the measurement includes closing the file.
print("time to write file:" + str(time.time() - start_write))
end = time.time()
# Total wall-clock time since the script started.
print(end - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment