Last active
August 29, 2015 14:22
-
-
Save adam704a/9f556e831b97674b9b4e to your computer and use it in GitHub Desktop.
save tweet text to file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pymongo import MongoClient | |
import datetime, time, string | |
import codecs | |
import itertools | |
# Connection to the tweet store. Fill in the credentials in the URI below;
# for a local, unauthenticated server use MongoClient() with no arguments.
c = MongoClient("mongodb://[put your user name here]:[put your password here]@sif.rtp.rti.org")
db = c.mj_tweets
collection = db.mj_sample

# Query window applied to created_at (both bounds inclusive).
# datetime arguments are: year, month, day[, hour, ...]
start_time = datetime.datetime(2014, 11, 1)
end_time = datetime.datetime(2014, 11, 29)

# Terms that cause a tweet to be rejected when they occur in its cleaned text.
my_exclusions = {
    'RT',
    'rt',
    'sunglass',
    'Sunglass',
    'SEO',
    'S.E.O.',
    'searchengineop',
}

start = time.time()
print("start")

# Wrapping the cursor in list() pulls the whole result set down at once;
# this may become a problem for exceptionally large result sets.
# The {'text': 1, '_id': 0} projection returns only the text field, which
# greatly reduces the bandwidth used and the amount of text to process.
cursor = list(
    collection.find(
        {"created_at": {"$lte": end_time, "$gte": start_time}},
        {'text': 1, '_id': 0},
    )
)

fetch_seconds = time.time() - start
print("Time to get tweets:" + str(fetch_seconds))
print("loading " + str(len(cursor)) + " tweets")

start_append = time.time()
tweet_list = []
print("start appending")

# Stand-alone punctuation tokens to drop; '#' and '@' are deliberately
# left out of the set.
punc = set(string.punctuation) - {'#', '@'}
# Clean function that applies every word filter in a single pass.
def clean(record_text, exclusions=None, punctuation=None):
    """Return the cleaned text of one tweet record, or None if it is rejected.

    The text is split on whitespace (which already discards newlines, tabs
    and carriage returns), and a word is dropped when it

    * contains ``http`` (hyperlinks),
    * contains ``@`` (user names / mentions), or
    * is exactly one of the tokens in *punctuation* (punctuation attached
      to a longer word is left untouched).

    The surviving words are re-joined with single spaces.

    Args:
        record_text: Mapping holding the tweet under the ``'text'`` key.
        exclusions: Iterable of strings; if any of them occurs anywhere in
            the cleaned text the tweet is rejected. Defaults to the
            module-level ``my_exclusions``.
        punctuation: Container of stand-alone tokens to drop. Defaults to
            the module-level ``punc``.

    Returns:
        The cleaned string, or None when the record has no text or the
        cleaned text matches an exclusion.
    """
    if exclusions is None:
        exclusions = my_exclusions
    if punctuation is None:
        punctuation = punc

    text = record_text.get('text')
    if text is None:
        # Documents without a text field come back from the projection as
        # empty dicts; treat them like rejected tweets instead of raising.
        return None

    # No extra strip() is needed: split() with no argument never yields
    # surrounding whitespace. The previous strip(' \t\n\r\c') also removed
    # leading/trailing 'c' and backslash characters from every word.
    kept_words = [
        word
        for word in text.split()
        if 'http' not in word
        and '@' not in word
        and word not in punctuation
    ]
    tweet_string = " ".join(kept_words)

    # NOTE(review): this is a substring test, so e.g. 'rt' also matches
    # inside "party" -- confirm whether whole-word matching was intended.
    if any(term in tweet_string for term in exclusions):
        return None
    return tweet_string
# Clean every tweet. Collecting the results in a set removes duplicate
# texts and collapses all rejected tweets (None) into a single entry.
tweet_list = {clean(t) for t in cursor}
# Measure the cleaning step only: elapsed time since start_append, not
# since the start of the script.
print("time to append:" + str(time.time() - start_append))

print('write file')
with codecs.open("output.txt", "w", encoding='utf8') as f:
    start_write = time.time()
    for tweet in tweet_list:
        # Skip the None placeholder for rejected tweets so that the
        # literal line "None" never ends up in the output file.
        if tweet is None:
            continue
        # Write the text as-is; the codecs writer handles the UTF-8 encoding.
        f.write(tweet + '\n')
# Reported after the file is closed so the figure includes the final flush.
print("time to write file:" + str(time.time() - start_write))

# Total wall-clock time of the whole run.
end = time.time()
print(end - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment