adam704a/export-bar.py

## export-bar.py
from pymongo import MongoClient
import datetime, time, string
import codecs
import itertools

# Connection to the MongoDB server that holds the tweet sample.  Fill in the
# user name and password placeholders in the URI before running.
# Alternative kept from the original script:
#c = MongoClient()
c = MongoClient("mongodb://[put your user name here]:[put your password here]@sif.rtp.rti.org")
db = c["mj_tweets"]
collection = db["mj_sample"]

# Inclusive bounds for the `created_at` filter (year, month, day).
start_time = datetime.datetime(year=2014, month=11, day=1)
end_time = datetime.datetime(year=2014, month=11, day=29)

# Substrings that cause a cleaned tweet to be rejected.
my_exclusions = {'RT', 'rt', 'sunglass', 'Sunglass', 'SEO', 'S.E.O.', 'searchengineop'}

start = time.time()
print("start")

# Materialise the whole result set as a list in one go.
# Exceptionally large result sets may not fit in memory.
# The projection returns only the `text` field, which greatly reduces the
# bandwidth used and the amount of text to process.
date_filter = {"created_at": {"$lte": end_time, "$gte": start_time}}
text_only = {'text': 1, '_id': 0}
cursor = list(collection.find(date_filter, text_only))

fetch_seconds = time.time() - start
print("Time to get tweets:" + str(fetch_seconds))
tweet_count = len(cursor)
print("loading " + str(tweet_count) + " tweets")

start_append = time.time()
tweet_list = []
print("start appending")

# Every ASCII punctuation character except '#' and '@'.
punc = {ch for ch in string.punctuation if ch not in '#@'}

# clean function that does everything on one pass
def clean(record_text, exclusions=None, punctuation=None):
    """Normalise one tweet record into a single-line string.

    The text is split on whitespace (which also removes newlines and tabs).
    Tokens containing 'http' or '@' are dropped, as are tokens that are
    themselves a member of `punctuation`.  The surviving tokens are joined
    with single spaces.

    Args:
        record_text: Mapping with the tweet body under the 'text' key.
        exclusions: Iterable of substrings that disqualify a tweet.
            Defaults to the module-level `my_exclusions`.
        punctuation: Container of tokens to drop.  Defaults to the
            module-level `punc`.

    Returns:
        The cleaned tweet string, or None when any exclusion occurs
        anywhere inside it.
    """
    if exclusions is None:
        exclusions = my_exclusions
    if punctuation is None:
        punctuation = punc

    # No strip() is needed: split() with no argument already discards all
    # whitespace.  Stripping with ' \t\n\r\c' must be avoided because '\c'
    # is not an escape sequence; it means a backslash plus the letter 'c',
    # which trims real characters off words such as "classic".
    tweet_words = [
        word
        for word in record_text['text'].split()
        if 'http' not in word        # hyperlinks
        and '@' not in word          # usernames / mentions
        and word not in punctuation  # stand-alone punctuation tokens
    ]

    tweet_string = " ".join(tweet_words)

    # Substring match, so 'sunglass' also rejects "sunglasses".
    # NOTE(review): short terms such as 'rt' also match inside ordinary
    # words (e.g. "party") - confirm that this is intended.
    if any(term in tweet_string for term in exclusions):
        return None
    return tweet_string

# Collect the cleaned tweets in a set so duplicates collapse, then drop the
# None that clean() returns for excluded tweets so it is never written out.
tweet_list = {clean(t) for t in cursor}
tweet_list.discard(None)

# Report the cleaning time alone: measure from start_append, not from the
# start of the whole run.
print("time to append:" + str(time.time() - start_append))

print('write file')
with codecs.open("output.txt", "w", encoding='utf8') as f:
    start_write = time.time()
    # One cleaned tweet per line; set iteration order is arbitrary.
    for tweet in tweet_list:
        f.write(tweet + '\n')
    print("time to write file:" + str(time.time() - start_write))

end = time.time()

# Total elapsed seconds for the whole export.
print(end - start)
	from pymongo import MongoClient
	import datetime, time, string
	import codecs
	import itertools

	# Connection to the MongoDB server that holds the tweet sample.  Fill in the
	# user name and password placeholders in the URI before running.
	# Alternative kept from the original script:
	#c = MongoClient()
	c = MongoClient("mongodb://[put your user name here]:[put your password here]@sif.rtp.rti.org")
	db = c["mj_tweets"]
	collection = db["mj_sample"]

	# Inclusive bounds for the `created_at` filter (year, month, day).
	start_time = datetime.datetime(year=2014, month=11, day=1)
	end_time = datetime.datetime(year=2014, month=11, day=29)

	# Substrings that cause a cleaned tweet to be rejected.
	my_exclusions = {'RT', 'rt', 'sunglass', 'Sunglass', 'SEO', 'S.E.O.', 'searchengineop'}

	start = time.time()
	print("start")

	# Materialise the whole result set as a list in one go.
	# Exceptionally large result sets may not fit in memory.
	# The projection returns only the `text` field, which greatly reduces the
	# bandwidth used and the amount of text to process.
	date_filter = {"created_at": {"$lte": end_time, "$gte": start_time}}
	text_only = {'text': 1, '_id': 0}
	cursor = list(collection.find(date_filter, text_only))

	fetch_seconds = time.time() - start
	print("Time to get tweets:" + str(fetch_seconds))
	tweet_count = len(cursor)
	print("loading " + str(tweet_count) + " tweets")

	start_append = time.time()
	tweet_list = []
	print("start appending")

	# Every ASCII punctuation character except '#' and '@'.
	punc = {ch for ch in string.punctuation if ch not in '#@'}

	# clean function that does everything on one pass
	def clean(record_text, exclusions=None, punctuation=None):
	    """Normalise one tweet record into a single-line string.

	    The text is split on whitespace (which also removes newlines and tabs).
	    Tokens containing 'http' or '@' are dropped, as are tokens that are
	    themselves a member of `punctuation`.  The surviving tokens are joined
	    with single spaces.

	    Args:
	        record_text: Mapping with the tweet body under the 'text' key.
	        exclusions: Iterable of substrings that disqualify a tweet.
	            Defaults to the module-level `my_exclusions`.
	        punctuation: Container of tokens to drop.  Defaults to the
	            module-level `punc`.

	    Returns:
	        The cleaned tweet string, or None when any exclusion occurs
	        anywhere inside it.
	    """
	    if exclusions is None:
	        exclusions = my_exclusions
	    if punctuation is None:
	        punctuation = punc

	    # No strip() is needed: split() with no argument already discards all
	    # whitespace.  Stripping with ' \t\n\r\c' must be avoided because '\c'
	    # is not an escape sequence; it means a backslash plus the letter 'c',
	    # which trims real characters off words such as "classic".
	    tweet_words = [
	        word
	        for word in record_text['text'].split()
	        if 'http' not in word        # hyperlinks
	        and '@' not in word          # usernames / mentions
	        and word not in punctuation  # stand-alone punctuation tokens
	    ]

	    tweet_string = " ".join(tweet_words)

	    # Substring match, so 'sunglass' also rejects "sunglasses".
	    # NOTE(review): short terms such as 'rt' also match inside ordinary
	    # words (e.g. "party") - confirm that this is intended.
	    if any(term in tweet_string for term in exclusions):
	        return None
	    return tweet_string

	# Collect the cleaned tweets in a set so duplicates collapse, then drop the
	# None that clean() returns for excluded tweets so it is never written out.
	tweet_list = {clean(t) for t in cursor}
	tweet_list.discard(None)

	# Report the cleaning time alone: measure from start_append, not from the
	# start of the whole run.
	print("time to append:" + str(time.time() - start_append))

	print('write file')
	with codecs.open("output.txt", "w", encoding='utf8') as f:
	    start_write = time.time()
	    # One cleaned tweet per line; set iteration order is arbitrary.
	    for tweet in tweet_list:
	        f.write(tweet + '\n')
	    print("time to write file:" + str(time.time() - start_write))

	end = time.time()

	# Total elapsed seconds for the whole export.
	print(end - start)