Load tweets into Spark and filter
# Extract the stored tweet payloads from MongoDB.
allTweets = [doc['tweet'] for doc in db.tweets.find()]

# Load tweets into Spark for analysis (8 partitions).
allTweetsRDD = sc.parallelize(allTweets, 8)

# Set up the cut-off so that only tweets from the last DAYS_LIMIT days pass.
DAYS_LIMIT = 7
limit = datetime.datetime.now() - datetime.timedelta(days=DAYS_LIMIT)
# `limit` is naive local time, which is what time.mktime() expects.
limit_unixtime = time.mktime(limit.timetuple())


def _created_at_unixtime(created_at):
    """Return the Unix timestamp for a tweet's ``created_at`` string.

    If the parsed value carries a UTC offset, it is converted through UTC.
    Feeding its wall-clock fields to time.mktime() would wrongly treat them
    as local time and skew the result by the machine's UTC offset.
    A value without offset information is interpreted as local time.
    """
    # Imported here so the helper is self-contained when used inside a
    # Spark closure.
    import calendar

    dt = parser.parse(created_at)
    if dt.tzinfo is not None and dt.utcoffset() is not None:
        return calendar.timegm(dt.utctimetuple())
    return time.mktime(dt.timetuple())


# Filter out tweets that either have no hashtags or are too old.
tweetsWithTagsRDD = allTweetsRDD.filter(
    lambda t: len(t['entities']['hashtags']) > 0)
filteredTweetsRDD = tweetsWithTagsRDD.filter(
    lambda t: _created_at_unixtime(t['created_at']) > limit_unixtime)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment