Last active
September 4, 2017 16:19
-
-
Save karishmadudani/9393e3fb80df49dbec629ad8b4d36f23 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Store tweets data in a dataframe
def tweets_df(results):
    """Build a DataFrame with one row per tweet in ``results``.

    Args:
        results: Iterable of tweet objects. Each must expose ``id``,
            ``text``, ``created_at``, ``retweet_count``, an ``author``
            with ``screen_name``, ``followers_count`` and ``location``,
            and an ``entities`` mapping supporting ``.get('hashtags')``.
            The iterable is consumed exactly once, so one-shot iterators
            and generators are accepted.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``text``,
        ``created_at``, ``retweet_count``, ``user_screen_name``,
        ``user_followers_count``, ``user_location`` and ``Hashtags``
        (in that order). An empty input yields an empty frame that still
        has all of these columns.
    """
    columns = [
        "id",
        "text",
        "created_at",
        "retweet_count",
        "user_screen_name",
        "user_followers_count",
        "user_location",
        "Hashtags",
    ]

    # Collect every field in a single pass: iterating ``results`` once per
    # column would exhaust a generator after the first column.
    rows = []
    for tweet in results:
        author = tweet.author
        rows.append(
            (
                tweet.id,
                tweet.text,
                tweet.created_at,
                tweet.retweet_count,
                author.screen_name,
                author.followers_count,
                author.location,
                tweet.entities.get('hashtags'),
            )
        )

    # Passing ``columns`` explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)
data_set = tweets_df(results)

# Remove tweets with duplicate text.
# Tokens starting with 'https:' are stripped first so that tweets differing
# only by their link compare as equal. The cleaned text is built as a whole
# column (instead of cell-by-cell with the removed DataFrame.set_value), which
# also guarantees 'text2' exists when data_set is empty.
data_set["text2"] = [
    ' '.join(word for word in tweet_text.split() if not word.startswith('https:'))
    for tweet_text in data_set["text"]
]
data_set.drop_duplicates('text2', inplace=True)
data_set.reset_index(drop=True, inplace=True)

# Replace the raw text column with the cleaned one under the original name.
data_set.drop('text', axis=1, inplace=True)
data_set.rename(columns={'text2': 'text'}, inplace=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment