This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tweepy # Python Library for scrapping Twitter Data. | |
import configparser #ConfigParser is a Python class which implements a basic configuration language for Python programs. | |
import pandas as pd #Data Manipulation & Transformation | |
from datetime import date | |
import re #RegEx : Regular expression |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the Twitter API credentials from the "twitter" section of config.ini.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini was not
# loaded. Fail here with a clear message instead of a KeyError further down.
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini not found or not readable")
api_key = config["twitter"]["api_key"]
api_key_secret = config["twitter"]["api_key_secret"]
access_token = config["twitter"]["access_token"]
access_token_secret = config["twitter"]["access_token_secret"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Authenticate the account against the Twitter app using the four credentials
# read from the config file, then build the API instance used for requests.
auth = tweepy.OAuth1UserHandler(
    api_key, api_key_secret, access_token, access_token_secret
)
api = tweepy.API(auth)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get tweets from the API, limited to 1000 items.
# NOTE(review): search_query is not defined in this snippet; it must be
# assigned before this code runs.
tweets = tweepy.Cursor(api.search_tweets, q=search_query, lang="en").items(1000)

# Column headers, in the same order as the values collected per tweet below.
columns = ["User", "Time", "Tweet", "Location", "Verified", "Tweet_Source",
           "Followers", "Retweet_Count", "Tweet ID"]

# A plain loop (rather than a comprehension) keeps the rows gathered so far in
# `data` if iterating the cursor raises part-way through.
data = []
for tweet in tweets:
    data.append([
        tweet.user.screen_name,
        tweet.created_at,
        tweet.text,
        tweet.user.location,
        tweet.user.verified,
        tweet.source,
        tweet.user.followers_count,
        tweet.retweet_count,
        tweet.id,
    ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import glob
import pandas as pd

os.chdir("/mydir")  # Change "/mydir" to your desired working directory.
extension = 'csv'  # Only files with this extension are combined.
output_filename = "Append.csv"

# Collect the input files. The output file is excluded so that re-running the
# script does not merge the previous result back into itself; sorting makes
# the row order independent of the order glob happens to return.
all_filenames = sorted(
    name for name in glob.glob('*.{}'.format(extension))
    if name != output_filename
)

# Combine all files in the list into one DataFrame.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])

# Export to CSV. 'utf-8-sig' writes a UTF-8 byte-order mark at the start of
# the file.
combined_csv.to_csv(output_filename, index=False, encoding='utf-8-sig')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove the unwanted "Unnamed: 0" column from the DataFrame in place.
# The column is named through the `columns` keyword because pandas 2.0 no
# longer accepts the axis as a positional argument (drop(labels, 1)).
Burna_Data_Damini.drop(columns=['Unnamed: 0'], inplace=True)
Burna_Data_Damini.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quick inspection of the data set (presumably a pandas DataFrame loaded
# earlier; it is not defined in this snippet).
Burna_Data_Damini.shape #Get the number of rows and columns
Burna_Data_Damini.info() # Get information about the data
Burna_Data_Damini.isnull().sum() # Count the empty (null) cells
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Burna_Data_Damini.drop_duplicates(inplace=True) # Setting the argument inplace=True removes the duplicates
#from the original DataFrame itself instead of returning a new one
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean the text
# Create a function to clean the tweets
def cleanTxt(text):
    """Return *text* with Twitter-specific noise removed.

    Strips, in this order: @mentions, the '#' symbol (the hashtag word itself
    is kept), the retweet marker "RT", hyperlinks, and colons. Whitespace
    around the removed pieces is left as it is.
    """
    # Handles may contain underscores, so include '_' in the character class.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove the '#' symbol
    # \b keeps the pattern from eating the end of words such as "SMART ".
    text = re.sub(r'\bRT\s+', '', text)  # Remove the retweet marker
    # Links must go before the colon removal below, which would break "https://".
    text = re.sub(r'https?:\/\/\S+', '', text)  # Remove the hyperlinks
    text = re.sub(r':', '', text)  # Remove ':'
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Next we have to remove emoji & Unicode from the Tweet data. | |
def remove_emoji(string): | |
emoji_pattern = re.compile("[" | |
u"\U0001F600-\U0001F64F" # emoticons | |
u"\U0001F300-\U0001F5FF" # symbols & pictographs | |
u"\U0001F680-\U0001F6FF" # transport & map symbols | |
u"\U0001F1E0-\U0001F1FF" # flags (iOS) | |
u"\U00002500-\U00002BEF" # chinese char | |
u"\U00002702-\U000027B0" | |
u"\U00002702-\U000027B0" |
OlderNewer