# natyrix/extract_dataframe.py

## extract_dataframe.py
import json
import pandas as pd
from textblob import TextBlob
import re


def read_json(json_file: str) -> tuple:
    """
    json file reader to open and read a line-delimited json file into a list
    Args:
    -----
    json_file: str - path of a file holding one json document per line
    Returns
    -------
    (count, tweets): the number of decoded documents and the list of them
    Raises
    ------
    OSError if the file cannot be opened,
    json.JSONDecodeError if a non-blank line is not valid json
    """

    tweets_data = []
    # The context manager closes the handle even when a line fails to decode.
    with open(json_file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no document and would make json.loads raise.
            if line.strip():
                tweets_data.append(json.loads(line))

    return len(tweets_data), tweets_data


class TweetDfExtractor:
    """
    Parse a list of tweet json objects (dicts) into a pandas dataframe.

    Every ``find_*`` method (and ``is_sensitive``) returns one list holding a
    single field per tweet, in the order of ``tweets_list``; ``get_tweet_df``
    combines those lists into the columns of the dataframe.

    Args:
    -----
    tweets_list: list - tweets already decoded from json

    Return
    ------
    dataframe
    """

    # Patterns used by find_clean_text, compiled once instead of per tweet.
    _DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9#@\s’,_]")
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __init__(self, tweets_list):
        # Iterated once per extracted field, so it has to be re-iterable
        # (a list), not a one-shot generator.
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self) -> list:
        """Return ``tweet['user']['statuses_count']`` for every tweet."""
        return [tweet['user']['statuses_count']
                for tweet in self.tweets_list]

    def find_full_text(self) -> list:
        """Return every tweet's ``full_text`` with commas replaced by spaces."""
        return [tweet['full_text'].replace(',', ' ')
                for tweet in self.tweets_list]

    def find_sentiments(self, text) -> tuple:
        """
        Score every string in ``text`` with TextBlob.

        Args:
        -----
        text: list - strings to analyse

        Returns
        -------
        (polarity, subjectivity, sentiment): three lists aligned with
        ``text``; the sentiment label is "positive" for polarity > 0,
        "negative" for polarity < 0 and "neutral" otherwise.
        The subjectivity list is also stored on ``self.subjectivity``.
        """
        polarity = []
        subjectivity = []
        sentiment = []
        for t in text:
            each_sentiment = TextBlob(t).sentiment
            polarity.append(each_sentiment.polarity)
            subjectivity.append(each_sentiment.subjectivity)
            if each_sentiment.polarity > 0:
                sentiment.append("positive")
            elif each_sentiment.polarity < 0:
                sentiment.append("negative")
            else:
                sentiment.append("neutral")
        # Kept as an attribute because the original implementation exposed it.
        self.subjectivity = subjectivity
        return polarity, self.subjectivity, sentiment

    def find_created_time(self) -> list:
        """Return ``tweet['created_at']`` for every tweet."""
        return [tweet['created_at'] for tweet in self.tweets_list]

    def find_source(self) -> list:
        """Return ``tweet['source']`` for every tweet."""
        return [tweet['source'] for tweet in self.tweets_list]

    def find_screen_name(self) -> list:
        """Return every author's screen name with commas replaced by spaces."""
        return [tweet['user']['screen_name'].replace(',', ' ')
                for tweet in self.tweets_list]

    def find_followers_count(self) -> list:
        """Return ``tweet['user']['followers_count']`` for every tweet."""
        return [tweet['user']['followers_count']
                for tweet in self.tweets_list]

    def find_friends_count(self) -> list:
        """Return ``tweet['user']['friends_count']`` for every tweet."""
        return [tweet['user']['friends_count']
                for tweet in self.tweets_list]

    def is_sensitive(self) -> list:
        """Return ``possibly_sensitive`` per tweet, None where the key is absent."""
        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

    def find_favourite_count(self) -> list:
        """
        Return ``tweet['user']['favourites_count']`` for every tweet.

        NOTE(review): the value is read from the ``user`` object, yet
        get_tweet_df stores it in the ``favorite_count`` column - confirm
        whether the tweet-level ``favorite_count`` field was intended.
        """
        return [tweet['user']['favourites_count']
                for tweet in self.tweets_list]

    def find_retweet_count(self) -> list:
        """Return ``tweet['retweet_count']`` for every tweet."""
        return [tweet['retweet_count'] for tweet in self.tweets_list]

    def _join_entity_values(self, entity, field) -> list:
        """
        Build one string per tweet from ``tweet['entities'][entity]``.

        Each entry's ``field`` value has its commas replaced by spaces and is
        followed by "++++". Entries whose value is "" or " " are skipped; a
        tweet without usable entries yields "".
        """
        joined = []
        for tweet in self.tweets_list:
            values = [item[field].replace(',', ' ')
                      for item in tweet['entities'][entity]
                      # The original test used `or`, which is always true and
                      # therefore never skipped an empty value.
                      if item[field] not in ("", " ")]
            joined.append("".join(value + "++++" for value in values))
        return joined

    def find_hashtags(self) -> list:
        """Return each tweet's hashtag texts as one "++++"-terminated string."""
        return self._join_entity_values('hashtags', 'text')

    def find_mentions(self) -> list:
        """Return each tweet's mentioned screen names as one "++++"-terminated string."""
        return self._join_entity_values('user_mentions', 'screen_name')

    def find_location(self) -> list:
        """
        Return ``tweet['user']['location']`` per tweet.

        A tweet whose ``user`` cannot be indexed (e.g. it is None) contributes
        None, so the result stays aligned with ``tweets_list`` instead of
        collapsing to an empty list for the whole data set.
        """
        location = []
        for tweet in self.tweets_list:
            try:
                location.append(tweet['user']['location'])
            except TypeError:
                location.append(None)
        return location

    def find_lang(self) -> list:
        """Return ``tweet['lang']`` for every tweet."""
        return [tweet['lang'] for tweet in self.tweets_list]

    def find_clean_text(self) -> list:
        """
        Return the output of find_full_text with noise removed.

        Characters other than ASCII letters, digits, ``#``, ``@``, ``’``,
        ``,``, ``_`` and whitespace are dropped, then every run of whitespace
        is collapsed to a single space.
        """
        stripped = [self._DISALLOWED_CHARS.sub("", text)
                    for text in self.find_full_text()]
        return [self._WHITESPACE_RUN.sub(" ", text) for text in stripped]

    def get_tweet_df(self, save=False,
                     save_path='processed_tweet_data_global.csv') -> pd.DataFrame:
        """
        Assemble every extracted field into one dataframe.

        Args:
        -----
        save: bool - when True, also write the dataframe to ``save_path``
            as csv (without the index) and print a confirmation
        save_path: str - destination of the csv file

        Returns
        -------
        dataframe with one row per tweet
        """

        columns = ['created_at', 'source', 'full_text', 'polarity', 'subjectivity', 'sentiment', 'lang', 'favorite_count', 'retweet_count',
                   'original_author', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place']

        # Sentiment is scored on the comma-stripped text, while the
        # 'full_text' column receives the cleaned text.
        polarity, subjectivity, sentiment = self.find_sentiments(
            self.find_full_text())

        # Order must match `columns`.
        data = list(zip(
            self.find_created_time(),
            self.find_source(),
            self.find_clean_text(),
            polarity,
            subjectivity,
            sentiment,
            self.find_lang(),
            self.find_favourite_count(),
            self.find_retweet_count(),
            self.find_screen_name(),
            self.find_followers_count(),
            self.find_friends_count(),
            self.is_sensitive(),
            self.find_hashtags(),
            self.find_mentions(),
            self.find_location(),
        ))
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv(save_path, index=False)
            print('File Successfully Saved.!!!')
        return df


if __name__ == "__main__":
    # Script entry point: load the raw tweets (one json document per line),
    # build the dataframe and write it to csv via get_tweet_df(save=True).
    # TODO: 'original_text', 'clean_text', 'screen_count' and
    # 'place_coord_boundaries' were listed here as required columns but are
    # not produced by get_tweet_df yet.
    _, tweet_list = read_json(
        "./data/global_twitter_data.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(save=True)
	import json
	import pandas as pd
	from textblob import TextBlob
	import re


	def read_json(json_file: str) -> tuple:
	    """
	    json file reader to open and read a line-delimited json file into a list
	    Args:
	    -----
	    json_file: str - path of a file holding one json document per line
	    Returns
	    -------
	    (count, tweets): the number of decoded documents and the list of them
	    Raises
	    ------
	    OSError if the file cannot be opened,
	    json.JSONDecodeError if a non-blank line is not valid json
	    """

	    tweets_data = []
	    # The context manager closes the handle even when a line fails to decode.
	    with open(json_file, 'r', encoding='utf-8') as handle:
	        for line in handle:
	            # Blank lines (e.g. a trailing newline at the end of the file)
	            # carry no document and would make json.loads raise.
	            if line.strip():
	                tweets_data.append(json.loads(line))

	    return len(tweets_data), tweets_data


	class TweetDfExtractor:
	    """
	    Parse a list of tweet json objects (dicts) into a pandas dataframe.

	    Every ``find_*`` method (and ``is_sensitive``) returns one list holding a
	    single field per tweet, in the order of ``tweets_list``; ``get_tweet_df``
	    combines those lists into the columns of the dataframe.

	    Args:
	    -----
	    tweets_list: list - tweets already decoded from json

	    Return
	    ------
	    dataframe
	    """

	    # Patterns used by find_clean_text, compiled once instead of per tweet.
	    _DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9#@\s’,_]")
	    _WHITESPACE_RUN = re.compile(r"\s+")

	    def __init__(self, tweets_list):
	        # Iterated once per extracted field, so it has to be re-iterable
	        # (a list), not a one-shot generator.
	        self.tweets_list = tweets_list

	    # an example function
	    def find_statuses_count(self) -> list:
	        """Return ``tweet['user']['statuses_count']`` for every tweet."""
	        return [tweet['user']['statuses_count']
	                for tweet in self.tweets_list]

	    def find_full_text(self) -> list:
	        """Return every tweet's ``full_text`` with commas replaced by spaces."""
	        return [tweet['full_text'].replace(',', ' ')
	                for tweet in self.tweets_list]

	    def find_sentiments(self, text) -> tuple:
	        """
	        Score every string in ``text`` with TextBlob.

	        Args:
	        -----
	        text: list - strings to analyse

	        Returns
	        -------
	        (polarity, subjectivity, sentiment): three lists aligned with
	        ``text``; the sentiment label is "positive" for polarity > 0,
	        "negative" for polarity < 0 and "neutral" otherwise.
	        The subjectivity list is also stored on ``self.subjectivity``.
	        """
	        polarity = []
	        subjectivity = []
	        sentiment = []
	        for t in text:
	            each_sentiment = TextBlob(t).sentiment
	            polarity.append(each_sentiment.polarity)
	            subjectivity.append(each_sentiment.subjectivity)
	            if each_sentiment.polarity > 0:
	                sentiment.append("positive")
	            elif each_sentiment.polarity < 0:
	                sentiment.append("negative")
	            else:
	                sentiment.append("neutral")
	        # Kept as an attribute because the original implementation exposed it.
	        self.subjectivity = subjectivity
	        return polarity, self.subjectivity, sentiment

	    def find_created_time(self) -> list:
	        """Return ``tweet['created_at']`` for every tweet."""
	        return [tweet['created_at'] for tweet in self.tweets_list]

	    def find_source(self) -> list:
	        """Return ``tweet['source']`` for every tweet."""
	        return [tweet['source'] for tweet in self.tweets_list]

	    def find_screen_name(self) -> list:
	        """Return every author's screen name with commas replaced by spaces."""
	        return [tweet['user']['screen_name'].replace(',', ' ')
	                for tweet in self.tweets_list]

	    def find_followers_count(self) -> list:
	        """Return ``tweet['user']['followers_count']`` for every tweet."""
	        return [tweet['user']['followers_count']
	                for tweet in self.tweets_list]

	    def find_friends_count(self) -> list:
	        """Return ``tweet['user']['friends_count']`` for every tweet."""
	        return [tweet['user']['friends_count']
	                for tweet in self.tweets_list]

	    def is_sensitive(self) -> list:
	        """Return ``possibly_sensitive`` per tweet, None where the key is absent."""
	        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

	    def find_favourite_count(self) -> list:
	        """
	        Return ``tweet['user']['favourites_count']`` for every tweet.

	        NOTE(review): the value is read from the ``user`` object, yet
	        get_tweet_df stores it in the ``favorite_count`` column - confirm
	        whether the tweet-level ``favorite_count`` field was intended.
	        """
	        return [tweet['user']['favourites_count']
	                for tweet in self.tweets_list]

	    def find_retweet_count(self) -> list:
	        """Return ``tweet['retweet_count']`` for every tweet."""
	        return [tweet['retweet_count'] for tweet in self.tweets_list]

	    def _join_entity_values(self, entity, field) -> list:
	        """
	        Build one string per tweet from ``tweet['entities'][entity]``.

	        Each entry's ``field`` value has its commas replaced by spaces and is
	        followed by "++++". Entries whose value is "" or " " are skipped; a
	        tweet without usable entries yields "".
	        """
	        joined = []
	        for tweet in self.tweets_list:
	            values = [item[field].replace(',', ' ')
	                      for item in tweet['entities'][entity]
	                      # The original test used `or`, which is always true and
	                      # therefore never skipped an empty value.
	                      if item[field] not in ("", " ")]
	            joined.append("".join(value + "++++" for value in values))
	        return joined

	    def find_hashtags(self) -> list:
	        """Return each tweet's hashtag texts as one "++++"-terminated string."""
	        return self._join_entity_values('hashtags', 'text')

	    def find_mentions(self) -> list:
	        """Return each tweet's mentioned screen names as one "++++"-terminated string."""
	        return self._join_entity_values('user_mentions', 'screen_name')

	    def find_location(self) -> list:
	        """
	        Return ``tweet['user']['location']`` per tweet.

	        A tweet whose ``user`` cannot be indexed (e.g. it is None) contributes
	        None, so the result stays aligned with ``tweets_list`` instead of
	        collapsing to an empty list for the whole data set.
	        """
	        location = []
	        for tweet in self.tweets_list:
	            try:
	                location.append(tweet['user']['location'])
	            except TypeError:
	                location.append(None)
	        return location

	    def find_lang(self) -> list:
	        """Return ``tweet['lang']`` for every tweet."""
	        return [tweet['lang'] for tweet in self.tweets_list]

	    def find_clean_text(self) -> list:
	        """
	        Return the output of find_full_text with noise removed.

	        Characters other than ASCII letters, digits, ``#``, ``@``, ``’``,
	        ``,``, ``_`` and whitespace are dropped, then every run of whitespace
	        is collapsed to a single space.
	        """
	        stripped = [self._DISALLOWED_CHARS.sub("", text)
	                    for text in self.find_full_text()]
	        return [self._WHITESPACE_RUN.sub(" ", text) for text in stripped]

	    def get_tweet_df(self, save=False,
	                     save_path='processed_tweet_data_global.csv') -> pd.DataFrame:
	        """
	        Assemble every extracted field into one dataframe.

	        Args:
	        -----
	        save: bool - when True, also write the dataframe to ``save_path``
	            as csv (without the index) and print a confirmation
	        save_path: str - destination of the csv file

	        Returns
	        -------
	        dataframe with one row per tweet
	        """

	        columns = ['created_at', 'source', 'full_text', 'polarity', 'subjectivity', 'sentiment', 'lang', 'favorite_count', 'retweet_count',
	                   'original_author', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place']

	        # Sentiment is scored on the comma-stripped text, while the
	        # 'full_text' column receives the cleaned text.
	        polarity, subjectivity, sentiment = self.find_sentiments(
	            self.find_full_text())

	        # Order must match `columns`.
	        data = list(zip(
	            self.find_created_time(),
	            self.find_source(),
	            self.find_clean_text(),
	            polarity,
	            subjectivity,
	            sentiment,
	            self.find_lang(),
	            self.find_favourite_count(),
	            self.find_retweet_count(),
	            self.find_screen_name(),
	            self.find_followers_count(),
	            self.find_friends_count(),
	            self.is_sensitive(),
	            self.find_hashtags(),
	            self.find_mentions(),
	            self.find_location(),
	        ))
	        df = pd.DataFrame(data=data, columns=columns)

	        if save:
	            df.to_csv(save_path, index=False)
	            print('File Successfully Saved.!!!')
	        return df


	if __name__ == "__main__":
	    # Script entry point: load the raw tweets (one json document per line),
	    # build the dataframe and write it to csv via get_tweet_df(save=True).
	    # TODO: 'original_text', 'clean_text', 'screen_count' and
	    # 'place_coord_boundaries' were listed here as required columns but are
	    # not produced by get_tweet_df yet.
	    _, tweet_list = read_json(
	        "./data/global_twitter_data.json")
	    tweet = TweetDfExtractor(tweet_list)
	    tweet_df = tweet.get_tweet_df(save=True)