Created April 10, 2020 15:23
-
-
Save cheruiyot/369e5d99489ef55558ce1d5df2087c64 to your computer and use it in GitHub Desktop.
Tweets Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pip install GetOldTweets3 if you don't already have the package
# !pip install GetOldTweets3

# Imports
import GetOldTweets3 as got
import pandas as pd

# Locate the local Spark installation before pyspark can be imported.
import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Reuse an existing local Spark context if one is already running,
# then wrap it in a SQL session for DataFrame work.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
#sc = pyspark.SparkContext(conf=conf)
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Search configuration: query text, maximum tweets to pull, location filter.
text_query = 'COVID symptoms'
count = 7000
geocode = "Paris"
# Function that pulls tweets based on a general search query and turns to csv file | |
# Parameters: (text query you want to search), (max number of most recent tweets to pull from) | |
def text_query_to_csv(text_query, count, near=geocode):
    """Scrape tweets matching *text_query* and append them to a CSV extract.

    Pulls up to *count* of the most recent tweets located near *near*
    (defaults to the module-level ``geocode``), collects them into a pandas
    DataFrame, converts that to a Spark DataFrame, and appends it as CSV
    under ``First_batch\\Finalextract2.csv``.

    Parameters:
        text_query (str): free-text search query passed to Twitter search.
        count (int): maximum number of most-recent tweets to fetch.
        near (str): location filter for the search (city name).
    """
    # Creation of the query object: search text, result cap, location filter.
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setQuerySearch(text_query)
        .setMaxTweets(count)
        .setNear(near)
    )
    # .setSince(...)/.setUntil(...) can be chained above to bound the dates.

    # Fetch the tweets and keep only the fields we persist.
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    text_tweets = [[tweet.date, tweet.text, tweet.id, tweet.username, tweet.geo]
                   for tweet in tweets]

    # Build a pandas DataFrame, then a Spark DataFrame for writing.
    tweets_df = pd.DataFrame(
        text_tweets,
        columns=['Datetime', 'Text', 'TweetID', 'username', 'geo'],
    )
    tweets_df_spark = spark.createDataFrame(tweets_df)

    # Append the batch as a single CSV part file.  A header row is written
    # so the later spark.read.csv(..., header=True) pass parses the data
    # correctly (previously no header was written, so the first tweet of
    # each part file was swallowed as a header; the original also passed
    # inferSchema to write.save, which is a *read* option and was ignored).
    tweets_df_spark.coalesce(1).write.save(
        path='C:\\Users\\brono\\First_batch\\Finalextract2.csv',
        format='csv',
        mode='append',
        header=True,
    )
# Post-process the accumulated extract: re-read everything appended so far,
# drop duplicate tweets (append-mode runs can overlap), and persist a
# sorted, deduplicated single-file CSV copy.
raw_extract = spark.read.csv(
    "C:\\Users\\brono\\First_batch\\Finalextract2.csv",
    inferSchema=True,
    header=True,
)
Finaldf = raw_extract.dropDuplicates(subset=['TweetID'])
(
    Finaldf.sort("TweetID")
    .coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("C:\\Users\\brono\\First_batch\\Cleaned_data.csv")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.