@cheruiyot · Created April 10, 2020
Tweets Scraper
# Pip install GetOldTweets3 if you don't already have the package
# !pip install GetOldTweets3
# Imports
import GetOldTweets3 as got
import pandas as pd
import findspark
findspark.init()
findspark.find()
import pyspark
# Always import these for any PySpark analytics
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
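# Equivalent, more idiomatic session setup (a sketch; either form yields the same session):
# spark = SparkSession.builder.appName('appName').master('local').getOrCreate()
# sc = spark.sparkContext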
text_query = 'COVID symptoms'
count = 7000
geocode = "Paris"  # passed to setNear() below to restrict tweets by location
# Function that pulls tweets matching a general search query and appends them to a CSV file
# Parameters: text_query (search string), count (max number of most recent tweets to pull)
# Also uses the module-level 'geocode' variable to restrict results by location
def text_query_to_csv(text_query, count):
    # Create the query object
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(text_query).setMaxTweets(count).setNear(geocode)
    # Optionally restrict by date window:
    # tweetCriteria = tweetCriteria.setSince(newest_date1).setUntil(newest_date1)
    # Pull all matching tweets into a list
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    # Keep only the chosen fields from each tweet
    text_tweets = [[tweet.date, tweet.text, tweet.id, tweet.username, tweet.geo] for tweet in tweets]
    # Build a pandas DataFrame from the tweet data
    tweets_df = pd.DataFrame(text_tweets, columns=['Datetime', 'Text', 'TweetID', 'username', 'geo'])
    # Convert to a Spark DataFrame
    tweets_df_spark = spark.createDataFrame(tweets_df)
    # Append this batch to CSV; note 'header' is a write option, while 'inferSchema' only applies on read
    tweets_df_spark.coalesce(1).write.option('header', 'true').mode('append').csv('C:\\Users\\brono\\First_batch\\Finalextract2.csv')
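# Run the extraction with the query parameters defined above (call assumed;
# the function must run before the read-back below):
text_query_to_csv(text_query, count)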
# Read the accumulated extract back, drop duplicate tweets by ID, and write a cleaned copy
Finaldf = spark.read.csv("C:\\Users\\brono\\First_batch\\Finalextract2.csv", inferSchema=True, header=True)
Finaldf = Finaldf.dropDuplicates(subset=['TweetID'])
Finaldf.sort("TweetID").coalesce(1).write.mode("overwrite").option("header", "true").csv("C:\\Users\\brono\\First_batch\\Cleaned_data.csv")
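# Sanity check (a sketch, not part of the pipeline above): read the cleaned
# output back and confirm the row count and a few sample rows.
cleaned = spark.read.csv("C:\\Users\\brono\\First_batch\\Cleaned_data.csv", inferSchema=True, header=True)
print("Rows after dedupe:", cleaned.count())
cleaned.show(5, truncate=False)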