Created April 10, 2020 15:23
-
-
Save cheruiyot/369e5d99489ef55558ce1d5df2087c64 to your computer and use it in GitHub Desktop.
Tweets Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pip install GetOldTweets3 if you don't already have the package
# !pip install GetOldTweets3

# Imports
import GetOldTweets3 as got
import pandas as pd

# Locate the local Spark installation before pyspark can be imported.
import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Reuse an existing local Spark context if one is already running,
# then wrap it in a SQL session for DataFrame work.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
#sc = pyspark.SparkContext(conf=conf)
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Search configuration: query text, maximum tweets to pull, location filter.
text_query = 'COVID symptoms'
count = 7000
geocode = "Paris"
# Function that pulls tweets based on a general search query and turns to csv file | |
# Parameters: (text query you want to search), (max number of most recent tweets to pull from) | |
def text_query_to_csv(text_query, count, near=geocode):
    """Scrape tweets matching *text_query* and append them to a CSV extract.

    Pulls up to *count* of the most recent tweets located near *near*
    (defaults to the module-level ``geocode``), collects them into a pandas
    DataFrame, converts that to a Spark DataFrame, and appends it as CSV
    under ``First_batch\\Finalextract2.csv``.

    Parameters:
        text_query (str): free-text search query passed to Twitter search.
        count (int): maximum number of most-recent tweets to fetch.
        near (str): location filter for the search (city name).
    """
    # Creation of the query object: search text, result cap, location filter.
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setQuerySearch(text_query)
        .setMaxTweets(count)
        .setNear(near)
    )
    # .setSince(...)/.setUntil(...) can be chained above to bound the dates.

    # Fetch the tweets and keep only the fields we persist.
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    text_tweets = [[tweet.date, tweet.text, tweet.id, tweet.username, tweet.geo]
                   for tweet in tweets]

    # Build a pandas DataFrame, then a Spark DataFrame for writing.
    tweets_df = pd.DataFrame(
        text_tweets,
        columns=['Datetime', 'Text', 'TweetID', 'username', 'geo'],
    )
    tweets_df_spark = spark.createDataFrame(tweets_df)

    # Append the batch as a single CSV part file.  A header row is written
    # so the later spark.read.csv(..., header=True) pass parses the data
    # correctly (previously no header was written, so the first tweet of
    # each part file was swallowed as a header; the original also passed
    # inferSchema to write.save, which is a *read* option and was ignored).
    tweets_df_spark.coalesce(1).write.save(
        path='C:\\Users\\brono\\First_batch\\Finalextract2.csv',
        format='csv',
        mode='append',
        header=True,
    )
# Post-process the accumulated extract: re-read everything appended so far,
# drop duplicate tweets (append-mode runs can overlap), and persist a
# sorted, deduplicated single-file CSV copy.
raw_extract = spark.read.csv(
    "C:\\Users\\brono\\First_batch\\Finalextract2.csv",
    inferSchema=True,
    header=True,
)
Finaldf = raw_extract.dropDuplicates(subset=['TweetID'])
(
    Finaldf.sort("TweetID")
    .coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("C:\\Users\\brono\\First_batch\\Cleaned_data.csv")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.