Created
October 20, 2019 23:50
-
-
Save brockmanmatt/7fdb3610fec452b2170fda1387234f48 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install gdelt #make sure gdelt installed | |
import pandas as pd, numpy as np, matplotlib.pyplot as plt, gdelt, os, datetime, warnings # libraries used by this script (gdelt is the package pip-installed above)
gd = gdelt.gdelt(version=1) # client object used below to pull GDELT files; version=1 is requested
os.makedirs("data",exist_ok=True) # create the "data" cache folder; exist_ok=True means no error if it already exists
# Download the GKG table for each of the 60 days before 2019-10-07 and cache
# every day as its own pickle under data/. Days whose pickle already exists
# are skipped, so the loop can be re-run after an interruption.
end_date = datetime.datetime(2019, 10, 7)  # exclusive upper bound of the window
cur_date = end_date - datetime.timedelta(days=60)  # first day to pull
while cur_date < end_date:
    # Cache names use the unpadded month/day (e.g. data/2019-8-8.pkl); the
    # loading loop further down reads the files back under the same names.
    pkl_path = "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
    if not os.path.exists(pkl_path):  # only pull days we don't have yet
        # strftime yields the same zero-padded "YYYY MM DD" query string the
        # manual padding used to build.
        results = gd.Search([cur_date.strftime("%Y %m %d")], table='gkg', coverage=True, translation=False)
        results.to_pickle(pkl_path)  # save the day's table as a pickle
    cur_date += datetime.timedelta(days=1)  # advance to the next day
# Load every cached daily pickle, keep only the rows whose SOURCES field names
# at least one of the selected publications, and stack the result into `df`.
mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"]  # publications to extract from each pickle
wanted_sources = set(mySources)  # built once instead of once per row
daily_frames = []  # one filtered frame per day; concatenated once after the loop
end_date = datetime.datetime(2019, 10, 7)  # exclusive upper bound of the window
cur_date = end_date - datetime.timedelta(days=60)  # start 60 days prior to 10/7
while cur_date < end_date:
    print(cur_date)  # progress output so a long run shows it is still going
    # Read the day's pickle and replace missing values with "" so that
    # SOURCES can be split as a string on every row.
    tmp = pd.read_pickle("data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)).fillna("")
    # SOURCES is split on ";"; keep the row if any entry is a wanted source.
    tmp = tmp[tmp["SOURCES"].apply(lambda x: not wanted_sources.isdisjoint(x.split(";")))]
    daily_frames.append(tmp)
    cur_date += datetime.timedelta(days=1)  # advance to the next day
# A single concat avoids re-copying all accumulated rows on every iteration.
df = pd.concat(daily_frames)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment