Skip to content

Instantly share code, notes, and snippets.

@brockmanmatt
Created October 20, 2019 23:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brockmanmatt/7fdb3610fec452b2170fda1387234f48 to your computer and use it in GitHub Desktop.
Save brockmanmatt/7fdb3610fec452b2170fda1387234f48 to your computer and use it in GitHub Desktop.
!pip install gdelt #make sure gdelt installed
import pandas as pd, numpy as np, matplotlib.pyplot as plt, gdelt, os, datetime, warnings #imports
# Download one GDELT v1 GKG table per day for the 60 days before 2019-10-07
# and cache each day as a pickle under data/, so re-running this cell only
# fetches the days that are not on disk yet.
gd = gdelt.gdelt(version=1)  # client object used to pull GDELT files
os.makedirs("data", exist_ok=True)  # create the cache folder if it is missing

end_date = datetime.datetime(2019, 10, 7)  # exclusive upper bound of the window
cur_date = end_date - datetime.timedelta(days=60)  # first day to pull
while cur_date < end_date:
    # Cache file names use unpadded month/day (e.g. data/2019-8-8.pkl).
    # Keep this exact format: the loading loop later in the file reads it.
    pkl_path = "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
    if not os.path.exists(pkl_path):  # only pull days we do not have yet
        # The search date is passed as a zero-padded "YYYY MM DD" string.
        results = gd.Search(
            [cur_date.strftime("%Y %m %d")],
            table="gkg",
            coverage=True,
            translation=False,
        )
        results.to_pickle(pkl_path)  # save the day's table as a pickle
    # Advance at loop level (not inside the `if`) so that days which are
    # already cached are skipped instead of looping forever.
    cur_date += datetime.timedelta(days=1)
# Load every cached daily pickle, keep only the rows whose SOURCES field names
# at least one of the selected publications, and stack them into `df`.
mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"]  # publications to extract from each pickle
wanted_sources = set(mySources)  # built once instead of once per row

daily_frames = []  # filtered frame per day; concatenated once after the loop
end_date = datetime.datetime(2019, 10, 7)  # exclusive upper bound of the window
cur_date = end_date - datetime.timedelta(days=60)  # first day to load
while cur_date < end_date:
    print(cur_date)  # progress output so it is visible the loop is running
    # Read the day's cached pickle and replace missing values with "" so the
    # string split below works on every row.
    # NOTE(review): read_pickle should only be used on trusted files; these
    # are expected to be the ones written by the download loop in this file.
    tmp = pd.read_pickle(
        "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
    ).fillna("")
    # SOURCES is split on ";"; keep a row when any entry is a wanted source.
    tmp = tmp[tmp["SOURCES"].apply(lambda x: not wanted_sources.isdisjoint(x.split(";")))]
    daily_frames.append(tmp)
    cur_date += datetime.timedelta(days=1)  # move on to the next day

# Concatenate once: calling pd.concat inside the loop re-copies every row
# accumulated so far on each iteration.
df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment