# brockmanmatt/LoadNewsDFFromGDELT.py

## LoadNewsDFFromGDELT.py
!pip install gdelt #make sure gdelt installed
import pandas as pd, numpy as np, matplotlib.pyplot as plt, gdelt, os, datetime, warnings #imports
# Client object through which every GDELT query below is issued.
gd = gdelt.gdelt(version=1)

# Daily results are cached as pickles under ./data; create the folder if missing.
os.makedirs("data", exist_ok=True)

# Walk the 60 days leading up to (and excluding) 2019-10-07, one day per pass.
cur_date = datetime.datetime(2019, 10, 7) - datetime.timedelta(days=60)

while cur_date < datetime.datetime(2019, 10, 7):
  # Cache file for this day; note the name uses the un-padded month/day.
  pkl_path = "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
  if not os.path.exists(pkl_path):  # only query GDELT for days not cached yet
    # The query string, unlike the file name, is built as zero-padded "YYYY MM DD".
    stamp = '%s %s %s' % (
      cur_date.year,
      str(cur_date.month).zfill(2),
      str(cur_date.day).zfill(2),
    )
    results = gd.Search([stamp], table='gkg', coverage=True, translation=False)
    results.to_pickle(pkl_path)
  cur_date += datetime.timedelta(days=1)  # advance to the next day

# Publications to keep. A row is kept when at least one of these appears among
# the ";"-separated entries of its SOURCES field.
mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"]
wanted_sources = frozenset(mySources)  # built once instead of once per row

# Collect the filtered per-day frames and concatenate a single time at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each pass.
daily_frames = []
cur_date = datetime.datetime(2019, 10, 7) - datetime.timedelta(days=60)  # 60 days before 10/7
while cur_date < datetime.datetime(2019, 10, 7):  # up to, not including, 10/7
  print(cur_date)  # progress indicator
  # Read the day's cached pickle and blank out missing values so that
  # SOURCES is always a string that can be split.
  # NOTE: read_pickle executes arbitrary pickle payloads; only point this at
  # files this script wrote itself.
  tmp = pd.read_pickle(
    "data/%s-%s-%s.pkl" % (cur_date.year, cur_date.month, cur_date.day)
  ).fillna("")
  # Keep rows whose SOURCES list shares at least one entry with mySources.
  keep = tmp["SOURCES"].apply(lambda x: not wanted_sources.isdisjoint(x.split(";")))
  tmp = tmp[keep]
  daily_frames.append(tmp)
  cur_date += datetime.timedelta(days=1)  # next day

# All selected articles as one frame; original row indices are preserved.
# Falls back to an empty frame if no day was processed.
df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
	!pip install gdelt #make sure gdelt installed
	import pandas as pd, numpy as np, matplotlib.pyplot as plt, gdelt, os, datetime, warnings #imports
	gd = gdelt.gdelt(version=1) #instantiate object to pull gdelt files

	os.makedirs("data",exist_ok=True) #check if there's a data folder

	cur_date = datetime.datetime(2019,10,7)-datetime.timedelta(days=60) #start pulling from 60 days prior to 10/7

	while cur_date < datetime.datetime(2019,10,7): #pull until 10/7
	if not os.path.exists("data/%s-%s-%s.pkl"%(cur_date.year, cur_date.month, cur_date.day)): #if don't have
	year = cur_date.year #YYYY
	month = str(cur_date.month) if cur_date.month >= 10 else "0"+str(cur_date.month) #need MM
	day = str(cur_date.day) if cur_date.day >= 10 else "0"+str(cur_date.day) #need DD
	results = gd.Search(['%s %s %s'%(year, month, day)],table='gkg',coverage=True, translation=False) #pull
	results.to_pickle("data/%s-%s-%s.pkl"%(cur_date.year, cur_date.month, cur_date.day)) #save as pkl
	cur_date+=datetime.timedelta(days=1) #grab next day

	mySources = ["cnn.com", "washingtonpost.com", "nytimes.com", "foxnews.com"] #publications to extract from each pickle

	df = pd.DataFrame() #empty DF, will add in all the selected articles here
	cur_date = datetime.datetime(2019,10,7)-datetime.timedelta(days=60) #start pulling from 60 days prior to 10/7
	while cur_date < datetime.datetime(2019,10,7): #pull until 10/7
	print(cur_date) #just a verbose thing to let know it's going
	tmp = pd.read_pickle("data/%s-%s-%s.pkl"%(cur_date.year, cur_date.month, cur_date.day)).fillna("") #save as pkl #reads pickle, fills in blanks
	tmp = tmp[tmp["SOURCES"].apply(lambda x: len(set(x.split(";")).intersection(set(mySources))) > 0)] #grabs all relevant
	df = pd.concat([df, tmp]) #concats it into the current dataframe as new rows
	cur_date+=datetime.timedelta(days=1) #grab next day