from newsplease import NewsPlease
import pandas as pd
import nltk
from tqdm import tnrange
import re
import pickle
import os
# Base name (without the .csv extension) of the export to process.
# NOTE: `month` is rebound to a different value further down the file.
month = 'en_interim01_v2'

# Load the export and keep a single row per URL to scrape.
df = pd.read_csv("{}.csv".format(month)).drop_duplicates(subset=['to_scrape'])

# Discard rows that carry no country code.
has_country = df['ActionGeo_CountryCode'].notnull()
df = df[has_country]

# Distinct URLs remaining after the filtering above.
urls = df['to_scrape'].unique()
def save_obj(obj, country, name):
    """Pickle ``obj`` to ``text_v2/<month>/<country>/<name>.pkl``.

    ``month`` is the module-level global, read at call time. The target
    directory is not created here, so it must already exist.
    """
    target = ''.join(
        ['text_v2/', str(month), '/', str(country), '/', name + '.pkl']
    )
    with open(target, 'wb+') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
#import multiprocessing
# NOTE: rebinding `month` changes the value save_obj() reads when it builds
# its output path, so pickles are written under text_v2/1/<country>/.
month = 1

# Ensure an output directory exists for every country code in the data.
# exist_ok=True makes the call idempotent and avoids the race between a
# separate os.path.exists() check and the directory creation.
for k in df['ActionGeo_CountryCode'].unique():
    os.makedirs("text_v2/{}/{}".format(str(month), str(k)), exist_ok=True)
def download_url(i):
    """Download the article at ``urls[i]`` and pickle it under its country.

    Args:
        i: Position of the URL in the module-level ``urls`` array.

    Returns:
        1 if the article was fetched and saved, 0 if any step raised.
        Failures are printed together with the index and never propagated.
    """
    try:
        # `df` keeps its original row labels after de-duplication and null
        # filtering, so a label lookup (`[i]`) can raise KeyError or return
        # another row's value. `urls` is positional, so read the country by
        # position as well to keep the two aligned.
        country = df['ActionGeo_CountryCode'].iloc[i]
        print(country)
        article = NewsPlease.from_url(urls[i])
        # File name is the zero-padded position, e.g. 00042.pkl.
        save_obj(article, str(country), str(i).zfill(5))
        return 1
    except Exception as ex:
        # Best-effort scrape: report the failing index to the caller via the
        # return value instead of raising.
        print(i, ex)
        return 0
for i in range(len(urls)):
