Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from newsplease import NewsPlease
import pandas as pd
import nltk
from tqdm import tnrange
import re
import pickle
import os
# Base name (without extension) of the input CSV read below.
month = 'en_interim01_v2'
csv_path = "{}.csv".format(month)

# Load the table and keep a single row per value of 'to_scrape'.
df = pd.read_csv(csv_path).drop_duplicates(['to_scrape'])

# Discard rows that have no country code.
has_country = pd.notnull(df['ActionGeo_CountryCode'])
df = df[has_country]

# Distinct 'to_scrape' values, in order of first appearance in df.
urls = df['to_scrape'].unique()
def save_obj(obj, country, name):
    """Pickle ``obj`` to ``text_v2/<month>/<country>/<name>.pkl``.

    ``month`` is the module-level variable, read at call time. The target
    directory is not created here and must already exist.

    Args:
        obj: The object to serialize.
        country: Sub-directory name; converted with ``str()``.
        name: File name without the ``.pkl`` extension; must be a string.
    """
    directory = 'text_v2/' + str(month) + '/' + str(country)
    target = directory + '/' + name + '.pkl'
    with open(target, 'wb+') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
#import multiprocessing
# NOTE: this rebinds the module-level `month` (previously the CSV base name)
# to 1. Every later read of `month`, including the one in save_obj, sees 1,
# so output lands under text_v2/1/<country>/.
month = 1

# Make sure one output directory exists per distinct country code.
for k in df['ActionGeo_CountryCode'].unique():
    target_dir = "text_v2/{}/{}".format(str(month), str(k))
    if os.path.exists(target_dir):
        continue
    os.makedirs(target_dir)
def download_url(i):
    """Download the article at ``urls[i]`` and pickle it under its country.

    The article is saved through ``save_obj`` using the row's country code as
    the sub-directory and the zero-padded index (5 digits) as the file name.

    Args:
        i: Zero-based position into ``urls`` (and into the rows of ``df``).

    Returns:
        1 if the article was fetched and saved, 0 if any step raised.
    """
    try:
        # Positional lookup: `df` keeps its original row labels after the
        # de-duplication and null filtering, so a label lookup with `[i]`
        # would hit the wrong row or raise KeyError. `urls` is the
        # order-preserving unique() of the already de-duplicated
        # 'to_scrape' column, so position i in `urls` is row position i.
        country = df['ActionGeo_CountryCode'].iloc[i]
        print(country)
        article = NewsPlease.from_url(urls[i])
        save_obj(article, str(country), str(i).zfill(5))
        return 1
    except Exception as ex:
        # Best-effort scraping: report the failing index and keep going.
        print(i, ex)
        return 0
# Fetch every URL sequentially; download_url reports failures itself, so the
# return value is not inspected here.
for i, _url in enumerate(urls):
    download_url(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment