@Nikhel1
Created March 25, 2020 13:30
from newsplease import NewsPlease
import pandas as pd
import nltk
from tqdm import tnrange
import re
import pickle
import os
# Load the URL list from the CSV; keep one row per URL and drop rows
# with no country code.
month = 'en_interim01_v2'
df = pd.read_csv("{}.csv".format(month))
df = df.drop_duplicates(['to_scrape'])
df = df[pd.notnull(df['ActionGeo_CountryCode'])]
# Reset the index so row i lines up with urls[i] in the download loop below.
df = df.reset_index(drop=True)
urls = df['to_scrape'].unique()
def save_obj(obj, country, name):
    """Helper that pickles an object into the per-country output folder for this month."""
    with open('text_v2/' + str(month) + '/' + str(country) + '/' + name + '.pkl', 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# import multiprocessing
# Reuse `month` as the numeric label for the output directory tree
# (this overrides the CSV name assigned above).
month = 1
# Create one output folder per country code.
for k in df['ActionGeo_CountryCode'].unique():
    if not os.path.exists("text_v2/{}/{}".format(str(month), str(k))):
        os.makedirs("text_v2/{}/{}".format(str(month), str(k)))
def download_url(i):
    """Download article i with news-please and pickle it; return 1 on success, 0 on failure."""
    try:
        country = df['ActionGeo_CountryCode'][i]
        print(country)
        article = NewsPlease.from_url(urls[i])
        save_obj(article, str(country), str(i).zfill(5))
        return 1
    except Exception as ex:
        print(i, ex)
        return 0
for i in range(len(urls)):
    download_url(i)
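
The commented-out multiprocessing import above suggests the downloads were meant to run in parallel. A minimal sketch of that, replacing the sequential loop and assuming download_url as defined above (the pool size of 8 is an arbitrary choice, not from the original gist):

from multiprocessing import Pool

if __name__ == '__main__':
    # Fetch and pickle articles in parallel; each worker returns 1 on
    # success and 0 on failure, so the sum counts successful downloads.
    with Pool(processes=8) as pool:
        results = pool.map(download_url, range(len(urls)))
    print('downloaded', sum(results), 'of', len(urls), 'articles')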