# Nikhel1/news_please.py

## news_please.py
from newsplease import NewsPlease
import pandas as pd
import nltk
from tqdm import tnrange
import re
import pickle
import os

# Base name of the input CSV, read from "<month>.csv" in the working directory.
# NOTE: `month` is rebound further down, before any output path is built.
month = 'en_interim01_v2'

# Keep one row per URL, then discard rows that carry no country code.
df = pd.read_csv(month + ".csv").drop_duplicates(subset=['to_scrape'])
df = df[df['ActionGeo_CountryCode'].notna()]

# Distinct URLs to scrape, in order of first appearance.
urls = df['to_scrape'].unique()


def save_obj(obj, country, name):
    """Pickle ``obj`` to ``text_v2/<month>/<country>/<name>.pkl``.

    ``month`` is the module-level global and is looked up at call time.

    Args:
        obj: Any picklable object.
        country: Sub-directory name (converted with ``str``).
        name: File name without the ``.pkl`` extension.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    target_dir = os.path.join('text_v2', str(month), str(country))
    # Create the directory on demand so saving does not depend on a separate
    # setup step having run first.
    os.makedirs(target_dir, exist_ok=True)
    # Plain 'wb' is enough: the file is only written, never read back here.
    with open(os.path.join(target_dir, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


#import multiprocessing

# Output directory component. From here on `month` no longer holds the CSV
# base name; every path built below uses this value.
month = 1

# Make sure one output directory exists per country code.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test.
for k in df['ActionGeo_CountryCode'].unique():
    os.makedirs("text_v2/{}/{}".format(month, k), exist_ok=True)


def download_url(i):
    """Fetch the i-th URL and pickle the scraped article under its country code.

    Args:
        i: Zero-based position into ``urls``.

    Returns:
        1 on success; 0 if anything raised (the error is printed with ``i``).
    """
    try:
        # `df` keeps its original row labels after de-duplication and null
        # filtering, so a label lookup (`series[i]`) can raise KeyError or
        # return a different row than the one `urls[i]` came from. `urls` is
        # built from the already de-duplicated column, so positional access
        # lines the two up.
        country = df['ActionGeo_CountryCode'].iloc[i]
        print(country)
        article = NewsPlease.from_url(urls[i])
        save_obj(article, str(country), str(i).zfill(5))
        return 1
    except Exception as ex:
        # Best-effort scraping: report the failure and let the caller move on.
        print(i, ex)
        return 0

# Scrape every URL in turn; download_url() reports its own failures and
# returns 0/1, so one bad URL does not stop the run.
for i in range(len(urls)):
    download_url(i)