Created
August 23, 2017 18:15
-
-
Save rachelannelise/a539754df97800dd541a56286b1b2767 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from mage import MageClient | |
from deployment import hack as settings | |
from six import iteritems | |
from redis import Redis | |
from elasticsearch import Elasticsearch | |
import datetime as dt | |
import time | |
import csv | |
import pandas as pd | |
from parselyutils import url_utils | |
from parselyutils.metadata import MetadataFetcher | |
# Prerequisites:
#   - run an ssh tunnel (dash repo and virtualenv; ./_start -e prod)
#   - start a redis server on port 6479: redis-server --port 6479
# If running this under the dash repo, the mage settings dictionary must be
# pulled in manually; it can be found here:
#   https://github.com/Parsely/mage/blob/0.31/deployment/local.py
# This script is based on the following saved mage query:
#   https://beta.parsely.com/thefader.com/experiments/mage_prep/?q=%5B%0A%20%20%20%20%7B%22apikey%22:%20%22thefader.com%22,%0A%20%20%20%20%20%22ts%22:%20%7B%22from%22:%20%222015-01-01%22,%0A%20%20%20%20%20%20%20%20%20%20%20%20%22to%22:%20%222017-07-20%22%7D%0A%20%20%20%20%7D,%0A%20%20%20%20%22canonical_url%22,%0A%20%20%20%20%5B%22views%22%5D,%0A%20%20%20%20%5B(%22views%22,%20-1)%5D,%0A%5D
# Equivalent query spec for this run:
# [
#     {"apikey": "dallasnews.com",
#      "ts": {"from": "2016-09-01",
#             "to": "2017-06-07"}
#     },
#     "canonical_url",
#     ["views","visitors","engaged_minutes","visitors_new","visitors_returning","views_new","views_returning","engaged_minutes_new","engaged_minutes_returning","desktop_views","mobile_views","tablet_views","search_refs","internal_refs","other_refs","direct_refs","social_refs","fb_refs","tw_refs","li_refs","pi_refs","social_shares","fb_shares","tw_shares","li_shares","pi_shares"],
#     [("views", -1)],
#     0,
# ]
def dict_to_df(d):
    """Turn a plain mapping into a one-column DataFrame indexed by key.

    Keys become the index (named 0); values land in column 1. This mirrors
    pd.DataFrame on an iterable of (key, value) pairs followed by set_index.
    """
    frame = pd.DataFrame(d.items())
    return frame.set_index(0)
# Wire up metadata fetching: crawldb Elasticsearch plus a local Redis on
# port 6479 (see setup notes above) handed to the MetadataFetcher
# (presumably as a lookup cache — confirm against MetadataFetcher).
meta_es = Elasticsearch(settings.crawldb_connection)
redis_conn = Redis(host='localhost', port=6479, db=0)
# Start from an empty Redis db so stale entries from a prior run can't leak in.
redis_conn.flushdb()
meta_fetcher = MetadataFetcher(meta_es, redis_conn=redis_conn)
# URLs that turn out to have no crawldb metadata accumulate here.
missing_urls = set()
# settings | |
apikey = 'dallasnews.com' | |
mage = MageClient(settings, ['dallasnews.com']) | |
origin_time = dt.datetime(2016,9,01) | |
end_time = dt.datetime(2017,06,07) | |
final_dict = {} | |
spec_dict = {"apikey": "dallasnews.com", 'ts': {'from': origin_time, 'to': end_time}, 'page_type': 'post'} | |
fields = ["views","visitors","engaged_minutes","visitors_new","visitors_returning","views_new","views_returning","engaged_minutes_new","engaged_minutes_returning","desktop_views","mobile_views","tablet_views","search_refs","internal_refs","other_refs","direct_refs","social_refs","fb_refs","tw_refs","li_refs","pi_refs","social_shares","fb_shares","tw_shares","li_shares","pi_shares"] | |
print 'querying' | |
# results | |
results = mage.search(spec_dict, 'canonical_url', fields, [("views", -1)], 0) | |
print 'results' | |
result_urls = {url_utils.clean_url(result.get('canonical_url')) for result in results} | |
# use the fetch_multi MetadataFetcher function to get metadata information for a set of urls | |
metadata_results = meta_fetcher.fetch_multi(result_urls, apikey, es_namespace='casterisk') | |
missing_urls |= {url for url, exists in iteritems(metadata_results) if not exists} | |
# store metadata to csv | |
#metadata_results.to_csv('dallas_metadata_results.csv') | |
metadata_df = dict_to_df(metadata_results) | |
metadata_df.to_csv('dallas_metadata_results.csv', encoding='utf-8') | |
print 'metadata' | |
transformed_results = {item["canonical_url"]: item for item in results} | |
total_results = transformed_results.update(metadata_results) | |
print 'writing to csv' | |
#write to csv | |
with open('dallasnews_posts_traffic_20160901_20170607_extended.csv', 'w') as csvfile: | |
fieldnames = ('URL','pub_date','title', 'authors','section','tags''pageviews','visitors','engaged_minutes','visitors_new','visitors_returning','views_new','views_returning','engaged_minutes_new','engaged_minutes_returning','desktop_views','mobile_views','tablet_views','search_refs','internal_refs','other_refs','direct_refs','social_refs','fb_refs','tw_refs','li_refs','pi_refs','social_shares','fb_shares','tw_shares','li_shares','pi_shares') # + other field names in the results | |
csvwriter = csv.DictWriter(csvfile, extrasaction='ignore',fieldnames=fieldnames) | |
csvwriter.writeheader() | |
for result in results: | |
try: | |
csvwriter.writerow({ | |
'URL': result['canonical_url'], | |
'pub_date': transformed_results[result['canonical_url']].pub_date_tmsp, | |
'title': transformed_results[result['canonical_url']].title, | |
'authors': transformed_results[result['authors']].authors, | |
'section': transformed_results[result['section']].section, | |
'tags':transformed_results[result['tags']].tags, | |
'pageviews': result['views'], | |
'visitors': result['visitors'], | |
'engaged_minutes': result['engaged_minutes'], | |
'visitors_new': result['visitors_new'], | |
'visitors_returning': result['visitors_returning'], | |
'views_new': result['views_new'], | |
'views_returning': result['views_returning'], | |
'engaged_minutes_new': result['engaged_minutes_new'], | |
'engaged_minutes_returning': result['engaged_minutes_returning'], | |
'desktop_views': result['desktop_views'], | |
'mobile_views': result['mobile_views'], | |
'tablet_views': result['tablet_views'], | |
'search_refs': result['search_refs'], | |
'internal_refs': result['internal_refs'], | |
'other_refs': result['other_refs'], | |
'direct_refs': result['direct_refs'], | |
'social_refs': result['social_refs'], | |
'fb_refs': result['fb_refs'], | |
'tw_refs': result['tw_refs'], | |
'li_refs': result['li_refs'], | |
'pi_refs': result['pi_refs'], | |
'social_shares': result['social_shares'], | |
'fb_shares': result['fb_shares'], | |
'tw_shares': result['tw_shares'], | |
'li_shares': result['li_shares'], | |
'pi_shares': result['pi_shares'] | |
}) | |
except (UnicodeEncodeError, AttributeError, KeyError, ValueError) as e: | |
csvwriter.writerow({ | |
'URL': result['canonical_url'], | |
'pub_date': '', | |
'title': '', | |
'authors': '', | |
'section': '', | |
'tags': '', | |
'pageviews': result['views'], | |
'visitors': result['visitors'], | |
'engaged_minutes': result['engaged_minutes'], | |
'visitors_new': result['visitors_new'], | |
'visitors_returning': result['visitors_returning'], | |
'views_new': result['views_new'], | |
'views_returning': result['views_returning'], | |
'engaged_minutes_new': result['engaged_minutes_new'], | |
'engaged_minutes_returning': result['engaged_minutes_returning'], | |
'desktop_views': result['desktop_views'], | |
'mobile_views': result['mobile_views'], | |
'tablet_views': result['tablet_views'], | |
'search_refs': result['search_refs'], | |
'internal_refs': result['internal_refs'], | |
'other_refs': result['other_refs'], | |
'direct_refs': result['direct_refs'], | |
'social_refs': result['social_refs'], | |
'fb_refs': result['fb_refs'], | |
'tw_refs': result['tw_refs'], | |
'li_refs': result['li_refs'], | |
'pi_refs': result['pi_refs'], | |
'social_shares': result['social_shares'], | |
'fb_shares': result['fb_shares'], | |
'tw_shares': result['tw_shares'], | |
'li_shares': result['li_shares'], | |
'pi_shares': result['pi_shares'] | |
}) | |
print "Wrote to CSV" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment