from __future__ import division, print_function
from mage import MageClient
from deployment import hack as settings
from six import iteritems
from redis import Redis
from elasticsearch import Elasticsearch
import datetime as dt
import csv
import pandas as pd
from parselyutils import url_utils
from parselyutils.metadata import MetadataFetcher
# Setup: run the ssh tunnel (from the dash repo and virtualenv: ./_start -e prod)
# and start a local redis server on port 6479: redis-server --port 6479
# If running this under the dash repo, the mage settings dictionary needs to be
# pulled in manually; it can be found here:
# https://github.com/Parsely/mage/blob/0.31/deployment/local.py
# The mage query that the following is based on:
# https://beta.parsely.com/thefader.com/experiments/mage_prep/?q=%5B%0A%20%20%20%20%7B%22apikey%22:%20%22thefader.com%22,%0A%20%20%20%20%20%22ts%22:%20%7B%22from%22:%20%222015-01-01%22,%0A%20%20%20%20%20%20%20%20%20%20%20%20%22to%22:%20%222017-07-20%22%7D%0A%20%20%20%20%7D,%0A%20%20%20%20%22canonical_url%22,%0A%20%20%20%20%5B%22views%22%5D,%0A%20%20%20%20%5B(%22views%22,%20-1)%5D,%0A%5D
# [
# {"apikey": "dallasnews.com",
# "ts": {"from": "2016-09-01",
# "to": "2017-06-07"}
# },
# "canonical_url",
# ["views","visitors","engaged_minutes","visitors_new","visitors_returning","views_new","views_returning","engaged_minutes_new","engaged_minutes_returning","desktop_views","mobile_views","tablet_views","search_refs","internal_refs","other_refs","direct_refs","social_refs","fb_refs","tw_refs","li_refs","pi_refs","social_shares","fb_shares","tw_shares","li_shares","pi_shares"],
# [("views", -1)],
# 0,
# ]
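# For reference, the elements of the list above line up with the positional
# arguments passed to mage.search() below: spec dict, group-by key, field list,
# sort spec, and a trailing 0 copied from the saved query (treated here as the
# result offset -- an assumption; check MageClient before relying on it).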
def dict_to_df(d):
    """Convert a {key: value} dict to a one-column DataFrame indexed by key."""
    df = pd.DataFrame(list(d.items()))
    df.set_index(0, inplace=True)
    return df
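# A minimal sketch of what dict_to_df produces, on hypothetical toy data:
#   dict_to_df({'http://a': 1, 'http://b': 2})
# gives a one-column DataFrame (column label 1) indexed by the dict keys --
# one row per url, which is the shape to_csv() writes out below.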
# instantiate a MetadataFetcher and connect it to crawldb
meta_es = Elasticsearch(settings.crawldb_connection)
redis_conn = Redis(host='localhost', port=6479, db=0)
redis_conn.flushdb()  # start from an empty cache so stale metadata is not reused
meta_fetcher = MetadataFetcher(meta_es, redis_conn=redis_conn)
missing_urls = set()  # urls with no metadata in crawldb
# settings
apikey = 'dallasnews.com'
mage = MageClient(settings, [apikey])
origin_time = dt.datetime(2016, 9, 1)
end_time = dt.datetime(2017, 6, 7)
spec_dict = {'apikey': apikey, 'ts': {'from': origin_time, 'to': end_time}, 'page_type': 'post'}
fields = ["views","visitors","engaged_minutes","visitors_new","visitors_returning","views_new","views_returning","engaged_minutes_new","engaged_minutes_returning","desktop_views","mobile_views","tablet_views","search_refs","internal_refs","other_refs","direct_refs","social_refs","fb_refs","tw_refs","li_refs","pi_refs","social_shares","fb_shares","tw_shares","li_shares","pi_shares"]
print('querying mage')
# run the query: group by canonical_url, pull the fields above, sort by views
# descending (mirrors the saved query linked at the top)
results = mage.search(spec_dict, 'canonical_url', fields, [('views', -1)], 0)
print('query finished')
result_urls = {url_utils.clean_url(result.get('canonical_url')) for result in results}
# use the fetch_multi MetadataFetcher function to get metadata information for a set of urls
metadata_results = meta_fetcher.fetch_multi(result_urls, apikey, es_namespace='casterisk')
missing_urls |= {url for url, exists in iteritems(metadata_results) if not exists}
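# missing_urls is collected above but never persisted; a minimal sketch to save
# it for later inspection (the filename is an assumption, not in the original):
with open('dallasnews_missing_urls.txt', 'w') as f:
    for url in sorted(missing_urls):
        f.write(url.encode('utf-8') + '\n')  # encode: urls may be unicode in py2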
# store metadata to csv
metadata_df = dict_to_df(metadata_results)
metadata_df.to_csv('dallas_metadata_results.csv', encoding='utf-8')
print('metadata written')
transformed_results = {item['canonical_url']: item for item in results}
# merge crawldb metadata over the mage rows; note dict.update() mutates in
# place and returns None, so its result must not be assigned
transformed_results.update(metadata_results)
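# Quick sanity check, assuming MetadataFetcher returns metadata objects while
# raw mage rows stay plain dicts (the attribute access below relies on the same):
with_meta = sum(1 for v in transformed_results.values()
                if v is not None and not isinstance(v, dict))
print('%d of %d urls have crawl metadata' % (with_meta, len(transformed_results)))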
print('writing to csv')
# write combined metrics and metadata to csv
with open('dallasnews_posts_traffic_20160901_20170607_extended.csv', 'w') as csvfile:
    fieldnames = ('URL', 'pub_date', 'title', 'authors', 'section', 'tags', 'pageviews', 'visitors', 'engaged_minutes', 'visitors_new', 'visitors_returning', 'views_new', 'views_returning', 'engaged_minutes_new', 'engaged_minutes_returning', 'desktop_views', 'mobile_views', 'tablet_views', 'search_refs', 'internal_refs', 'other_refs', 'direct_refs', 'social_refs', 'fb_refs', 'tw_refs', 'li_refs', 'pi_refs', 'social_shares', 'fb_shares', 'tw_shares', 'li_shares', 'pi_shares')
    csvwriter = csv.DictWriter(csvfile, extrasaction='ignore', fieldnames=fieldnames)
    csvwriter.writeheader()
    for result in results:
        # start from the raw mage metrics, renaming 'views' to the 'pageviews'
        # column used in the csv header
        row = {field: result[field] for field in fields}
        row['pageviews'] = row.pop('views')
        row['URL'] = result['canonical_url']
        try:
            meta = transformed_results[result['canonical_url']]
            row['pub_date'] = meta.pub_date_tmsp
            row['title'] = meta.title
            row['authors'] = meta.authors
            row['section'] = meta.section
            row['tags'] = meta.tags
            csvwriter.writerow(row)
        except (UnicodeEncodeError, AttributeError, KeyError, ValueError):
            # metadata missing or unencodable for this url: write the row with
            # blank metadata columns instead of dropping it
            row['pub_date'] = ''
            row['title'] = ''
            row['authors'] = ''
            row['section'] = ''
            row['tags'] = ''
            csvwriter.writerow(row)
print('Wrote to CSV')