Last active
August 29, 2015 14:01
-
-
Save d136o/ad79dcffded4dd442993 to your computer and use it in GitHub Desktop.
Fetched data from Full Contact API and throws it into SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import codecs | |
import json | |
import os.path | |
import requests | |
import sys | |
import sqlite3 | |
import time | |
import traceback | |
# three columns: | |
# email (with unique constraint) | |
# status | |
# information | |
def main(args=None): | |
FULL_CONTACT_URL = "https://api.fullcontact.com/v2/person.json" | |
FULL_CONTACT_EMAIL_TEMPLATE_URL = "https://api.fullcontact.com/v2/person.json?email={0}" | |
FULL_CONTACT_BATCH_URL = "https://api.fullcontact.com/v2/batch.json" | |
BATCH_SIZE = 20 | |
if args is None: | |
args = sys.argv | |
parser = argparse.ArgumentParser(description='Gets FullContact demographic information for every email in a list of emails') | |
parser.add_argument('sqlite_filename', | |
help='Sqlite db File containing user emails') | |
parser.add_argument('apiKey', | |
help='the full contact apikey that allows us to make api requests') | |
args = parser.parse_args(args[1:]) | |
db_connection = sqlite3.connect(args.sqlite_filename) | |
db_cursor = db_connection.cursor() | |
db_inner_cursor = db_connection.cursor() | |
db_cursor.execute('SELECT email,status,data from full_contact_info WHERE status=0;') | |
rows = db_cursor.fetchmany(BATCH_SIZE) | |
count = 0 | |
while len(rows) > 0: | |
urls_to_emails = { FULL_CONTACT_EMAIL_TEMPLATE_URL.format(r[0]) : r[0] for r in rows } | |
individual_query_urls = list(urls_to_emails.keys()) | |
payload = json.dumps({"requests":individual_query_urls}) | |
fc_response = requests.post(FULL_CONTACT_BATCH_URL, | |
params={'apiKey':args.apiKey}, | |
data=payload) | |
response_json = json.loads(fc_response.text.encode('utf-8')) | |
if response_json['status'] == 200: | |
for query_url, response in response_json['responses'].iteritems(): | |
email = urls_to_emails[query_url] | |
db_inner_cursor.execute('UPDATE full_contact_info SET status=?,data=? WHERE email=?', | |
(str(response['status']).encode('utf-8'), | |
json.dumps(response).encode('utf-8'), | |
email.encode('utf-8'))) | |
else: | |
print 'bad batch response!' | |
db_connection.commit() | |
time.sleep(.1) #throttling requests for a total of 600 per minute max | |
if count % 600*BATCH_SIZE == 0: | |
print 'queried emails: {0}'.format(count) | |
#statuses should be one of 200,202,404 | |
s_list = ('200','202','404',) | |
for s in s_list: | |
total = db_connection.execute('SELECT COUNT(*) from full_contact_info where status=:status ;', | |
{'status':s}).fetchone()[0] | |
print 'total queries with {0}: {1}'.format(s,total) | |
total = db_connection.execute('SELECT COUNT(*) from full_contact_info where status!=0;').fetchone()[0] | |
print 'total nonzero status: {0}'.format(total) | |
count = count + len(rows) | |
rows = db_cursor.fetchmany(BATCH_SIZE) | |
print 'queried for {0} emails'.format(count) | |
def initial_import(db_fname,emails_fname): | |
with open(emails_fname) as f: | |
emails = f.readlines() | |
db = sqlite3.connect(db_fname) | |
cursor = db.cursor() | |
for email in emails: | |
cursor.execute('INSERT INTO full_contact_info (email,status,data) VALUES (?,?,?)', | |
(email.strip(), 0, None)); | |
db.commit() | |
db.close() | |
if __name__=='__main__': | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment