Created
December 22, 2011 12:43
-
-
Save Radagaisus/1510177 to your computer and use it in GitHub Desktop.
Correlation between NYT Headlines and Baby Names (there is none)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
Check the number of mentions of each name | |
from the baby names list in each year's New York Times | |
newspapers and search for a correlation | |
""" | |
import urllib | |
import json | |
import re | |
import time | |
import numpy as np | |
from itertools import imap | |
# scumbag scipy | |
def pearsonr(x, y):
    """Return the Pearson correlation coefficient of two sequences.

    Uses the single-pass "sum of products" formula:

        r = (sum(xy) - sum(x)sum(y)/n)
            / sqrt((sum(x^2) - sum(x)^2/n) * (sum(y^2) - sum(y)^2/n))

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers; assumed to be the same length as ``x``
            (extra items in the longer one are ignored by the pairwise
            product, as in the original implementation).

    Returns:
        The correlation as a float, or 0 when it is undefined: empty
        input, or either sequence having zero variance.
    """
    n = len(x)
    if n == 0:
        # No data points: avoid dividing by zero below.
        return 0

    sum_x = float(sum(x))
    sum_y = float(sum(y))
    sum_x_sq = sum(value * value for value in x)
    sum_y_sq = sum(value * value for value in y)
    product_sum = sum(a * b for a, b in zip(x, y))

    num = product_sum - (sum_x * sum_y / n)
    den_sq = (sum_x_sq - sum_x ** 2 / n) * (sum_y_sq - sum_y ** 2 / n)
    # Rounding can push a near-zero variance product slightly below zero;
    # treat that like the zero-variance case instead of taking the square
    # root of a negative number.
    if den_sq <= 0:
        return 0
    return num / den_sq ** 0.5
def main():
    """Query the NYT article search for every listed name and print the correlation.

    Reads ``names.txt``, where every 50th line (starting with the first)
    is taken as a year header and every other line is expected to hold
    three whitespace-separated fields: ranking, name, and a third field
    that is ignored (the original comment labels the columns
    "1 male female"). For each name, the NYT article search API is asked
    for headlines containing that name within the year, and the reported
    ``total`` is collected next to the name's rank. Finally the Pearson
    correlation between ranks and totals is printed.

    All ``print`` calls use a single parenthesised argument so the output
    is the same under the Python 2 print statement this script targets.
    """
    api_key = "your_api_key"
    name_ranks = []
    results = []

    # Open the list and start iterating
    # 2011:
    # 1 male female
    with open('names.txt', 'r') as names:
        year = 0
        for line_no, raw_line in enumerate(names):
            if line_no % 50 == 0:
                # Header line: remember the year for the entries that follow.
                year = raw_line.rstrip()
                print(year)
                continue

            fields = raw_line.split()
            if len(fields) != 3:
                # Blank or malformed entry: skip it rather than crash on
                # the three-way unpacking below.
                continue
            ranking, name, _unused = fields

            print("Getting number of headlines for the name " + name + " for the year " + year)
            url = ("http://api.nytimes.com/svc/search/v1/article?format=json&query=title%3A" + name
                   + "&begin_date=" + year + "0101&end_date=" + year + "1231&api-key=" + api_key)

            # Call NYT
            print("Request sent to The New York Times")
            data = urllib.urlopen(url)
            try:
                # Convert JSON
                print("Response Received...")
                print("Parsing results...")
                result = json.loads(data.read())
            finally:
                # Always release the connection, even if parsing fails.
                data.close()

            # Parse results
            total = result.get('total')
            print("Rank: " + ranking + ", Results: " + str(total))
            if total is not None:
                # Only keep complete (rank, total) pairs; a missing total
                # would otherwise put None into the sums in pearsonr.
                name_ranks.append(int(ranking))
                results.append(total)

            print("Going to sleep.\n---------------------\n")
            # We have a rate limit of 10 calls per second
            time.sleep(0.2)

    print("correlation: ")
    print(pearsonr(name_ranks, results))
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment