Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Created October 13, 2011 07:56
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save psychemedia/1283684 to your computer and use it in GitHub Desktop.
Save psychemedia/1283684 to your computer and use it in GitHub Desktop.
Generate tag graphs from article searches via New York Times Article API
# Script to generate a co-occurring tag graph from news articles via the New York Times Article API
# This is a quick hack script - just get things done... Needs refactoring/tidying...
import simplejson,urllib,csv,sys
from itertools import combinations
# D3.js json generator from https://bitbucket.org/hagberg/networkx-d3
import d3
import networkx as nx
# A New York Times API key will be required...
APIKEY_NYT=''
# Accept a search phrase as one or more command-line words; the words are
# joined and quoted further down so the API is asked for the exact phrase.
# Slicing sys.argv never raises (with no arguments it is simply []), so the
# "nothing to search for" case has to be tested for explicitly.
terms=sys.argv[1:]
if not terms:
    sys.stderr.write('Usage: %s SEARCH_TERM [SEARCH_TERM ...]\n' % sys.argv[0])
    sys.exit(-1)
# Create a filename stem that captures the search phrase
fnx='nyt-'+'_'.join(terms)
# Single-argument print(...) produces the same output whether print is a
# statement or a function.
print('Using '+fnx)
# Build up the quoted search phrase for the New York Times Article API
term='"'+' '.join(terms)+'"'
enc=urllib.urlencode({'query':term})
print('...'+term+'... '+enc)
# The tag-tag co-occurrence network is held in a NetworkX graph
G = nx.Graph()
# Base URL for the New York Times Article API search call. The encoded query,
# the requested fields and the API key are fixed here; the '&offset=' page
# selector is appended separately for each request.
nyturl = ''.join([
    'http://api.nytimes.com/svc/search/v1/article?',
    enc,
    '&fields=des_facet,title,url&api-key=',
    APIKEY_NYT,
])
# Two Gephi GDF files are produced, each written through its own csv writer:
#   <fnx>.gdf  - bipartite article-to-tag graph: every article that mentions
#                the search phrase is linked to each of its tags
#   <fnx>2.gdf - tag-to-tag graph: tags are linked directly to the tags they
#                co-occur with on the same article
gdf_article_path, gdf_tag_path = fnx + '.gdf', fnx + '2.gdf'
f, f2 = open(gdf_article_path, 'wb'), open(gdf_tag_path, 'wb')
writer, writer2 = csv.writer(f), csv.writer(f2)
# Call the API, paging through the results.
# '&offset=N' selects the N-th page of results; the stop test below assumes
# RESULTS_PER_PAGE articles per page. At most MAX_PAGES pages are requested,
# so dr ends up holding no more than MAX_PAGES*RESULTS_PER_PAGE articles.
MAX_PAGES=5
RESULTS_PER_PAGE=10
dr=[]
page=0
more=True
while page<MAX_PAGES and more:
    nytpurl=nyturl+'&offset='+str(page)
    response=urllib.urlopen(nytpurl)
    try:
        tmpdata=simplejson.load(response)
    finally:
        # Release the connection even when the body is not valid JSON
        response.close()
    page=page+1
    dr.extend(tmpdata['results'])
    # 'page' now counts the pages fetched so far. Stop as soon as they cover
    # every reported hit; '<=' (rather than '<') avoids one extra, empty
    # request when the total is an exact multiple of the page size.
    if tmpdata['total']<=page*RESULTS_PER_PAGE:
        more=False
# Node and edge collections for the two GDF files:
#   nodes  / edges  - article-tag graph (node IDs are article URLs and tags)
#   nodes2 / edges2 - tag-tag graph (node IDs are tags only)
# Each node dict maps a node ID to an (ID, label) pair.
edges=[]
edges2=[]
nodes={}
nodes2={}
# For each article in the results set:
for result in dr:
    # An article with no 'des_facet' field has no tags to graph. Skip it
    # outright so that the tag list of a previously processed article can
    # never be picked up by mistake.
    if 'des_facet' not in result:
        continue
    taglist=result['des_facet']
    url=result['url']
    # Build up a list of unique node IDs, firstly using article IDs (the
    # article URL) for the article-tag graph; the title is the node label
    if url not in nodes:
        nodes[url]=( url,result["title"].encode('utf-8') )
    # Collect the tags associated with the current article
    tags=[]
    for tag in taglist:
        print('%s %s %s' % (url,tag,tag))
        edges.append((url,tag))
        tags.append(tag)
        # Register each tag once as a node of both graphs and of G
        if tag not in nodes:
            nodes[tag]=( tag,tag )
            nodes2[tag]=( tag,tag )
            G.add_node(tag,label=tag)
    # For the tag-tag graph, link every pair of tags found on this article
    for tag_a,tag_b in combinations(tags,2):
        edges2.append((tag_a,tag_b))
        G.add_edge(tag_a,tag_b)
# NOTE: this is the original, clunky recipe for outputting Gephi gdf files.
# As networkx is loaded and already represents the tag-tag graph, it is much
# easier to write files directly from G, for example:
#filepathandname='something Appropriate'
#nx.write_graphml(G, filepathandname+".graphml")
#edgelist (eg for use in R)
#nx.write_edgelist(G, filepathandname+".txt",data=False)
def _utf8_row(values):
    """Return values as a list with every unicode item encoded as UTF-8.

    The Python 2 csv writer cannot write unicode objects that contain
    non-ASCII characters, so text is converted to byte strings first.
    Items that are already byte strings are passed through untouched.
    """
    return [v.encode('utf-8') if isinstance(v,unicode) else v for v in values]

try:
    # Write the article-tag nodelist
    writer.writerow(['nodedef>name VARCHAR','label VARCHAR'])
    for node in nodes:
        writer.writerow(_utf8_row(nodes[node]))
    # Write the tag-tag nodelist (looked up in nodes2, the dict being iterated)
    writer2.writerow(['nodedef>name VARCHAR','label VARCHAR'])
    for node in nodes2:
        writer2.writerow(_utf8_row(nodes2[node]))
    # Write the article-tag edgelist
    writer.writerow(['edgedef>from VARCHAR','to VARCHAR'])
    for edge in edges:
        writer.writerow(_utf8_row(edge))
    # Write the tag-tag edgelist
    writer2.writerow(['edgedef>from VARCHAR','to VARCHAR'])
    for edge in edges2:
        writer2.writerow(_utf8_row(edge))
finally:
    # Tidy up... close both files even if one of the writes failed
    f.close()
    f2.close()
# Hand the tag-tag graph to the D3.js json generator.
# NOTE(review): presumably the 'force' directory must already exist - confirm.
d3.draw_force(G,'force/force.json')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment