Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Created October 12, 2011 11:17
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save psychemedia/1280948 to your computer and use it in GitHub Desktop.
Save psychemedia/1280948 to your computer and use it in GitHub Desktop.
Generate tag graphs from searches on Guardian Platform API
# Script to generate a co-occurring tag graph from news articles via Guardian Platform API
# This is a quick hack script - just get things done... Needs refactoring/tidying...
import simplejson,urllib,csv,sys
from itertools import combinations
# A Guardian Platform API key is required; paste yours here before running.
APIKEY = ''


def _utf8(value):
    """Return value as a UTF-8 byte string.

    The Python 2 csv writer only handles byte strings, so unicode text
    decoded from the JSON response has to be encoded before it is written.
    Values that are already byte strings are returned unchanged.
    """
    if isinstance(value, str):
        return value
    return value.encode('utf-8')


def build_graphs(results, verbose=False):
    """Build article-tag and tag-tag graph data from a list of search results.

    results -- list of article dicts; each needs 'id' and 'webTitle' and may
               carry a 'tags' list of dicts that have 'id' and 'webTitle'.
    verbose -- when true, print "articleID tagID tagTitle" for every tag.

    Returns a 4-tuple (nodes, edges, nodes2, edges2):
      nodes  -- unique (id, label) pairs for articles and tags, in first-seen order
      edges  -- one (articleID, tagID) pair per tag on each article
      nodes2 -- unique (id, label) pairs for tags only, in first-seen order
      edges2 -- one (tagID, tagID) pair for every two tags sharing an article
    """
    nodes, edges = [], []
    nodes2, edges2 = [], []
    # Sets give cheap "already listed?" checks while the lists keep the
    # output order stable from run to run.
    seen, seen2 = set(), set()

    for result in results:
        article_id = result['id']
        if article_id not in seen:
            seen.add(article_id)
            nodes.append((article_id, result['webTitle']))

        # Tag IDs attached to this one article, used for the co-occurrence edges.
        tags = []
        # An article without a 'tags' entry simply contributes no edges.
        for tag in result.get('tags', []):
            tag_id = tag['id']
            if verbose:
                print(' '.join([article_id, tag_id, tag['webTitle']]))
            edges.append((article_id, tag_id))
            tags.append(tag_id)
            if tag_id not in seen:
                seen.add(tag_id)
                nodes.append((tag_id, tag['webTitle']))
            if tag_id not in seen2:
                seen2.add(tag_id)
                nodes2.append((tag_id, tag['webTitle']))

        # Every pair of tags on the same article becomes a tag-tag edge.
        edges2.extend(combinations(tags, 2))

    return nodes, edges, nodes2, edges2


def write_gdf(path, nodes, edges):
    """Write a node list and an edge list to path as a Gephi GDF file.

    nodes -- iterable of (name, label) pairs
    edges -- iterable of (from, to) pairs

    All values are encoded as UTF-8 before being written.
    """
    # 'wb' is the mode the Python 2 csv module expects; the with-block
    # closes the file even if a row fails to write.
    with open(path, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['nodedef>name VARCHAR', 'label VARCHAR'])
        for name, label in nodes:
            writer.writerow([_utf8(name), _utf8(label)])
        writer.writerow(['edgedef>from VARCHAR', 'to VARCHAR'])
        for source, target in edges:
            writer.writerow([_utf8(source), _utf8(target)])


def main(argv=None):
    """Search the Guardian Platform API and write two tag graphs as GDF files.

    argv -- the search words; defaults to the command-line arguments.

    The words are joined into one quoted search phrase.  Two files are
    written, named after the words joined with underscores:
      <name>.gdf  -- bipartite article-tag graph
      <name>2.gdf -- tag-tag co-occurrence graph

    Exits with a non-zero status when no search words are given or when the
    API response does not contain a result list.
    """
    terms = sys.argv[1:] if argv is None else list(argv)
    if not terms:
        # Slicing sys.argv never raises, so the empty case is checked explicitly.
        sys.stderr.write('Usage: %s SEARCH TERMS...\n' % sys.argv[0])
        sys.exit(-1)

    # Create a filename stem that captures the search phrase.
    fnx = '_'.join(terms)
    print('Using ' + fnx)

    # Build up the quoted search phrase for the Guardian Platform API.
    term = '"' + ' '.join(terms) + '"'
    enc = urllib.urlencode({'q': term})
    print('...' + term + '... ' + enc)

    # The start date and page size are hardwired and only the first page is
    # fetched.  TODO: look at the total number of results and page through them.
    gurl = ('http://content.guardianapis.com/search?' + enc
            + '&from-date=2011-01-01&page-size=50&format=json'
            + '&show-tags=keyword&ids=type%2Farticle&api-key=' + APIKEY)

    # Call the API before touching the filesystem, so a failed request does
    # not leave empty output files behind.
    conn = urllib.urlopen(gurl)
    try:
        data = simplejson.load(conn)
    finally:
        conn.close()

    response = data.get('response', {})
    if 'results' not in response:
        sys.stderr.write('Unexpected API response: '
                         + simplejson.dumps(data) + '\n')
        sys.exit(1)

    # Reminder that the results aren't paged (yet!): only one page of 50 is used.
    if response.get('total', 0) > 50:
        print("more if you want 'em for " + term)

    nodes, edges, nodes2, edges2 = build_graphs(response['results'],
                                                verbose=True)

    # First file: article-tag graph.  Second file: tag-tag graph.
    write_gdf(fnx + '.gdf', nodes, edges)
    write_gdf(fnx + '2.gdf', nodes2, edges2)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment