Skip to content

Instantly share code, notes, and snippets.

Created August 19, 2014 21:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/6d31d0a495f82db5748d to your computer and use it in GitHub Desktop.
Save anonymous/6d31d0a495f82db5748d to your computer and use it in GitHub Desktop.
import itertools
import re

import networkx as nx
import numpy as np
from lxml import etree
def getNamesInAction(action, textNames, nameDict):
    """Return the canonical names of the characters mentioned in a stage direction.

    Candidate spellings are tried longest first, and every match is blanked
    out of a working copy of the text before the next candidate is tried, so
    a short name contained in a longer one (e.g. "ANTONY" inside
    "MARK ANTONY") is not reported a second time.

    Args:
        action: Text of the stage direction.
        textNames: Iterable of character names spelled as they appear in the
            text.
        nameDict: Mapping from each spelling in ``textNames`` to the name to
            report for it.

    Returns:
        List of mapped names, ordered by decreasing length of the matched
        spelling. Empty when no name occurs in ``action``.

    Raises:
        KeyError: If a matched spelling is missing from ``nameDict``.
    """
    remaining = action
    found = []
    for name in sorted(textNames, key=len, reverse=True):
        # An empty or all-whitespace "name" would match almost any text and
        # produce a spurious character, so it is never considered.
        if not name.strip():
            continue
        if name in remaining:
            found.append(nameDict[name])
            # Blank the match so shorter names inside it cannot match again.
            remaining = remaining.replace(name, " ")
    return found
# Point this at the .htm file wherever you have it.
# The full plays are available at:
# http://shakespeare.mit.edu/
filename = "ShakespearGraphs/Antony and Cleopatra Entire Play.htm"
# The context manager closes the file even if reading or parsing raises.
with open(filename) as f:
    html = etree.HTML(f.read())
# get all the characters in the play (the text of every <b> element)
chars = html.xpath("//b/text()")
# map each spelling found in the text to the spelling we'll use for the graph:
# runs of two or more whitespace characters become one space, then title-case
inTextToActual = {}
for c in chars:
    inTextToActual[c] = re.sub(r"\s\s+", " ", c).title()
# the character names as we want them
# (sorted so the node order is the same on every run)
charNames = sorted(set(inTextToActual.values()))
# the character names as they are in the text (dict keys are already unique)
charTextNames = sorted(inTextToActual)
# the italics elements contain all of the entrances and exits (and other things)
# go through the text and get all the actions, and the speaker that each
# action is associated with; `action` and `speaker` are parallel lists
italics = html.xpath("//i")
speaker = []
action = []
for i in italics:
    ac = i.findtext(".")
    try:
        # the general structure of the play is:
        # <a>
        #   <b> <name of speaker> </b>
        # </a>
        # <blockquote>
        #   ....
        #   <p>
        #     <i> <the action that happens> </i>
        #   </p>
        # </blockquote>
        # so we back up from the italics to get the name of the speaker
        # this lets us associate an "exit" to a character name
        sp = i.getparent().getparent().getprevious().xpath("b/text()")[0]
    except (AttributeError, IndexError):
        # AttributeError: a parent or previous sibling is missing (None)
        # IndexError: the previous sibling has no <b> text
        # either way there is no speaker for this action; use a placeholder
        sp = " "
    speaker.append(sp)
    action.append(ac)
# make a list of the groups of characters that are on stage together at the same time
# start with a single empty group (nobody on stage)
charactersTogether = [[]]
# index of the most recent group; always len(charactersTogether) - 1
ind = 0
# compiled once; the first stage keyword found anywhere in the action decides
# how the action is handled
stageWord = re.compile("(Enter|enter|Exeunt|Exit|exeunt|exit)")
# loop through the actions; whenever characters are added or removed, a new
# group derived from the most recent one is appended
for ac, sp in zip(action, speaker):
    firstWord = stageWord.search(ac)
    if firstWord is None:
        # not an entrance or exit
        continue
    firstWord = firstWord.group().lower()
    # the characters named in the stage direction (may be empty)
    named = getNamesInAction(ac, charTextNames, inTextToActual)
    if firstWord == "enter":
        if not named:
            # nobody we know entered, so the group does not change
            continue
        onStage = charactersTogether[ind] + named
    elif firstWord == "exit":
        # then a character exits
        # it could be the speaking character, or not
        leavers = named
        if not leavers:
            # nobody is named, so the speaker is the one leaving; the
            # placeholder speaker " " is not a known name and is ignored
            spName = inTextToActual.get(sp)
            leavers = [] if spName is None else [spName]
        onStage = [c for c in charactersTogether[ind] if c not in leavers]
    elif firstWord == "exeunt":
        # figure out if everyone leaves or just some characters
        if ac.lower().strip() == "exeunt":
            # then everyone leaves
            onStage = []
        elif "all but" in ac.lower():
            # then everyone but the named characters leaves
            onStage = named
        else:
            # then the named characters leave
            onStage = [c for c in charactersTogether[ind] if c not in named]
    else:
        # not reachable with the pattern above; kept from the original
        print("")
        continue
    # drop repeated names, keeping the first appearance of each, so a
    # character is listed at most once per group
    uniqueOnStage = []
    for c in onStage:
        if c not in uniqueOnStage:
            uniqueOnStage.append(c)
    charactersTogether.append(uniqueOnStage)
    ind += 1
# with the characters listed, make the adjacency matrix
nChars = len(charNames)
adj = np.zeros((nChars, nChars))
# row/column of each character, built once instead of scanning charNames
# for every pair
nameToIndex = {n: i for i, n in enumerate(charNames)}
# loop over all ordered pairs of characters in each stage group and
# make their connection in the adjacency matrix 1
# there will be tons of overlap here (the same pair is set many times)
for sitch in charactersTogether:
    for a, b in itertools.product(sitch, sitch):
        if a == b:
            # no self-connections
            continue
        adj[nameToIndex[a], nameToIndex[b]] = 1
# make the graph and save it as a .gexf for Gephi
G = nx.Graph(adj)
# the nodes come out numbered by matrix index; give them the character names
G = nx.relabel_nodes(G, dict(enumerate(charNames)))
# filename[:-4] drops the last four characters (".htm" for the file above)
nx.write_gexf(G, "%s.gexf" % filename[:-4])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment