Skip to content

Instantly share code, notes, and snippets.

@jessitron
Created October 18, 2012 16:36
Show Gist options
  • Save jessitron/3913081 to your computer and use it in GitHub Desktop.
Save jessitron/3913081 to your computer and use it in GitHub Desktop.
Play with the CodeMash API to analyze gender distribution of speakers
import scala.xml._
// Speaker records from the CodeMash REST API. Download them first with:
//   curl http://rest.codemash.org/api/speakers > codemashSpeakers.xml
val speakers = {
  val speakersDocument = xml.XML.loadFile("codemashSpeakers.xml")
  speakersDocument \ "Speaker"
}
// Session records from the CodeMash REST API. Download them first with:
//   curl http://rest.codemash.org/api/sessions > sessions.xml
val sessions = {
  val sessionsDocument = xml.XML.loadFile("sessions.xml")
  sessionsDocument \ "Session"
}
// Lower-case words whose presence in a biography is treated as a gender hint.
val masculineWords = "he" :: "his" :: Nil
val feminineWords = "she" :: "her" :: "hers" :: "lady" :: Nil
// True when at least one token of the speaker's "Biography" text is equal to
// one of `strings`. The biography is lower-cased before comparison, so the
// candidate words must be given in lower case to be able to match.
def bioContainsAnyWord( strings : Seq[String], speakerNode : Node) = {
  val biography = (speakerNode \ "Biography").text.toLowerCase
  // Splitting at word boundaries keeps whole words as individual tokens
  // (the whitespace/punctuation between them become tokens of their own).
  val tokens = biography.split("\\b")
  tokens.exists(token => strings.contains(token))
}
// Heuristic: the speaker's biography mentions one of the feminineWords.
def looksFeminine(speakerNode : Node) : Boolean =
  bioContainsAnyWord(feminineWords, speakerNode)
// Heuristic: the speaker's biography mentions one of the masculineWords.
def looksMasculine(speakerNode : Node) : Boolean =
  bioContainsAnyWord(masculineWords, speakerNode)
// while looksMasculine doesn't find all the men, looksFeminine finds all the women.
// There are 8 women out of 125 speakers, 6%
// Maps known variants of a first name onto one canonical spelling so that they
// are counted together; any name without a listed variant is returned as-is.
// Matching is exact and case-sensitive.
def reduceToEquivalent(name : String) :String = name match {
  case "Michael" => "Mike"
  case "Johnathan" | "John" | "Jonathan" | "Jonny" => "Jon"
  case "Matthew" => "Matt"
  case "Jim" | "Jimmy" => "James"
  case "David" => "Dave"
  case "Will" | "William" => "Bill"
  case "Daniel" => "Dan"
  case "Jeffrey" | "Geoffrey" | "Geoff" => "Jeff"
  case unchanged => unchanged
}
// Classifies a session's technology label as "soft" or "hard". The label is
// compared case-insensitively. A label that is not listed is reported on
// standard output (using the label as it was passed in) and yields "unknown".
def hardOrSoft(tech : String) = {
  tech.toLowerCase match {
    case "software process" | "communication" | "wordpress" | "business" |
         "design/ux" | "craftsmanship" | "ios" | "testing" =>
      "soft"
    case "mac/iphone" | "other" | "clojure" | "javascript" | "scala" |
         "agile" | "mobile" | "ruby" | "python" | "cloud" |
         "other languages" | "php" | "continuous deployment" |
         "game development" | ".net" | "web" | "android" | "hardware" |
         "java" | "security" | "windows 8" =>
      "hard"
    case _ =>
      println("Failure on " + tech)
      "unknown"
  }
}
// Returns the speaker's first name (the first word of the "Name" element),
// normalised through reduceToEquivalent.
//
// The name is split at word boundaries and the first token that contains a
// letter or digit is used. Picking the token by content rather than by
// position matters: taking the second token (`.tail.head`) only works when
// split("\\b") emits a leading empty string, and since Java 8 a zero-width
// match at the start of the input no longer produces one, so the second token
// would be the separator following the first name instead of the name itself.
//
// A missing or blank name yields reduceToEquivalent("") rather than throwing
// NoSuchElementException.
def getFirstName(speaker : Node) :String = {
  val fullName = (speaker \ "Name").text.trim
  val firstWord = fullName
    .split("\\b")
    .find(token => token.exists(_.isLetterOrDigit))
    .getOrElse("")
  reduceToEquivalent(firstWord)
}
// Counts how often each distinct element occurs in `s` and returns the
// (element, count) pairs ordered from most to least frequent.
def valuesCounted[A]( s : Seq[A]) = {
  val occurrences = s.groupBy(identity).mapValues(_.size)
  // Negating the count turns the ascending sort into a descending one.
  occurrences.toSeq.sortBy { case (_, count) => -count }
}
// Normalised first names of all speakers with their frequency, most common first.
val namesToQuantity = {
  val firstNames = speakers.map(getFirstName)
  valuesCounted(firstNames)
}
// There are 8 Mikes, 8 Johns, 8 women, 7 Jeffs, 5 Bills, and 5 Jims.
// Labels a speaker "Woman" when the biography looks feminine; every other
// speaker is labelled "Man" (looksMasculine is not consulted).
def determineGender (speaker : Node) =
  if (!looksFeminine(speaker)) "Man" else "Woman"
// Lookup table from each speaker's "SpeakerURI" text to the gender label
// produced by determineGender.
val speakerUriToGender = speakers.map { speaker =>
  val speakerUri = (speaker \ "SpeakerURI").text
  (speakerUri, determineGender(speaker))
}.toMap
// The text of the session's "Technology" child element(s).
def tech (session : Node) = {
  val technologyNodes = session \ "Technology"
  technologyNodes.text
}
// Gender label of the session's speaker, found by looking the session's
// "SpeakerURI" text up in speakerUriToGender. The lookup uses the map's apply,
// so a URI that is not a key of that map makes this throw.
def speakerGender (session : Node) = {
  val speakerUri = (session \ "SpeakerURI").text
  speakerUriToGender(speakerUri)
}
// For every technology (keyed by its lower-cased name): how many sessions each
// gender gave, as (gender, count) pairs ordered by descending count.
def techsByGender = {
  val genderAndTech = sessions.map(session => (speakerGender(session), tech(session)))
  genderAndTech
    .groupBy { case (_, technology) => technology.toLowerCase }
    .mapValues(pairs => valuesCounted(pairs.map(_._1)))
}
// Only the technologies that have at least one session given by a woman.
val techsWithWomen = techsByGender.filter { case (_, genderCounts) =>
  genderCounts.exists { case (gender, _) => gender == "Woman" }
}
// For those technologies: the fraction (0.0 - 1.0) of sessions given by women.
val techsWithWomenByPercentage = techsWithWomen.mapValues { genderCounts =>
  val countByGender = genderCounts.toMap
  val womenCount = countByGender.getOrElse("Woman", 0)
  womenCount.toDouble / countByGender.values.sum
}
// women are: 3 in design/UX (with 4 men, so 43%), 2 in communication (100%), 1 in business (33%), 1 in hardware (20%), and 1 in Scala (20%) (go Dianne!)
// Every distinct technology label, with its original capitalisation.
val techSet = sessions.map(tech).toSet
// Labels whose lower-cased form is not a key of techsWithWomen (that map is
// keyed by lower-cased technology names).
val techsWithoutWomen = techSet.filterNot { technology =>
  techsWithWomen.keySet.contains(technology.toLowerCase)
}
// so, both communication talks are by women, but 0 of .NET, JavaScript, Testing, Mac/iPhone, Mobile, Java, Ruby, Windows 8, Other, Continuous Deployment, Game Development, Python, Other Languages, Clojure, Agile, Craftsmanship, Web, Security, Cloud, PHP, Android, Wordpress, iOS
// Session counts per (hard/soft classification, speaker gender) pair, most
// frequent combination first.
val hardOrSoftByGender = {
  val classificationAndGender = sessions.map { session =>
    (hardOrSoft(tech(session)), speakerGender(session))
  }
  valuesCounted(classificationAndGender)
}
// 20% of soft sessions are by women, and under 2% of hard sessions
// Converts absolute counts into each key's share of the overall total,
// expressed as a fraction between 0.0 and 1.0 (not multiplied by 100).
def percentage[A] (m : Map[A,Int]) = {
  val total = m.values.sum
  m.mapValues(count => count.toDouble / total)
}
// Share of each gender label, measured per session given.
val womenBySessions = {
  val genderPerSession = sessions.map(speakerGender)
  percentage(valuesCounted(genderPerSession).toMap)
}
// Share of each gender label, measured per distinct speaker.
val womenBySpeakers = {
  val genderPerSpeaker = speakers.map(determineGender)
  percentage(valuesCounted(genderPerSpeaker).toMap)
}
// women are 6% either way
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment