Created
October 18, 2012 16:36
-
-
Save jessitron/3913081 to your computer and use it in GitHub Desktop.
Play with the CodeMash API to analyze gender distribution of speakers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.xml._ | |
// Input data, fetched ahead of time from the CodeMash REST API:
//   curl http://rest.codemash.org/api/speakers > codemashSpeakers.xml
//   curl http://rest.codemash.org/api/sessions > sessions.xml
// Each value is the sequence of <Speaker> / <Session> child elements of the document root.
val speakers = XML.loadFile("codemashSpeakers.xml") \ "Speaker"
val sessions = XML.loadFile("sessions.xml") \ "Session"
// Lower-case words whose presence in a biography is taken as a gender hint.
val masculineWords = "he" :: "his" :: Nil
val feminineWords = "she" :: "her" :: "hers" :: "lady" :: Nil
/** Reports whether the speaker's biography mentions any of the given words.
  *
  * The text of the `Biography` child element is lower-cased and cut at word
  * boundaries; the result is true when at least one resulting token is equal
  * to an entry of `strings`.
  *
  * @param strings     words to look for (compared against lower-cased tokens)
  * @param speakerNode the speaker element whose `Biography` child is inspected
  */
def bioContainsAnyWord( strings : Seq[String], speakerNode : Node) = {
  val biography = (speakerNode \ "Biography").text.toLowerCase
  val tokens = biography.split("\\b")
  tokens.exists(token => strings.contains(token))
}
/** True when the speaker's biography contains one of `feminineWords`. */
def looksFeminine(speakerNode : Node) : Boolean =
  bioContainsAnyWord(feminineWords, speakerNode)
/** True when the speaker's biography contains one of `masculineWords`. */
def looksMasculine(speakerNode : Node) : Boolean =
  bioContainsAnyWord(masculineWords, speakerNode)
// While looksMasculine doesn't find all the men, looksFeminine finds all the women.
// There are 8 women out of 125 speakers (6%).
/** Collapses common variants of a first name onto one representative form,
  * so that e.g. "Michael" and "Mike" are counted as the same name.
  *
  * Matching is exact and case-sensitive; a name with no known variant is
  * returned unchanged.
  */
def reduceToEquivalent(name : String) :String = {
  val canonicalForms = Map(
    "Michael"   -> "Mike",
    "Johnathan" -> "Jon",
    "John"      -> "Jon",
    "Jonathan"  -> "Jon",
    "Jonny"     -> "Jon",
    "Matthew"   -> "Matt",
    "Jim"       -> "James",
    "Jimmy"     -> "James",
    "David"     -> "Dave",
    "Will"      -> "Bill",
    "William"   -> "Bill",
    "Daniel"    -> "Dan",
    "Jeffrey"   -> "Jeff",
    "Geoffrey"  -> "Jeff",
    "Geoff"     -> "Jeff"
  )
  canonicalForms.getOrElse(name, name)
}
/** Classifies a session's technology label as "soft" or "hard".
  *
  * The label is compared case-insensitively against two fixed lists. A label
  * found in neither list is reported on standard output and classified as
  * "unknown".
  */
def hardOrSoft(tech : String) = {
  val softTopics = Set(
    "software process", "communication", "wordpress", "business",
    "design/ux", "craftsmanship", "ios", "testing"
  )
  val hardTopics = Set(
    "mac/iphone", "other", "clojure", "javascript", "scala", "agile",
    "mobile", "ruby", "python", "cloud", "other languages", "php",
    "continuous deployment", "game development", ".net", "web",
    "android", "hardware", "java", "security", "windows 8"
  )
  val normalized = tech.toLowerCase
  if (softTopics.contains(normalized)) "soft"
  else if (hardTopics.contains(normalized)) "hard"
  else {
    // Surface labels missing from both lists so they can be added.
    println("Failure on " + tech)
    "unknown"
  }
}
/** Extracts the speaker's first name and normalises it with `reduceToEquivalent`.
  *
  * The first name is the first run of word characters in the trimmed text of
  * the `Name` child element (so "Jean-Luc Picard" yields "Jean").
  *
  * This deliberately avoids `split("\\b").tail.head`: that expression relied on
  * the empty leading token Java 7 produced for a zero-width match at index 0.
  * Since Java 8 that token is no longer emitted, so the old expression returned
  * the separator after the first word instead of the first word itself.
  *
  * @throws NoSuchElementException if the name contains no word characters
  */
def getFirstName(speaker : Node) :String = {
  val fullName = (speaker \ "Name").text.trim
  val firstWord = "\\w+".r.findFirstIn(fullName).getOrElse(
    throw new NoSuchElementException("No first name found in speaker name: '" + fullName + "'")
  )
  reduceToEquivalent(firstWord)
}
/** Counts how often each distinct value occurs in `s`.
  *
  * @return (value, occurrences) pairs ordered by descending occurrence count
  */
def valuesCounted[A]( s : Seq[A]) = {
  val occurrences = s.groupBy(identity).toSeq.map { case (value, group) => (value, group.size) }
  occurrences.sortBy { case (_, count) => -count }
}
// Normalised first names of all speakers, most frequent first.
val namesToQuantity = valuesCounted(speakers.map(speaker => getFirstName(speaker)))
// Result: there are 8 Mikes, 8 Johns, 8 women, 7 Jeffs, 5 Bills, and 5 Jims.
/** Labels a speaker "Woman" when `looksFeminine` matches the biography, and
  * "Man" otherwise; `looksMasculine` is not consulted.
  */
def determineGender (speaker : Node) =
  looksFeminine(speaker) match {
    case true  => "Woman"
    case false => "Man"
  }
// Lookup table from a speaker's SpeakerURI text to the gender label assigned by determineGender.
// Written with explicit method calls: the postfix-operator form (`... text`, `... toMap`)
// needs scala.language.postfixOps on current compilers.
val speakerUriToGender =
  speakers.map(speaker => ((speaker \ "SpeakerURI").text, determineGender(speaker))).toMap
/** Text of the session's `Technology` child element (empty when the element is absent).
  *
  * Uses an explicit `.text` call; the postfix-operator form needs
  * scala.language.postfixOps on current compilers.
  */
def tech (session : Node) = (session \ "Technology").text
/** Gender label of the session's speaker, looked up in `speakerUriToGender`
  * by the text of the session's `SpeakerURI` child element.
  *
  * Uses an explicit `.text` call; the postfix-operator form needs
  * scala.language.postfixOps on current compilers.
  *
  * @throws NoSuchElementException if the URI is not a key of `speakerUriToGender`
  */
def speakerGender (session : Node) = speakerUriToGender((session \ "SpeakerURI").text)
/** For every technology (keyed by its lower-cased label), the speaker genders
  * of its sessions as (gender, session count) pairs, most frequent first.
  */
def techsByGender =
  sessions
    .map(session => (speakerGender(session), tech(session)))
    .groupBy { case (_, technology) => technology.toLowerCase }
    .mapValues(pairs => valuesCounted(pairs.map { case (gender, _) => gender }))
// Technologies (lower-cased) that have at least one session given by a woman.
val techsWithWomen = techsByGender.filter { case (_, genderCounts) =>
  genderCounts.exists { case (gender, _) => gender == "Woman" }
}
// For each of those technologies, the fraction (0.0 to 1.0) of its sessions given by women.
// toDouble is the numeric conversion; asInstanceOf is a cast and only works here by accident
// of static typing. The per-technology lookup map is built once instead of twice.
val techsWithWomenByPercentage = techsWithWomen.mapValues { genderCounts =>
  val countsByGender = genderCounts.toMap
  countsByGender.getOrElse("Woman", 0).toDouble / countsByGender.values.sum
}
// Women give: 3 sessions in design/UX (alongside 4 by men, so 43%), 2 in communication (100%), 1 in business (33%), 1 in hardware (20%), and 1 in Scala (20%) (go Dianne!)
// Distinct technology labels across all sessions, in their original casing.
// Explicit `.toSet` call: the postfix-operator form needs scala.language.postfixOps
// on current compilers.
val techSet = sessions.map(session => tech(session)).toSet
// Technologies with no session given by a woman (techsWithWomen is keyed by lower-cased label).
val techsWithoutWomen = techSet.filterNot(label => techsWithWomen.contains(label.toLowerCase))
// So both communication talks are by women, but women give 0 of the talks in .NET, JavaScript, Testing, Mac/iPhone, Mobile, Java, Ruby, Windows 8, Other, Continuous Deployment, Game Development, Python, Other Languages, Clojure, Agile, Craftsmanship, Web, Security, Cloud, PHP, Android, Wordpress, iOS
// Number of sessions per (hard/soft category, speaker gender) combination, most frequent first.
val hardOrSoftByGender = valuesCounted(
  sessions.map { session => (hardOrSoft(tech(session)), speakerGender(session)) }
)
// 20% of soft sessions are by women, but under 2% of hard sessions are.
/** Converts absolute counts into each key's share of the total.
  *
  * Despite the name, the shares are fractions between 0.0 and 1.0, not values
  * out of 100. An empty map yields an empty map.
  *
  * The result is built strictly with `map` rather than `mapValues`, which
  * returns a lazy view that recomputes the division on every access, and
  * `toDouble` replaces the `asInstanceOf[Double]` cast as the numeric conversion.
  *
  * @param m occurrence count per key
  * @return  for every key of `m`, its count divided by the sum of all counts
  */
def percentage[A] (m : Map[A,Int]) : Map[A, Double] = {
  val sumValues = m.values.sum
  m.map { case (key, count) => key -> count.toDouble / sumValues }
}
// Share of each gender label, measured once per session and once per speaker.
// Both maps hold an entry for every label that occurs, not only "Woman".
val womenBySessions = percentage(valuesCounted(sessions.map(session => speakerGender(session))).toMap)
val womenBySpeakers = percentage(valuesCounted(speakers.map(speaker => determineGender(speaker))).toMap)
// Women are 6% either way.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment