Skip to content

Instantly share code, notes, and snippets.

@rlugojr
Forked from josdirksen/DWCounter.scala
Created July 8, 2016 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rlugojr/93a90373d7ccf5edb361fe02e92479ee to your computer and use it in GitHub Desktop.
Save rlugojr/93a90373d7ccf5edb361fe02e92479ee to your computer and use it in GitHub Desktop.
package org.smartjava.dw
import java.io.{File, FileInputStream}
import com.github.mtailor.srtdissector.SrtDissector
import scala.annotation.tailrec
import scala.collection.immutable.ListMap
import scala.io.Source
import scala.util.{Failure, Success}
/**
 * Counts "dirty words" in a subtitle (.srt) file and prints several reports to stdout:
 * total hits for two specific words, a per-word count sorted by frequency, a
 * `word,second` CSV, the hits grouped per time bucket, and a `bucket,count` CSV.
 *
 * The word list is read from the classpath resource `dwList.txt` (one word per line) and
 * the subtitles from `srt/wolfofwallstreet.srt`.
 */
object DWCounter extends App {

  /** A lower-cased word and its approximate start time (the reports below divide it by 1000 to get seconds). */
  case class TimedWord(word: String, time: Double)

  // NOTE: `App` runs this body top to bottom, so every val must be declared before its first use.
  private val wordPattern = "[a-zA-Z]+".r

  /** Resolves a classpath resource, failing with a descriptive error instead of a bare NPE. */
  private def resourceUri(name: String): java.net.URI =
    Option(getClass.getClassLoader.getResource(name))
      .getOrElse(throw new java.io.FileNotFoundException(s"Resource not found on classpath: $name"))
      .toURI

  /** The words to look for, in file order. */
  val dws: List[String] = {
    val source = Source.fromFile(resourceUri("dwList.txt"))
    try source.getLines().toList finally source.close()
  }

  // Set view of `dws` so each membership test is a hash lookup rather than a list scan.
  private val dwSet: Set[String] = dws.toSet

  val subs = new File(resourceUri("srt/wolfofwallstreet.srt"))
  private val subsStream = new FileInputStream(subs)
  val dissector = SrtDissector(subsStream)

  /**
   * Every word of the subtitles with an estimated start time, in chronological order.
   *
   * Approximation: the duration of a subtitle block is spread over its words in
   * proportion to the number of letters in each word.
   */
  val n: List[TimedWord] =
    try {
      dissector match {
        case Success(srt) =>
          srt.flatMap { srtBlock =>
            val totalTime = srtBlock.end - srtBlock.start
            val asWords = toWords(srtBlock.lines).toList
            val totalWordSize = asWords.map(_.length).sum
            // When the block has no words this is not a finite number, but it is never used then.
            val timePerCharacter = totalTime.toDouble / totalWordSize

            @tailrec
            def toTimedWords(words: List[String], currentTime: Double, totalWords: List[TimedWord]): List[TimedWord] =
              words match {
                case head :: tail =>
                  val endTime = currentTime + head.length * timePerCharacter
                  toTimedWords(tail, endTime, TimedWord(head, currentTime) :: totalWords)
                // The accumulator is built by prepending; reverse it to restore spoken order.
                case Nil => totalWords.reverse
              }

            toTimedWords(asWords, srtBlock.start.toDouble, Nil)
          }.toList // force evaluation before the input stream is closed
        case Failure(e) =>
          println(s"Failed to parse srt: ${e.getMessage}.")
          throw e
      }
    } finally subsStream.close()

  /** Largest word start time seen (0 when there are no words). */
  val maxTime = n.foldLeft(0d)((z, el) => if (z < el.time) el.time else z)

  println("Words with 'fuck' in them: " + n.count(_.word.contains("fuck")))
  println("Words with 'bitch' in them: " + n.count(_.word.contains("bitch")))

  /** Number of occurrences of each listed word. */
  val counted = countSwears(n.map(_.word), 0L, Map.empty[String, Int])

  val sorted = ListMap(counted.toSeq.sortBy(_._2).reverse: _*)
  println(sorted)

  /** Only the words that appear in the word list. */
  val filtered = n.filter(timedWord => dwSet.contains(timedWord.word))

  // output as a simple csv: word,second
  filtered.foreach(line => println(s"${line.word},${(line.time / 1000).toLong}"))

  /** Width of one reporting bucket, in seconds. */
  val groupoPerSeconds = 60

  /** Listed words grouped by the index of the bucket they start in. */
  val groupedMinute = filtered.groupBy(w => Math.floor((w.time / 1000) / groupoPerSeconds).toLong)
  println(groupedMinute)

  val countedPerMinute = groupedMinute.map { case (t, list) => t -> list.length }

  // One `bucket,count` line per bucket, including empty ones, up to two buckets past the last word.
  (0L to (maxTime / (1000 * groupoPerSeconds)).toLong + 2).foreach { i =>
    val toPrint = countedPerMinute.getOrElse(i, 0)
    println(s"$i,$toPrint")
  }

  /**
   * Adds the occurrences of listed words found in `words` to `currentCount`.
   *
   * @param words        the words to inspect; any `Seq` implementation is accepted
   * @param currentTime  unused; kept so existing call sites keep compiling
   * @param currentCount counts accumulated so far
   * @return `currentCount` with one added per occurrence of a listed word
   */
  def countSwears(words: Seq[String], currentTime: Long, currentCount: Map[String, Int]): Map[String, Int] =
    words.foldLeft(currentCount) { (acc, word) =>
      if (dwSet.contains(word)) acc + (word -> (acc.getOrElse(word, 0) + 1)) else acc
    }

  /** Splits each line into its runs of ASCII letters, lower-cased. */
  def toWords(lines: Seq[String]): Seq[String] = lines.flatMap { line =>
    wordPattern.findAllIn(line).map(_.toLowerCase)
  }
}
<!DOCTYPE html>
<meta charset="utf-8">
<head>
<style>
#chart {
margin-left: 50px;
}
rect.bordered {
stroke: #E6E6E6;
stroke-width:1px;
rx: 2px;
ry: 2px;
}
text.mono {
font-size: 9pt;
font-family: Consolas, courier;
fill: #aaa;
}
.title {
font-size: 20px;
font-family: sans-serif;
}
</style>
<script src="//d3js.org/d3.v3.min.js"></script>
<script src="js/colorbrewer.min.js"></script>
</head>
<body>
<div style="margin: 0px; column-count: 3">
<div id="chart"></div>
</div>
<script type="text/javascript">
// Layout settings shared by every heatmap drawn by addDiagram.
var margin = { top: 60, right: 0, bottom: 100, left: 50 },
    width = 700 - margin.left - margin.right,
    height = 330 - margin.top - margin.bottom,
    gridSize = Math.floor(width / 34),   // width and height of one heatmap cell
    legendElementWidth = gridSize * 2,
    buckets = 9,                         // feeds the colour-scale domain
    colors = colorbrewer.OrRd[9],        // the 9-colour OrRd palette from colorbrewer
    // Part of the `var` list so it is a declared variable rather than an implicit
    // global, which would be a ReferenceError in strict mode.
    datasets = ["../resources/results/swearsPerMinuteLoebowski.csv"];
/**
 * Appends one heatmap to #chart: a square per CSV row, filled according to the row's
 * `swears` value, plus a colour legend below the grid and a title.
 *
 * @param {string} tsvFile URL of the data file. Despite the name it is loaded with
 *                         d3.csv; the rows are read through their `time` and `swears` columns.
 * @param {string} title   Text drawn at x=0, y=20 of the diagram's svg.
 * @throws rethrows the load error reported by d3.csv, if any.
 */
var addDiagram = function(tsvFile, title) {
  var svg = d3.select("#chart").append("svg")
      .attr("width", width + margin.left + margin.right)
      .attr("height", height + margin.top + margin.bottom);

  var chartGroup = svg.append("g")
      .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

  var textGroup = svg.append("g");

  d3.csv(tsvFile,
    // Row accessor: turn the running `time` index into a grid position, 24 cells per row.
    function(d) {
      var minute = +d.time;
      var row = Math.floor(minute / 24);
      var column = minute % 24;
      return {
        day: row,
        minute: column,
        value: +d.swears
      };
    },
    function(error, data) {
      // Without this a failed load surfaces later as an unrelated TypeError on `data.length`.
      if (error) throw error;

      // we could use max to create a more specific range.
      // var max = d3.max(data, function (d) { return d.value; });

      // NOTE(review): cells are placed 24 per row above, but the height and the legend
      // position assume 30 per row — confirm which one is intended.
      var totalHeight = (Math.ceil(data.length/30)) * (gridSize) + margin.top + margin.bottom;
      svg.attr('height', totalHeight);

      // a colorscale for grouping the colors
      var colorScale = d3.scale.quantile().domain([0, buckets - 1, 30]).range(colors);

      var spm = chartGroup.selectAll(".minute")
          .data(data, function(d) {return d.day+':'+d.minute;});

      // Cells start in the first palette colour and fade to their real colour below.
      spm.enter().append("rect")
          .attr("x", function(d) { return (d.minute - 1) * gridSize; })
          .attr("y", function(d) { return (d.day - 1) * gridSize; })
          .attr("rx", 4)
          .attr("ry", 4)
          .attr("class", "minute bordered")
          .attr("width", gridSize)
          .attr("height", gridSize)
          .style("fill", colors[0]);

      spm.transition().duration(1000)
          .style("fill", function(d) { return colorScale(d.value); });

      spm.exit().remove();

      // One legend entry per colour: 0 plus the lower bound of every further quantile.
      var legend = chartGroup.selectAll(".legend")
          .data([0].concat(colorScale.quantiles()), function(d) { return d; });

      legend.enter().append("g")
          .attr("class", "legend");

      var legendPos = ((Math.ceil(data.length/30) + 2) * (gridSize));

      legend.append("rect")
          .attr("x", function(d, i) { return legendElementWidth * i; })
          .attr("y", legendPos)
          .attr("width", legendElementWidth)
          .attr("height", gridSize / 2)
          .style("fill", function(d, i) { return colors[i]; });

      legend.append("text")
          .attr("class", "mono")
          .text(function(d) { return "≥ " + Math.round(d); })
          .attr("x", function(d, i) { return legendElementWidth * i; })
          .attr("y", legendPos + gridSize);

      legend.exit().remove();

      textGroup.append("text")
          .attr("x", 0)
          .attr("y", 20)
          .attr("class", "title")
          .text(title);
    });
};
// Draw one heatmap per movie, in the order listed: [data file, title].
[
  ["../resources/results/swearsPerMinuteStraightOutOfCompton.csv", "Straight out of Compton"],
  ["../resources/results/deadpool.csv", "Deadpool"],
  ["../resources/results/swearsPerMinuteLoebowski.csv", "The Big Lebowski"],
  ["../resources/results/swaersPerMinuteSwearnet.csv", "Swearnet: the Movie"],
  ["../resources/results/swearsPerTwoMinutesWolf.csv", "Wolf of Wallstreet"],
  ["../resources/results/swearsPerMinuteStarwarsForce.csv", "Star-Wars: The Force Awakens"],
  ["../resources/results/swearsPerMinuteJurrasicWorld.csv", "Jurassic World"]
].forEach(function(entry) {
  addDiagram(entry[0], entry[1]);
});
</script>
</body>
<!DOCTYPE html>
<meta charset="utf-8">
<style>
.bar {
fill: gray;
stroke: black;
}
.bar:hover {
fill: brown;
}
.axis {
font: 10px sans-serif;
}
.axis path,
.axis line {
fill: none;
stroke: #000;
shape-rendering: crispEdges;
}
.x.axis path {
display: none;
}
</style>
<body>
<script src="//d3js.org/d3.v3.min.js"></script>
<script>
/**
 * Appends a bar chart to <body> showing the `swears` value of every CSV row,
 * one bar per `time` value, with the title drawn near the top-right corner.
 *
 * @param {string} source URL of a CSV file with `time` and `swears` columns.
 * @param {string} title  Caption drawn inside the chart.
 * @throws rethrows the load error reported by d3.csv, if any.
 */
function addDiagram(source, title) {
  var margin = {top: 20, right: 20, bottom: 30, left: 40},
      width = 960 - margin.left - margin.right,
      height = 150 - margin.top - margin.bottom;

  // One band per time value, spread over the full width.
  var x = d3.scale.ordinal()
      .rangeBands([0, width], .1);

  var y = d3.scale.linear()
      .range([height, 0]);

  // Only every fifth time value gets a label.
  var xAxis = d3.svg.axis()
      .scale(x)
      .tickFormat(function(d) { return (+d % 5 === 0) ? d : ''; })
      .orient("bottom");

  var yAxis = d3.svg.axis()
      .scale(y)
      .orient("left");

  var svg = d3.select("body").append("svg")
      .attr("width", width + margin.left + margin.right)
      .attr("height", height + margin.top + margin.bottom)
    .append("g")
      .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

  d3.csv(source, type, function(error, data) {
    if (error) throw error;

    x.domain(data.map(function(d) { return +d.time; }));
    // Fixed upper bound, so every chart drawn by this function shares the same y scale.
    y.domain([0, 30]);

    svg.append("text")
        .attr("x", (width - 150))
        .attr("y", 15)
        .attr("class", "axis")
        .style("font-size", "12px")
        .text(title);

    svg.append("g")
        .attr("class", "x axis")
        .attr("transform", "translate(0," + height + ")")
        .call(xAxis);

    svg.append("g")
        .attr("class", "y axis")
        .call(yAxis)
      .append("text")
        .attr("transform", "rotate(-90)")
        .attr("y", 6)
        .attr("dy", ".71em")
        .style("text-anchor", "end")
        .text("Swears / minute");

    svg.selectAll(".bar")
        .data(data)
      .enter().append("rect")
        .attr("class", "bar")
        // Same numeric coercion as the domain above, so lookups always match.
        .attr("x", function(d) { return x(+d.time); })
        .attr("width", x.rangeBand())
        .attr("y", function(d) { return y(d.swears); })
        .attr("height", function(d) { return height - y(d.swears); });
  });
}
/**
 * Row accessor for d3.csv: converts the `swears` field of the row to a number
 * in place and hands the same row object back.
 */
function type(d) {
  var swearCount = Number(d.swears);
  d.swears = swearCount;
  return d;
}
// Draw one bar chart per movie, in the order listed: [data file, title].
[
  ["../resources/results/swearsPerTwoMinutesWolf.csv", "Wolf of Wallstreet"],
  ["../resources/results/swearsPerMinuteStraightOutOfCompton.csv", "Straight out of Compton"],
  ["../resources/results/swaersPerMinuteSwearnet.csv", "Swearnet: the Movie"],
  ["../resources/results/swearsPerMinuteLoebowski.csv", "The Big Lebowski"],
  ["../resources/results/swearsPerMinuteStarwarsForce.csv", "Star-Wars: The Force Awakens"],
  ["../resources/results/swearsPerMinuteJurrasicWorld.csv", "Jurassic World"]
].forEach(function(entry) {
  addDiagram(entry[0], entry[1]);
});
</script>
</body>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment