Skip to content

Instantly share code, notes, and snippets.

@jayjacobs
Last active August 29, 2015 13:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jayjacobs/10610909 to your computer and use it in GitHub Desktop.
Save jayjacobs/10610909 to your computer and use it in GitHub Desktop.
Creating a Video from Marx data at datadrivensecurity.info
# which weeks should we look at?
whichweek <- c(27, 28, 29, 30)
# how many countries to show?
numcountry <- 25 # 20 at first
# read in marx "geo" data available from
# http://datadrivensecurity.info/blog/pages/dds-dataset-collection.html
marx <- read.csv("marx-geo.csv")
# convert datetime to POSIX date/time object
marx$datetime <- strptime(marx$datetime, format='%Y-%m-%d %H:%M:%S')
# drop any weird date formats
marx <- marx[complete.cases(marx[ ,"datetime"]),]
# filter out the weeks.
week <- factor(format(marx$datetime, "%V"))
mx <- marx[week %in% whichweek, ] # mx is subset of marx data
# divide into 5 minute chunks
mx$frame <- as.numeric(paste0(format(mx$datetime, "%m%d%H"),
sprintf("%02d", trunc(as.numeric(format(mx$datetime, "%M"))/5))))
# create the timeline to tick through.
allframes.src <- seq(min(mx$datetime), max(mx$datetime), by=300)
allframes <- as.numeric(paste0(format(allframes.src, "%m%d%H"),
sprintf("%02d", trunc(as.numeric(format(allframes.src, "%M"))/5))))
# allframes now has every possible "frame" we want to show in it.
# looking at the length will tell us how many frames we will write out.
# set up host names in "hosts"
host <- levels(mx$host)
host <- host[nchar(host)>0] # where host known
host <- sort(host, decreasing=T) # sort them
# just do country part from hostname
hostname <- sapply(host, function(x) {
paste(unlist(strsplit(x, '-'))[-1], collapse='-')
}, USE.NAMES=T)
# now just the hosts we know about
mx <- mx[mx$host %in% host, ]
# set up source country names in "cnames"
cname.tbl <- table(mx$country)
cname.tbl <- sort(cname.tbl[nchar(names(cname.tbl))>0], decreasing=T)
cnames <- sort(head(names(cname.tbl), numcountry), decreasing=T)
cnames <- head(cnames[nchar(cnames)>0], numcountry)
# setting a seed to repeat the colors
set.seed(2)
ccolor <- as.character(rainbow(length(cnames)))
names(ccolor) <- cnames
# now filter out where country is known.
mx <- mx[mx$country %in% cnames,]
# add in a frequency counter for later agregation
mx$freq <- 1
# set up dimensions of image
ht <- 1152
wt <- 1920
# set up label positions, space them evenly
cht <- ht/length(cnames)
# cpos has the country positions for y value
cpos <- seq(cht/2, ht, by=cht)
names(cpos) <- cnames
hht <- ht/length(host)
# hpos has the host positions for y value
hpos <- seq(hht/2, ht, by=hht)
names(hpos) <- host
# need to know how to scale the bar plots on the side
max.country <- max(table(mx$country))
max.host <- max(table(mx$host))
# set x position for hosts and countries
hostx <- wt-200
cx <- 200
# gap between bars in barplots
gap <- 5
################
# okay, this is where we set things up to loop on
start <- Sys.time() # timing it.
steps <- 70 # how many frame to move a ball across the screen
outdf <- data.frame() # data.frame of all balls
# data for barplot on the host side
hostbar <- data.frame(name=host, count=0,
xleft=hostx, ybottom=hpos-(hht/2)+5,
xright=hostx, ytop=hpos+(hht/2)-5, row.names=NULL)
# data for barplot on the country side
countbar <- data.frame(name=cnames, count=0,
xleft=cx, ybottom=cpos-(cht/2)+5,
xright=cx, ytop=cpos+(cht/2)-5,
color=ccolor, row.names=NULL)
date.label <- NULL
for(image in seq(length(allframes)+steps)) {
# image is the frame number we are showing.
# set "it" to be the frame ID, or zero if we are done reading in new data
it <- ifelse(image <= length(allframes), allframes[image], 0)
# test if we have any data to read for this frame
if (sum(mx$frame==it)>0) {
# update the date label (to be shown at the top)
date.label <- format(min(mx$datetime[mx$frame==it]), "%A, %B %e, %l%p")
# prep the data
# aggregate, per host and country combination
tmx <- aggregate(freq ~ host + country, data=mx[mx$frame==it, ], FUN=sum)
# foreach host+country combination, create a row in data.frame
newdf <- do.call(rbind, lapply(seq(nrow(tmx)), function(i) {
country <- as.character(tmx$country[i])
host <- as.character(tmx$host[i])
fromy <- cpos[country]
toy <- hpos[host]
data.frame(fromx=cx+(nchar(country)*11), fromy=fromy,
tox=hostx-(nchar(hostname[host])*9), toy=toy+rnorm(1, mean=0, sd=hht/6),
curx=cx+(nchar(country)*11), cury=fromy, col=ccolor[country],
size=tmx$freq[i], time=1,
host=host, country=country, row.names=NULL)
}))
# fromx, fromy: From x,y coordiates
# tox, toy: To x,y coordinates, with slight "rnorm" variation on y-value
# curx, cury: Current x,y coordinates
# size: the number of packets in that 5 minute window
# time: the "step" the ball is in
# host, country: saving for barplot counters
} else {
# else we have no new data, just make empty data.frame
newdf <- data.frame()
}
if(nrow(outdf)>0) { # we have balls in the air
# update the current value based on which step the ball is in.
outdf$curx <- ((outdf$tox - outdf$fromx) * (outdf$time/steps)) + outdf$fromx
outdf$cury <- ((outdf$toy - outdf$fromy) * (outdf$time/steps)) + outdf$fromy
outdf$time <- outdf$time + 1
# rbind the old data with new data
if (nrow(newdf)) {
outdf <- rbind(outdf, newdf)
}
} else { # fresh df
if (nrow(newdf)) {
outdf <- newdf
}
}
# set up plot
png(filename=sprintf("balls/base%04d.png", image), width=1920, height=1080)
# set small margin in inches
par(mai=c(0,0.3,0,0.3))
# open up an empty plot
plot(c(0,0), type="n", col="white", xlim=c(-1, wt), ylim=c(-1,ht+150),
yaxt="n", ann=FALSE, xaxt="n", bty="n", xaxs="i", yaxs="i")
offset <- 50
# add country labels
text(cx+5, cpos+offset, labels=cnames, cex=2, adj=0)
# add host labels
text(hostx-5, hpos+offset, labels=hostname, cex=2, adj=1)
# add the date labels
text(wt/2, ht+offset, labels=date.label, cex=4, adj=c(0.5, 0))
# stick a little URL in the corner
text(hostx, 10, labels="http://datadrivensecurity.info", cex=2, col="slateblue", adj=c(0.5,0), font=3)
# now include all the points (balls) in the plot
with(outdf, points(curx, cury+offset, type="p", pch=16, col=as.character(col), cex=sqrt(size)))
# test to see if we should increase the country barplot (look for time==1)
if (sum(outdf$time==1)>0) {
cbase <- aggregate(size ~ country, data=outdf[outdf$time==1, ], FUN=sum)
for(x in seq(nrow(cbase))) {
thisone <- which(countbar$name==as.character(cbase$country)[x])
countbar$xleft[thisone] <- countbar$xleft[thisone] - 200*(as.numeric(cbase$size[x])/max.country)
}
}
# test to see if we should increase any host barplots (look for "time" at "steps")
if (sum(outdf$time==steps)>0) {
hbase <- aggregate(size ~ host, data=outdf[outdf$time==steps, ], FUN=sum)
for(x in seq(nrow(hbase))) {
thisone <- which(hostbar$name==as.character(hbase$host)[x])
hostbar$xright[thisone] <- hostbar$xright[thisone] + 200*(as.numeric(hbase$size[x])/max.host)
}
outdf <- outdf[outdf$time<steps, ]
}
# now add the two bar plots with a "rect"
with(countbar, rect(xleft, ybottom+offset, xright, ytop+offset, col=as.character(color)))
with(hostbar, rect(xleft, ybottom+offset, xright, ytop+offset, col="steelblue"))
# close off this image
dev.off()
# include something to watch while this is running...
if (image %% 10 == 0) {
#print(outdf)
cat(image, "of", length(allframes)+steps, "\n")
}
}
end <- Sys.time()
print(end-start)
# For Reference, 8134 frames took
# Time difference of 41.80941 mins
# now we create an HD movie (hopefully) with this:
# avconv -f image2 -i balls/base%04d.png -r 25 -b 50000000 -s 1920x1080 -an test5.mp4
# though searching for "stop motion" should yield more options for you.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment