Skip to content

Instantly share code, notes, and snippets.

@ogibayashi
Created July 11, 2014 06:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ogibayashi/5cd9937fe134def925b2 to your computer and use it in GitHub Desktop.
Save ogibayashi/5cd9937fe134def925b2 to your computer and use it in GitHub Desktop.
##
## Hadoopのジョブ実行時間を可視化する
##
## 入力ファイルは、以下のように<attempt ID>,<start>,<end>のフォーマットである必要がある
##
## attempt_201405161748_0029_m_000074_0,1400232465,1400232468
## attempt_201405161748_0029_m_000072_0,1400232468,1400232474
## attempt_201405161748_0029_m_000047_0,1400232468,1400232477
## attempt_201405161748_0029_m_000048_0,1400232468,1400232482
## attempt_201405161748_0029_m_000061_0,1400232468,1400232483
library(ggplot2)
library(reshape2)
# Constants
#
# Number of task attempts drawn on a single page (one chart) of the output
numAttemptsPerPage <- 100
# Build the chart for one page of task attempts.
#
# Each attempt is drawn as a horizontal line from its start time to its end
# time, coloured by task type ("map" when the attempt ID contains "_m_",
# "reduce" otherwise).
#
# Args:
#   df:       data frame with the columns `attempt` (attempt ID), `start` and
#             `end` (numeric; converted as seconds since 1970-01-01).
#   x_limits: limits handed to scale_x_datetime(), so that every page shares
#             the same time axis.
#
# Returns:
#   A ggplot object; the caller is responsible for print()ing it.
generateTaskChart <- function(df, x_limits) {
  # Attempt IDs ordered by start time, then reversed: the earliest attempt is
  # listed last in the discrete y-axis limits.
  attempts <- rev(unique(df[order(df$start), ]$attempt))

  # Long format: one row per (attempt, start|end) pair, so that geom_line()
  # can connect the two time points of every attempt.
  tasks <- melt(df, id.vars = "attempt")
  tasks$value <- as.POSIXct(tasks$value, origin = as.Date("1970-01-01"))
  tasks$type <- ifelse(grepl("_m_", tasks$attempt), "map", "reduce")

  # Columns are referenced by bare name inside aes(): aes() is evaluated
  # against the plot data, so `tasks$type` fails with "object 'tasks' not
  # found", whereas `type` resolves to the column. `group` is set explicitly
  # so that each attempt stays its own line regardless of the colour mapping.
  ggplot(tasks) +
    geom_line(aes(value, attempt, colour = type, group = attempt)) +
    scale_y_discrete(limits = attempts) +
    scale_x_datetime(limits = x_limits) +
    theme(
      text = element_text(size = 6),
      axis.text.x = element_text(angle = 90),
      legend.position = "right"
    )
}
# ---- Script entry point ----
# Usage: Rscript <this script> <input.csv> [output.pdf]
arg <- commandArgs(trailingOnly = TRUE)
if (length(arg) < 1L || is.na(arg[1]) || !nzchar(arg[1])) {
  stop("Usage: Rscript <script> <input.csv> [output.pdf]", call. = FALSE)
}
filename <- arg[1]

# Output file name (optional second argument)
outputFileName <- if (is.na(arg[2])) "Hadoop_task_chart.pdf" else arg[2]

# Input file format: <task attempt ID>,<start time>,<end time> (no header row)
df <- read.csv(filename, stringsAsFactors = FALSE, header = FALSE)
if (ncol(df) != 3L) {
  stop(
    "Expected 3 columns (<attempt ID>,<start>,<end>) in ", filename,
    " but found ", ncol(df),
    call. = FALSE
  )
}
names(df) <- c("attempt", "start", "end")
df <- df[order(df$start), ]

# Common time axis for every page: earliest start to latest end
epoch <- as.Date("1970-01-01")
limits <- c(
  min(as.POSIXct(df$start, origin = epoch)),
  max(as.POSIXct(df$end, origin = epoch))
)
print(limits)

# Generate one chart per chunk of numAttemptsPerPage task attempts.
# ceiling() gives exactly the number of pages needed; round() could produce
# an extra page whose row range ran backwards past the end of the data
# (e.g. 100 or 160 rows), or rely on the last page index by accident.
numPages <- ceiling(nrow(df) / numAttemptsPerPage)
pdf(outputFileName)
for (page in seq_len(numPages)) {
  row_start <- (page - 1) * numAttemptsPerPage + 1
  row_end <- min(page * numAttemptsPerPage, nrow(df))
  df_sub <- df[row_start:row_end, ]
  print(generateTaskChart(df_sub, limits))
}
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment