kwbr/ g1_log_parser.sh

## g1_log_parser.sh
#!/usr/bin/env awk -f

# Based on the idea from https://blogs.oracle.com/taylor22/entry/using_r_to_analyze_g1gc, the
# script is updated to use the format of the gc logs as received with the parameters:
# -XX:+UseThreadPriorities
# -XX:ThreadPriorityPolicy=42
# -Xms1995M -Xmx1995M
# -Xss256k -XX:StringTableSize=1000003
# -XX:SurvivorRatio=8
# -XX:MaxTenuringThreshold=1
# -XX:+UseTLAB
# -XX:+ResizeTLAB
# -XX:CompileCommandFile=/hotspot_compiler
# -XX:MaxGCPauseMillis=500
# -XX:InitiatingHeapOccupancyPercent=25
# -XX:+UseG1GC
# -XX:G1RSetUpdatingPauseTimePercent=5
# -XX:+ParallelRefProcEnabled
# -XX:ParallelGCThreads=2
# -XX:ConcGCThreads=2
# -XX:+PrintGCDetails
# -XX:+PrintGCDateStamps
# -XX:+PrintHeapAtGC
# -XX:+PrintTenuringDistribution
# -XX:+PrintGCApplicationStoppedTime
# -XX:+PrintPromotionFailure
# -XX:PrintFLSStatistics=1
# -Xloggc:/<valid path>/gc.log
# -XX:+UseGCLogFileRotation
# -XX:NumberOfGCLogFiles=10
# -XX:GCLogFileSize=10M

function parse_size(val)
{

   ######################
   # Standardizes all the values of memory sizes to GB, gc log will have the scalar value and the
   # measurement unit ("B", "K", "M") referring to Bytes
   # @param val - string with the value as received on the log. (i.e "9437184K")
   # @returns float with the size in GB
   ######################

   if ( split(val, b, "B") > 1 ) {result=b[1]/(1024.0^3);}
   if ( split(val, b, "K") > 1 ) {result=b[1]/(1024.0^2);}
   if ( split(val, b, "M") > 1 ) {result=b[1]/1024.0;}
   if ( split(val, b, "G") > 1 ) {result=b[1];}
   return result;
}

BEGIN {
   printf("Timestamp SecondsSinceLaunch IncrementalCount UserTime SysTime RealTime EdenBeforeSize EdenAfterSize SurvivorBefore SurvivorAfter HeapBeforeSize HeapAfterSize ParallelTime ERSAvg ERSSum URSAvg URSSum SRSAvg SRSSum CRTAvg CRTSum GCObjCopyAvg GCObjCopySum GCTerminationAvg GCTerminationSum GCOtherAvg GCOtherSum GCTotalAvg GCTotalSum\n")
}

######################
# Save count data from lines that are at the start of each G1GC trace.
# Each trace starts out like this:
#   {Heap before GC invocations=14 (full 0):
#    garbage-first heap   total 9437184K, used 325496K [0xfffffffd00000000, 0xffffffff40000000, 0xffffffff40000000)
######################
/{Heap.*full/{
   gsub ( "\\)" , "" );
   nf=split($0,a,"=");
   split(a[2],b," ");
   getline;
   if ( match($0, "first") ) {
  G1GC=1;
  IncrementalCount=b[1];
  FullCount=substr( b[3], 1, length(b[3])-1 );
   } else {
       G1GC=0;
   }
}

######################
# Retrieves information of the actions and where the GC is spending more time.
# It considers the following format
# [Parallel Time: 50.6 ms, GC Workers: 2]
#   [GC Worker Start (ms): Min: 2645.0, Avg: 2645.0, Max: 2645.0, Diff: 0.0]
#   [Ext Root Scanning (ms): Min: 2.3, Avg: 2.5, Max: 2.6, Diff: 0.3, Sum: 4.9]
#   [Update RS (ms): Min: 4.9, Avg: 4.9, Max: 5.0, Diff: 0.1, Sum: 9.8]
#      [Processed Buffers: Min: 2, Avg: 2.5, Max: 3, Diff: 1, Sum: 5]
#   [Scan RS (ms): Min: 6.8, Avg: 7.0, Max: 7.2, Diff: 0.3, Sum: 14.0]
#   [Code Root Scanning (ms): Min: 0.0, Avg: 0.7, Max: 1.3, Diff: 1.3, Sum: 1.4]
#   [Object Copy (ms): Min: 34.9, Avg: 35.5, Max: 36.1, Diff: 1.2, Sum: 70.9]
#   [Termination (ms): Min: 0.0, Avg: 0.0, Max: 0.0, Diff: 0.0, Sum: 0.0]
#      [Termination Attempts: Min: 1, Avg: 1.0, Max: 1, Diff: 0, Sum: 2]
#   [GC Worker Other (ms): Min: 0.0, Avg: 0.0, Max: 0.0, Diff: 0.0, Sum: 0.0]
#   [GC Worker Total (ms): Min: 50.5, Avg: 50.5, Max: 50.5, Diff: 0.0, Sum: 101.0]
#   [GC Worker End (ms): Min: 2695.5, Avg: 2695.5, Max: 2695.5, Diff: 0.0]
######################
/\[Parallel Time:.*]$/{
   split($0,pt," ");
   ParallelTime=pt[3];
}
/Ext Root Scanning/{
   gsub( ",|]", " ");
   split($0,ers," ");
   ERS_Min=ers[6];
   ERS_Avg=ers[8];
   ERS_Max=ers[10];
   ERS_Sum=ers[14];
}
/Update RS \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,urs," ");
   URS_Min=urs[5];
   URS_Avg=urs[7];
   URS_Max=urs[9];
   URS_Sum=urs[13];
}
/Scan RS \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,srs," ");
   SRS_Min=srs[5];
   SRS_Avg=srs[7];
   SRS_Max=srs[9];
   SRS_Sum=srs[13];
}
/Code Root Scanning \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,crt," ");
   CRT_Min=crt[6];
   CRT_Avg=crt[8];
   CRT_Max=crt[10];
   CRT_Sum=crt[14];
}
/Object Copy \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,oc," ");
   GC_obj_copy_Min=oc[5];
   GC_obj_copy_Avg=oc[7];
   GC_obj_copy_Max=oc[9];
   GC_obj_copy_Sum=oc[13];
}
/Termination \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,term," ");
   GC_Termination_Min=term[4];
   GC_Termination_Avg=term[6];
   GC_Termination_Max=term[8];
   GC_Termination_Sum=term[12];
}
/GC Worker Other \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,oth," ");
   GC_worker_other_Min=oth[6];
   GC_worker_other_Avg=oth[8];
   GC_worker_other_Max=oth[10];
   GC_worker_other_Sum=oth[14];
}
/GC Worker Total \(ms\)\:/{
   gsub( ",|]", " ");
   split($0,tot," ");
   GC_worker_total_Min=tot[6];
   GC_worker_total_Avg=tot[8];
   GC_worker_total_Max=tot[10];
   GC_worker_total_Sum=tot[14];
}
######################
# Pull out time stamps that are in lines with this format:
#   2014-05-12T14:02:06.025-0700: 94.312: [GC pause (young), 0.08870154 secs]
######################
/GC pause/ {
   DateTime=substr($1, 0, length($1)-1);
   split($2, a, ":")
   SecondsSinceLaunch=a[1]
}

######################
# Heap sizes are in lines that look like this:
#  [ 4842M(9216M)->4838M(9216M)]
######################
/\[Eden:.*]$/ {
   gsub ( "\\[" , "" );
   gsub ( "\\]" , "" );
   gsub ( "->" , " " );
   gsub ( "\\(" , " " );
   gsub ( "\\)" , " " );
   gsub ( "Survivors:", "" );
   gsub ( "Heap:", "" );

   split($0,a," ");

   EdenBeforeSize=parse_size(a[2]);
   EdenBeforeCap=parse_size(a[3]);
   EdenAfterSize=parse_size(a[4]);
   EdenAfterCap=parse_size(a[5]);

   SurvBeforeSize=parse_size(a[6]);
   SurvAfterSize=parse_size(a[7]);

   HeapBeforeSize=parse_size(a[8]);
   HeapBeforeCap=parse_size(a[9]);
   HeapAfterSize=parse_size(a[10]);
   HeapAfterCap=parse_size(a[11]);
}

######################
# Emit an output line when you find input that looks like this:
#   [Times: user=1.41 sys=0.08, real=0.24 secs]
######################
/\[Times/ {
   if (G1GC==1) {
  gsub ( "," , "" );
  split($2,a,"=");
  UserTime=a[2];
  split($3,a,"=");
  SysTime=a[2];
  split($4,a,"=");
  RealTime=a[2];
  print DateTime,SecondsSinceLaunch,IncrementalCount,UserTime,SysTime,RealTime,EdenBeforeSize,EdenAfterSize,SurvBeforeSize,SurvAfterSize,HeapBeforeSize,HeapAfterSize,ParallelTime,ERS_Avg,ERS_Sum,URS_Avg,URS_Sum,SRS_Avg,SRS_Sum,CRT_Avg,CRT_Sum,GC_obj_copy_Avg,GC_obj_copy_Sum,GC_Termination_Avg,GC_Termination_Sum,GC_worker_other_Avg,GC_worker_other_Sum,GC_worker_total_Avg,GC_worker_total_Sum,IPAddress;
   }
}

## README.txt
Before analysing, the recording of the gc log information should be enabled with the JVM arguments
-XX:+PrintGCDetails
-XX:+PrintGCDateStamps
-XX:+PrintHeapAtGC
-XX:+PrintTenuringDistribution
-XX:+PrintGCApplicationStoppedTime
-XX:+PrintPromotionFailure
-XX:PrintFLSStatistics=1
-Xloggc:/<valid path>/gc.log

# The following are optional
-XX:+UseGCLogFileRotation
-XX:NumberOfGCLogFiles=10
-XX:GCLogFileSize=10M


There are 2 main components:
1)  g1_log_parser.sh which parses the logs (the path for the logs was defined with the -Xloggc JVM argument)
    example
    ./g1_log_parser.sh /<valid path>/gc.log > data/gc_log_parsed

2) Execute R scripts
  - R framework can be downloaded from https://www.r-project.org/;
  - it is suggested to use RStudio as an IDE for the scripts https://www.rstudio.com/products/rstudio/download/, optionally there are also plugins for IntelliJ (R4Intellij) and Eclipse (StatET) that are able to interpret and execute R scripts.
  - it is required to install ggplot2 executing on R
          install.packages('ggplot2')

## duration_histogram.R
library(ggplot2)

# Presents an histogram for the time required to complete a garbage collection action
# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
source('multiplot.R')

border_color <- 'black'
label_x <- 'Time (seconds)'
label_y <- 'Frequency'

# Real time is the total time taken to execute the garbage collection
title_hist_1 = paste(variable_study,"Real Time")
freq_hist_real <- ggplot(data=g1gc, aes(g1gc$RealTime)) +
       geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_hist_1, subtitle))


# User time considers only the time when operations occurred on the User space
title_hist_2 = paste(variable_study,"User Time")
freq_hist_user <- ggplot(data=g1gc, aes(g1gc$UserTime)) +
       geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_hist_2, subtitle))

# Sys (System time) considers the time when operations occurred on the system space (i.e.: kernel
#  operations)
title_hist_3 = paste(variable_study,"Sys Time")
freq_hist_sys <- ggplot(data=g1gc, aes(g1gc$SysTime)) +
       geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_hist_3, subtitle))


multiplot(freq_hist_real, freq_hist_user, freq_hist_sys, cols = 1)

## g1_gc_analysis.R
# work based on
# https://blogs.oracle.com/taylor22/entry/using_r_to_analyze_g1gc

# For more information regarding G1
# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs

# Initializes the variable that contains the data, and calls the different scripts that contains the
# graphs to display

# Required for compatibility with the R4IntelliJ plugin
setwd('~/repo/db/tools/g1_analysis')
datetime_format <- "%Y-%m-%dT%H:%M:%OS%z"

# What is the variable that we are studying? This label will be appended on the title of all the
# graphs obtained, this will help to make easier the research

# Define the title that will appear on the plots, we usually set it to the environment and Cassandra cluster where the data was originated
variable_study <- "<Environment> <stack> -"

# Time range to apply for the test. Note that the times on date series will still be presented in UTC
lower_date <- "2016-05-12T00:00:00.000-0400"
upper_date <- "2016-06-18T00:00:00.000-0400"
timezone <- "America/New_York"

subtitle = paste("\n", lower_date, " - ", upper_date)

# Update the path to the location of the output from the logger
data_source <- "data/<input_parsed>.log"

g1gc_all <- read.csv(data_source, row.names = NULL, stringsAsFactors=FALSE, sep=" ")

g1gc <- subset(g1gc_all, as.POSIXlt(strptime(g1gc_all$Timestamp, datetime_format)) >= as.POSIXlt(strptime(lower_date, datetime_format)) &
                as.POSIXlt(strptime(g1gc_all$Timestamp, datetime_format)) <= as.POSIXlt(strptime(upper_date, datetime_format)))

source('duration_histogram.R')
source('memory_composition.R')
source('gc_phases.R')

## gc_phases.R
library(ggplot2)

# Presents the comparison of before and after the garbage collection is executed on the different
# sections of the memory.
# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
# More information of the meaning of the values is available at
# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs
# https://blogs.oracle.com/g1gc/entry/g1gc_logs_how_to_print
source('multiplot.R')

# Constants
datetime_format <- "%Y-%m-%dT%H:%M:%S"

timeline_comp_fields <- c('Timestamp', 'GCTotalAvg', 'ERSAvg', 'URSAvg', 'SRSAvg', 'CRTAvg',
       'GCObjCopyAvg', 'GCTerminationAvg', 'GCOtherAvg')
pie_phases_fields <-  c('ERSSum', 'URSSum', 'SRSSum', 'CRTSum', 'GCObjCopySum', 'GCTerminationSum',
       'GCOtherSum')
pie_phases_desc <-  c('External Root Scanning', 'Remembered Sets - Update',
       'Remembered Sets - Scan', 'Code Root Scanning', 'Objects Copy', 'Termination', 'Other')

timeline_comp_values <- g1gc[timeline_comp_fields]

grandtotal <- colSums(g1gc["GCTotalSum"])
pie_phases_aggr_values <- colSums(g1gc[pie_phases_fields]) * 100 / grandtotal

pie_phases_values <- data.frame(pie_phases_desc, pie_phases_aggr_values)
colnames(pie_phases_values) <- c("name", "sum")

# Models the distribution of GC phases across the studies time
title_gc_1 <- paste(variable_study, "GC phases composition")

comp <- ggplot() +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$GCTotalAvg, color = "Total")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$ERSAvg, color = "External Root Scanning")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$URSAvg, color = "Remembered Sets - Update")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$SRSAvg, color = "Remembered Sets - Scan")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$CRTAvg, color = "Code Root Scanning")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$GCObjCopyAvg, color = "Objects copy")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$GCTerminationAvg, color = "Termination")) +
   geom_line(data = timeline_comp_values,
             aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
                 y = timeline_comp_values$GCOtherAvg, color = "Other")) +
   labs(x = "Date time (UTC)", y = "Time (ms.)") +
   ggtitle(paste(title_gc_1, subtitle))


# Pie chart to show the distribution of the time spent in the different phases (actions)
# that are taken by Garbage Collection.
title_gc_2 <- paste(variable_study, "GC phase composition")
pie_phase <- ggplot(pie_phases_values, aes (x="",  y=pie_phases_values$sum, fill=name)) +
   geom_bar(width = 1, stat = "identity", color = "black") +
   coord_polar(theta = "y") +
   labs(x = "", y = "Percentage (%)") +
   ggtitle(paste(title_gc_2, subtitle))

multiplot(comp, pie_phase, cols = 1)

## memory_composition.R
library(ggplot2)

# Presents the comparison of begore and after the garbage collection is executed on the different
# sections of the memory.
# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
source('multiplot.R')

# Constants
datetime_format <- "%Y-%m-%dT%H:%M:%S"
label_x <- 'Date time (UTC)'
# Note that log_parser.h standardizes to GB
label_y <- 'Size (GB)'

# Heap is the total memory size used
title_mem_1 = paste(variable_study,"Heap size")
heap <- ggplot() +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$HeapBeforeSize, color = "before")) +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$HeapAfterSize, color = "after")) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_mem_1, subtitle))

# Eden refers to the memory pool where most of the new objects are created. After a garbage
# collection, it is expected that it will decrease (as objects are removed or promoted to
# "survivor" state.
title_mem_2 = paste(variable_study,"Eden size")
eden <- ggplot() +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$EdenBeforeSize, color = "before")) +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$EdenAfterSize, color = "after")) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_mem_2, subtitle))

# Survivor, as the name proposes, refers to the memory space where objects that survive gc
# operations that were previously on Eden. This is expected to increase after a GC execution.
title_mem_3 = paste(variable_study,"Survivor size")
survivor <- ggplot() +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$SurvivorBefore, color = "before")) +
       geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
               y = g1gc$SurvivorAfter, color = "after")) +
       labs(x = label_x, y = label_y) +
       ggtitle(paste(title_mem_3, subtitle))

multiplot(heap, eden, survivor, cols = 1)

## multiplot.R
# Allows to present multiple graphs on the same canvas. Please note that it won't
# modify the logic of the plots received, but the size and proportions of the charts
# will be updated to fit them (with their legend boxes if present). Also this may
# depend on the resolution of the output device used. Most of the times
# with the default value of 1 for the cols will be fine.
# from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
 # Allows present on the canvas of the output device, different charts (plots) on
 # the same canvas.
 # @param ... plots to be included on the canvas
 # @param plotlist optionally, the plots can be defined as a data frame that contains
 #           the graphs to show, this will be merged with the graphs received above
 # @param cols number of columns to used on the template
 # @param optionally the layout can be precalculated and applied, this option is useful
 #            if themes will be used
 library(grid)

 # Make a list from the ... arguments and plotlist
 plots <- c(list(...), plotlist)

 numPlots = length(plots)

 # If layout is NULL, then use 'cols' to determine layout
 if (is.null(layout)) {
   # Make the panel
   # ncol: Number of columns of plots
   # nrow: Number of rows needed, calculated from # of cols
   layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
 }

 if (numPlots==1) {
   print(plots[[1]])
 } else {
   # Set up the page
   grid.newpage()
   pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

   # Make each plot, in the correct location
   for (i in 1:numPlots) {
     # Get the i,j matrix positions of the regions that contain this subplot
     matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

     print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                     layout.pos.col = matchidx$col))
   }
 }
}
	#!/usr/bin/env awk -f

	# Based on the idea from https://blogs.oracle.com/taylor22/entry/using_r_to_analyze_g1gc, the
	# script is updated to use the format of the gc logs as received with the parameters:
	# -XX:+UseThreadPriorities
	# -XX:ThreadPriorityPolicy=42
	# -Xms1995M -Xmx1995M
	# -Xss256k -XX:StringTableSize=1000003
	# -XX:SurvivorRatio=8
	# -XX:MaxTenuringThreshold=1
	# -XX:+UseTLAB
	# -XX:+ResizeTLAB
	# -XX:CompileCommandFile=/hotspot_compiler
	# -XX:MaxGCPauseMillis=500
	# -XX:InitiatingHeapOccupancyPercent=25
	# -XX:+UseG1GC
	# -XX:G1RSetUpdatingPauseTimePercent=5
	# -XX:+ParallelRefProcEnabled
	# -XX:ParallelGCThreads=2
	# -XX:ConcGCThreads=2
	# -XX:+PrintGCDetails
	# -XX:+PrintGCDateStamps
	# -XX:+PrintHeapAtGC
	# -XX:+PrintTenuringDistribution
	# -XX:+PrintGCApplicationStoppedTime
	# -XX:+PrintPromotionFailure
	# -XX:PrintFLSStatistics=1
	# -Xloggc:/<valid path>/gc.log
	# -XX:+UseGCLogFileRotation
	# -XX:NumberOfGCLogFiles=10
	# -XX:GCLogFileSize=10M

	function parse_size(val)
	{

	######################
	# Standardizes all the values of memory sizes to GB, gc log will have the scalar value and the
	# measurement unit ("B", "K", "M") referring to Bytes
	# @param val - string with the value as received on the log. (i.e "9437184K")
	# @returns float with the size in GB
	######################

	if ( split(val, b, "B") > 1 ) {result=b[1]/(1024.0^3);}
	if ( split(val, b, "K") > 1 ) {result=b[1]/(1024.0^2);}
	if ( split(val, b, "M") > 1 ) {result=b[1]/1024.0;}
	if ( split(val, b, "G") > 1 ) {result=b[1];}
	return result;
	}

	BEGIN {
	printf("Timestamp SecondsSinceLaunch IncrementalCount UserTime SysTime RealTime EdenBeforeSize EdenAfterSize SurvivorBefore SurvivorAfter HeapBeforeSize HeapAfterSize ParallelTime ERSAvg ERSSum URSAvg URSSum SRSAvg SRSSum CRTAvg CRTSum GCObjCopyAvg GCObjCopySum GCTerminationAvg GCTerminationSum GCOtherAvg GCOtherSum GCTotalAvg GCTotalSum\n")
	}

	######################
	# Save count data from lines that are at the start of each G1GC trace.
	# Each trace starts out like this:
	# {Heap before GC invocations=14 (full 0):
	# garbage-first heap total 9437184K, used 325496K [0xfffffffd00000000, 0xffffffff40000000, 0xffffffff40000000)
	######################
	/{Heap.*full/{
	gsub ( "\\)" , "" );
	nf=split($0,a,"=");
	split(a[2],b," ");
	getline;
	if ( match($0, "first") ) {
	G1GC=1;
	IncrementalCount=b[1];
	FullCount=substr( b[3], 1, length(b[3])-1 );
	} else {
	G1GC=0;
	}
	}

	######################
	# Retrieves information of the actions and where the GC is spending more time.
	# It considers the following format
	# [Parallel Time: 50.6 ms, GC Workers: 2]
	# [GC Worker Start (ms): Min: 2645.0, Avg: 2645.0, Max: 2645.0, Diff: 0.0]
	# [Ext Root Scanning (ms): Min: 2.3, Avg: 2.5, Max: 2.6, Diff: 0.3, Sum: 4.9]
	# [Update RS (ms): Min: 4.9, Avg: 4.9, Max: 5.0, Diff: 0.1, Sum: 9.8]
	# [Processed Buffers: Min: 2, Avg: 2.5, Max: 3, Diff: 1, Sum: 5]
	# [Scan RS (ms): Min: 6.8, Avg: 7.0, Max: 7.2, Diff: 0.3, Sum: 14.0]
	# [Code Root Scanning (ms): Min: 0.0, Avg: 0.7, Max: 1.3, Diff: 1.3, Sum: 1.4]
	# [Object Copy (ms): Min: 34.9, Avg: 35.5, Max: 36.1, Diff: 1.2, Sum: 70.9]
	# [Termination (ms): Min: 0.0, Avg: 0.0, Max: 0.0, Diff: 0.0, Sum: 0.0]
	# [Termination Attempts: Min: 1, Avg: 1.0, Max: 1, Diff: 0, Sum: 2]
	# [GC Worker Other (ms): Min: 0.0, Avg: 0.0, Max: 0.0, Diff: 0.0, Sum: 0.0]
	# [GC Worker Total (ms): Min: 50.5, Avg: 50.5, Max: 50.5, Diff: 0.0, Sum: 101.0]
	# [GC Worker End (ms): Min: 2695.5, Avg: 2695.5, Max: 2695.5, Diff: 0.0]
	######################
	/\[Parallel Time:.*]$/{
	split($0,pt," ");
	ParallelTime=pt[3];
	}
	/Ext Root Scanning/{
	gsub( ",\|]", " ");
	split($0,ers," ");
	ERS_Min=ers[6];
	ERS_Avg=ers[8];
	ERS_Max=ers[10];
	ERS_Sum=ers[14];
	}
	/Update RS \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,urs," ");
	URS_Min=urs[5];
	URS_Avg=urs[7];
	URS_Max=urs[9];
	URS_Sum=urs[13];
	}
	/Scan RS \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,srs," ");
	SRS_Min=srs[5];
	SRS_Avg=srs[7];
	SRS_Max=srs[9];
	SRS_Sum=srs[13];
	}
	/Code Root Scanning \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,crt," ");
	CRT_Min=crt[6];
	CRT_Avg=crt[8];
	CRT_Max=crt[10];
	CRT_Sum=crt[14];
	}
	/Object Copy \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,oc," ");
	GC_obj_copy_Min=oc[5];
	GC_obj_copy_Avg=oc[7];
	GC_obj_copy_Max=oc[9];
	GC_obj_copy_Sum=oc[13];
	}
	/Termination \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,term," ");
	GC_Termination_Min=term[4];
	GC_Termination_Avg=term[6];
	GC_Termination_Max=term[8];
	GC_Termination_Sum=term[12];
	}
	/GC Worker Other \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,oth," ");
	GC_worker_other_Min=oth[6];
	GC_worker_other_Avg=oth[8];
	GC_worker_other_Max=oth[10];
	GC_worker_other_Sum=oth[14];
	}
	/GC Worker Total \(ms\)\:/{
	gsub( ",\|]", " ");
	split($0,tot," ");
	GC_worker_total_Min=tot[6];
	GC_worker_total_Avg=tot[8];
	GC_worker_total_Max=tot[10];
	GC_worker_total_Sum=tot[14];
	}
	######################
	# Pull out time stamps that are in lines with this format:
	# 2014-05-12T14:02:06.025-0700: 94.312: [GC pause (young), 0.08870154 secs]
	######################
	/GC pause/ {
	DateTime=substr($1, 0, length($1)-1);
	split($2, a, ":")
	SecondsSinceLaunch=a[1]
	}

	######################
	# Heap sizes are in lines that look like this:
	# [ 4842M(9216M)->4838M(9216M)]
	######################
	/\[Eden:.*]$/ {
	gsub ( "\\[" , "" );
	gsub ( "\\]" , "" );
	gsub ( "->" , " " );
	gsub ( "\\(" , " " );
	gsub ( "\\)" , " " );
	gsub ( "Survivors:", "" );
	gsub ( "Heap:", "" );

	split($0,a," ");

	EdenBeforeSize=parse_size(a[2]);
	EdenBeforeCap=parse_size(a[3]);
	EdenAfterSize=parse_size(a[4]);
	EdenAfterCap=parse_size(a[5]);

	SurvBeforeSize=parse_size(a[6]);
	SurvAfterSize=parse_size(a[7]);

	HeapBeforeSize=parse_size(a[8]);
	HeapBeforeCap=parse_size(a[9]);
	HeapAfterSize=parse_size(a[10]);
	HeapAfterCap=parse_size(a[11]);
	}

	######################
	# Emit an output line when you find input that looks like this:
	# [Times: user=1.41 sys=0.08, real=0.24 secs]
	######################
	/\[Times/ {
	if (G1GC==1) {
	gsub ( "," , "" );
	split($2,a,"=");
	UserTime=a[2];
	split($3,a,"=");
	SysTime=a[2];
	split($4,a,"=");
	RealTime=a[2];
	print DateTime,SecondsSinceLaunch,IncrementalCount,UserTime,SysTime,RealTime,EdenBeforeSize,EdenAfterSize,SurvBeforeSize,SurvAfterSize,HeapBeforeSize,HeapAfterSize,ParallelTime,ERS_Avg,ERS_Sum,URS_Avg,URS_Sum,SRS_Avg,SRS_Sum,CRT_Avg,CRT_Sum,GC_obj_copy_Avg,GC_obj_copy_Sum,GC_Termination_Avg,GC_Termination_Sum,GC_worker_other_Avg,GC_worker_other_Sum,GC_worker_total_Avg,GC_worker_total_Sum,IPAddress;
	}
	}
	Before analysing, the recording of the gc log information should be enabled with the JVM arguments
	-XX:+PrintGCDetails
	-XX:+PrintGCDateStamps
	-XX:+PrintHeapAtGC
	-XX:+PrintTenuringDistribution
	-XX:+PrintGCApplicationStoppedTime
	-XX:+PrintPromotionFailure
	-XX:PrintFLSStatistics=1
	-Xloggc:/<valid path>/gc.log

	# The following are optional
	-XX:+UseGCLogFileRotation
	-XX:NumberOfGCLogFiles=10
	-XX:GCLogFileSize=10M


	There are 2 main components:
	1) g1_log_parser.sh which parses the logs (the path for the logs was defined with the -Xloggc JVM argument)
	example
	./g1_log_parser.sh /<valid path>/gc.log > data/gc_log_parsed

	2) Execute R scripts
	- R framework can be downloaded from https://www.r-project.org/;
	- it is suggested to use RStudio as an IDE for the scripts https://www.rstudio.com/products/rstudio/download/, optionally there are also plugins for IntelliJ (R4Intellij) and Eclipse (StatET) that are able to interpret and execute R scripts.
	- it is required to install ggplot2 executing on R
	install.packages('ggplot2')
	library(ggplot2)

	# Presents an histogram for the time required to complete a garbage collection action
	# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
	source('multiplot.R')

	border_color <- 'black'
	label_x <- 'Time (seconds)'
	label_y <- 'Frequency'

	# Real time is the total time taken to execute the garbage collection
	title_hist_1 = paste(variable_study,"Real Time")
	freq_hist_real <- ggplot(data=g1gc, aes(g1gc$RealTime)) +
	geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_hist_1, subtitle))


	# User time considers only the time when operations occurred on the User space
	title_hist_2 = paste(variable_study,"User Time")
	freq_hist_user <- ggplot(data=g1gc, aes(g1gc$UserTime)) +
	geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_hist_2, subtitle))

	# Sys (System time) considers the time when operations occurred on the system space (i.e.: kernel
	# operations)
	title_hist_3 = paste(variable_study,"Sys Time")
	freq_hist_sys <- ggplot(data=g1gc, aes(g1gc$SysTime)) +
	geom_histogram(col=border_color, aes(fill=..count..), bins = 15) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_hist_3, subtitle))


	multiplot(freq_hist_real, freq_hist_user, freq_hist_sys, cols = 1)
	# work based on
	# https://blogs.oracle.com/taylor22/entry/using_r_to_analyze_g1gc

	# For more information regarding G1
	# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs

	# Initializes the variable that contains the data, and calls the different scripts that contains the
	# graphs to display

	# Required for compatibility with the R4IntelliJ plugin
	setwd('~/repo/db/tools/g1_analysis')
	datetime_format <- "%Y-%m-%dT%H:%M:%OS%z"

	# What is the variable that we are studying? This label will be appended on the title of all the
	# graphs obtained, this will help to make easier the research

	# Define the title that will appear on the plots, we usually set it to the environment and Cassandra cluster where the data was originated
	variable_study <- "<Environment> <stack> -"

	# Time range to apply for the test. Note that the times on date series will still be presented in UTC
	lower_date <- "2016-05-12T00:00:00.000-0400"
	upper_date <- "2016-06-18T00:00:00.000-0400"
	timezone <- "America/New_York"

	subtitle = paste("\n", lower_date, " - ", upper_date)

	# Update the path to the location of the output from the logger
	data_source <- "data/<input_parsed>.log"

	g1gc_all <- read.csv(data_source, row.names = NULL, stringsAsFactors=FALSE, sep=" ")

	g1gc <- subset(g1gc_all, as.POSIXlt(strptime(g1gc_all$Timestamp, datetime_format)) >= as.POSIXlt(strptime(lower_date, datetime_format)) &
	as.POSIXlt(strptime(g1gc_all$Timestamp, datetime_format)) <= as.POSIXlt(strptime(upper_date, datetime_format)))

	source('duration_histogram.R')
	source('memory_composition.R')
	source('gc_phases.R')
	library(ggplot2)

	# Presents the comparison of before and after the garbage collection is executed on the different
	# sections of the memory.
	# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
	# More information of the meaning of the values is available at
	# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs
	# https://blogs.oracle.com/g1gc/entry/g1gc_logs_how_to_print
	source('multiplot.R')

	# Constants
	datetime_format <- "%Y-%m-%dT%H:%M:%S"

	timeline_comp_fields <- c('Timestamp', 'GCTotalAvg', 'ERSAvg', 'URSAvg', 'SRSAvg', 'CRTAvg',
	'GCObjCopyAvg', 'GCTerminationAvg', 'GCOtherAvg')
	pie_phases_fields <- c('ERSSum', 'URSSum', 'SRSSum', 'CRTSum', 'GCObjCopySum', 'GCTerminationSum',
	'GCOtherSum')
	pie_phases_desc <- c('External Root Scanning', 'Remembered Sets - Update',
	'Remembered Sets - Scan', 'Code Root Scanning', 'Objects Copy', 'Termination', 'Other')

	timeline_comp_values <- g1gc[timeline_comp_fields]

	grandtotal <- colSums(g1gc["GCTotalSum"])
	pie_phases_aggr_values <- colSums(g1gc[pie_phases_fields]) * 100 / grandtotal

	pie_phases_values <- data.frame(pie_phases_desc, pie_phases_aggr_values)
	colnames(pie_phases_values) <- c("name", "sum")

	# Models the distribution of GC phases across the studies time
	title_gc_1 <- paste(variable_study, "GC phases composition")

	comp <- ggplot() +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$GCTotalAvg, color = "Total")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$ERSAvg, color = "External Root Scanning")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$URSAvg, color = "Remembered Sets - Update")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$SRSAvg, color = "Remembered Sets - Scan")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$CRTAvg, color = "Code Root Scanning")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$GCObjCopyAvg, color = "Objects copy")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$GCTerminationAvg, color = "Termination")) +
	geom_line(data = timeline_comp_values,
	aes(x=as.POSIXlt(strptime(timeline_comp_values$Timestamp, datetime_format)),
	y = timeline_comp_values$GCOtherAvg, color = "Other")) +
	labs(x = "Date time (UTC)", y = "Time (ms.)") +
	ggtitle(paste(title_gc_1, subtitle))


	# Pie chart to show the distribution of the time spent in the different phases (actions)
	# that are taken by Garbage Collection.
	title_gc_2 <- paste(variable_study, "GC phase composition")
	pie_phase <- ggplot(pie_phases_values, aes (x="", y=pie_phases_values$sum, fill=name)) +
	geom_bar(width = 1, stat = "identity", color = "black") +
	coord_polar(theta = "y") +
	labs(x = "", y = "Percentage (%)") +
	ggtitle(paste(title_gc_2, subtitle))

	multiplot(comp, pie_phase, cols = 1)
	library(ggplot2)

	# Presents the comparison of begore and after the garbage collection is executed on the different
	# sections of the memory.
	# Assumes that g1gc has been already initialized, as on g1_gc_analysis.R
	source('multiplot.R')

	# Constants
	datetime_format <- "%Y-%m-%dT%H:%M:%S"
	label_x <- 'Date time (UTC)'
	# Note that log_parser.h standardizes to GB
	label_y <- 'Size (GB)'

	# Heap is the total memory size used
	title_mem_1 = paste(variable_study,"Heap size")
	heap <- ggplot() +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$HeapBeforeSize, color = "before")) +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$HeapAfterSize, color = "after")) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_mem_1, subtitle))

	# Eden refers to the memory pool where most of the new objects are created. After a garbage
	# collection, it is expected that it will decrease (as objects are removed or promoted to
	# "survivor" state.
	title_mem_2 = paste(variable_study,"Eden size")
	eden <- ggplot() +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$EdenBeforeSize, color = "before")) +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$EdenAfterSize, color = "after")) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_mem_2, subtitle))

	# Survivor, as the name proposes, refers to the memory space where objects that survive gc
	# operations that were previously on Eden. This is expected to increase after a GC execution.
	title_mem_3 = paste(variable_study,"Survivor size")
	survivor <- ggplot() +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$SurvivorBefore, color = "before")) +
	geom_line(data = g1gc, aes(x=as.POSIXlt(strptime(g1gc$Timestamp, datetime_format)),
	y = g1gc$SurvivorAfter, color = "after")) +
	labs(x = label_x, y = label_y) +
	ggtitle(paste(title_mem_3, subtitle))

	multiplot(heap, eden, survivor, cols = 1)
	# Allows to present multiple graphs on the same canvas. Please note that it won't
	# modify the logic of the plots received, but the size and proportions of the charts
	# will be updated to fit them (with their legend boxes if present). Also this may
	# depend on the resolution of the output device used. Most of the times
	# with the default value of 1 for the cols will be fine.
	# from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

	multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
	# Allows present on the canvas of the output device, different charts (plots) on
	# the same canvas.
	# @param ... plots to be included on the canvas
	# @param plotlist optionally, the plots can be defined as a data frame that contains
	# the graphs to show, this will be merged with the graphs received above
	# @param cols number of columns to used on the template
	# @param optionally the layout can be precalculated and applied, this option is useful
	# if themes will be used
	library(grid)

	# Make a list from the ... arguments and plotlist
	plots <- c(list(...), plotlist)

	numPlots = length(plots)

	# If layout is NULL, then use 'cols' to determine layout
	if (is.null(layout)) {
	# Make the panel
	# ncol: Number of columns of plots
	# nrow: Number of rows needed, calculated from # of cols
	layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
	ncol = cols, nrow = ceiling(numPlots/cols))
	}

	if (numPlots==1) {
	print(plots[[1]])
	} else {
	# Set up the page
	grid.newpage()
	pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

	# Make each plot, in the correct location
	for (i in 1:numPlots) {
	# Get the i,j matrix positions of the regions that contain this subplot
	matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

	print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
	layout.pos.col = matchidx$col))
	}
	}
	}