hliang/collectl_parsing.R

## collectl_parsing.R
library(zoo)
# library("lubridate")
collectl_fn = list.files("D:/tmp/abruijn/", patter=".*.p.txt", full.names = TRUE); collectl_fn


zcolall = list()
slurmout_all = list()
for (i in seq_along(collectl_fn)) {
  jobid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\1", collectl_fn[i])
  nodeid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\2", collectl_fn[i])
  dcol = read.table(collectl_fn[i])
  colnames(dcol) = unlist(strsplit("Date Time [CPU]User% [CPU]Nice% [CPU]Sys% [CPU]Wait% [CPU]Irq% [CPU]Soft% [CPU]Steal% [CPU]Idle% [CPU]Totl% [CPU]Guest% [CPU]GuestN% [CPU]Intrpt/sec [CPU]Ctx/sec [CPU]Proc/sec [CPU]ProcQue [CPU]ProcRun [CPU]L-Avg1 [CPU]L-Avg5 [CPU]L-Avg15 [CPU]RunTot [CPU]BlkTot [MEM]Tot [MEM]Used [MEM]Free [MEM]Shared [MEM]Buf [MEM]Cached [MEM]Slab [MEM]Map [MEM]Anon [MEM]AnonH [MEM]Commit [MEM]Locked [MEM]SwapTot [MEM]SwapUsed [MEM]SwapFree [MEM]SwapIn [MEM]SwapOut [MEM]Dirty [MEM]Clean [MEM]Laundry [MEM]Inactive [MEM]PageIn [MEM]PageOut [MEM]PageFaults [MEM]PageMajFaults [MEM]HugeTotal [MEM]HugeFree [MEM]HugeRsvd [MEM]SUnreclaim [NET]RxPktTot [NET]TxPktTot [NET]RxKBTot [NET]TxKBTot [NET]RxCmpTot [NET]RxMltTot [NET]TxCmpTot [NET]RxErrsTot [NET]TxErrsTot [DSK]ReadTot [DSK]WriteTot [DSK]OpsTot [DSK]ReadKBTot [DSK]WriteKBTot [DSK]KbTot [DSK]ReadMrgTot [DSK]WriteMrgTot [DSK]MrgTot", " "))
  head(dcol)
  dcol$datetime = paste(dcol$Date, dcol$Time)
  dcol$datetime = strptime(dcol$datetime, "%Y%m%d %H:%M:%S", tz = "Hongkong")
  #dcol$datetime = with_tz(dcol$datetime, tzone = "GMT")

  x = dcol
  x[, c("Date", "Time", "datetime")] = NULL
  zcol = zoo(x, order.by=dcol$datetime)

  zcolall[[jobid]] = zcol

  ## slurm-xxx.out
  slurmout_fn = list.files("D:/tmp/abruijn/", patter=paste0("slurm-", jobid, ".out"), full.names = TRUE); slurmout_fn
  if(length(slurmout_fn)==1) {
    library(stringr)
    op <- options(digits.secs=3)
    x = scan(slurmout_fn, what="", sep="\n")
    x = grep(pattern="INFO", x=x, value=TRUE)
    #x = data.frame(str_split_fixed(x, " ", 4), stringsAsFactors=FALSE)
    x = data.frame(str_split_fixed(x, "[\\[\\]] *", 3), stringsAsFactors=FALSE)
    x[, 1] = NULL
    colnames(x) = c("datetime", "event")
    x$datetime = paste0(x$datetime, ".", sprintf("%03d", seq_along(x$datetime))) # add milliseconds, to avoid duplicate timestamp
    tmp = strptime(x$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Hongkong")
    x$datetime = tmp
    x$event = gsub(pattern="INFO: ", replacement="", x=x$event)
    zevent = zoo(x[, "event", drop=FALSE], order.by=x$datetime)
    options(op)
  } else {
    stop("Need one slurm-xxx.out, but found ", length(slurmout_fn), ":\n", slurmout_fn)
  }

  slurmout_all[[jobid]] = zevent

}

head(zcol)
head(zevent)


##########################################################
## plot CPU
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU usage %", xlab="min", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))

tick_interval = as.difftime("00:01:00") # 1 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )

tick_interval = as.difftime("00:05:00") # 5 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)


## plot memory
#colsToPlot = c("[MEM]Tot", "[MEM]Used", "[MEM]Free", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), ylab="Memory /GB", xlab="min", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x="topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
# legend(x="top", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot), ncol=ceiling(length(colsToPlot)/2), inset=c(0.0,-0.12), xpd=TRUE)

tick_interval = as.difftime("00:01:00") # 1 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )

tick_interval = as.difftime("00:05:00") # 5 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)


##########################################################
## plot multiple (cpu and mem) in one figure
par(mar=c(5, 4, 5, 4)+0.1)
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s")
axis(side=2)
mtext(text="CPU %", side=2, line=2)
legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))

tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

# tick_interval = as.difftime("01:00:00") # 1 hours
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )
#
# tick_interval = as.difftime("05:00:00") # 5 hours
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))

colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
par(new = TRUE)
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=2)
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 2, col = seq_along(colsToPlot))
axis(side = 4)
mtext(text="Memory /GB", side=4, line=2)


##########################################################
## milestone marks
axis.POSIXct(time(zevent), side=3, at=time(zevent), labels=zevent$event, las=2, cex.axis=0.6)


##########################################################
## prefer simple plot
##########################################################
pdf("~/abruijn/abruijn_arabidopsis.pdf", width=10, height=7)
par(mar=c(5, 4, 5, 4)+0.1)
rowsToPlot = seq(1, nrow(zcol), by=12)
colsToPlot = c("[CPU]Totl%")
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU %", xlab="", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
#
# plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col="red", ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
# axis(side=2)
# mtext(text="CPU %", side=2, line=2)
# legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = "red")

tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_interval = as.difftime("06:00:00") # hours
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))


colsToPlot = c("[MEM]Used")
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
#par(new = TRUE)
plot.zoo(zcol[rowsToPlot, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))
axis(side = 4)
mtext(text="Memory /GB", side=4, line=2)

tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_interval = as.difftime("06:00:00") # hours
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))

dev.off()

## pairsTrading.R

set.seed( 1) #设置随机种子
dates<- as.Date( '2010-01-01')+ 1: 100#100个日期
x<-round(rnorm( 100, 50, 40), 2) #随机生成X产品，100个正态分析的收盘价
y<-round(rnorm( 100, 50, 40), 2) #随机生成Y产品，100个正态分析的收盘价
df<-data.frame(dates,x,y)
df
library(ggplot2)
library(scales)
library(reshape2)
df2<-melt(df,c('dates'))
g<-ggplot(data=df2,aes(x=dates,y=value,colour=variable))
g<-g+geom_line()
g<-g+scale_x_date(date_breaks = "1 week",date_labels='%m-%d')
g<-g+labs(x='date',y='Price')
g
# 计算差价
df$diff<-df$x-df$y
# 找到差价大于10时的点
idx<-which(df$diff> 10)
idx<-idx[-which(diff(idx)== 1)- 1]
# 打印差价的索引值
idx
# 接下来，我们进行模拟交易，取第一个索引值的点，在2010-01-04时做空X，做多Y。当差价小于0在2010-01-06时，进行平仓。
# 打印前20个数据
head(df, 20)

# 当差价大于10时，做空X，当差价小于0时，平仓。
# 第4行做空，第6行平仓
xprofit<- df$x[4]-df$x[6]; xprofit

# 当差价大于10时，做多Y；当差价小于0时，平仓。
# 第4行做空，第6行平仓
yprofit<- df$y[6]-df$y[4]; yprofit

# 这是为什么呢？
# 根据配对交易的假设条件，如果两个金融产品的价差是收敛的，通过协整性检验的方法，我们可验证数据的收敛性。那么如果数据是收敛的，他还会具备均值回归的特性，请参考文章 均值回归，逆市中的投资机会。

# 画出X,Y的价差图，我们可以明显的看出，价差一直围绕着0上下波动，这是明显收敛的，同时符合均值回归的特性。
plot(df$diff, type= 'l')


x = data.frame(var01=c("十全大补丸[成分]甲、乙、丙", "含笑半步颠[成分]丁、戊、己"),
               var02=c("haha", "hehe"), stringsAsFactors=FALSE)

gsub("\\[.*", "", x$var01)


## rice_ID.R
# converting rice gene ids

library("biomaRt")
listMarts(host="plants.ensembl.org")
ensembl=useMart("plants_mart", host="plants.ensembl.org")
x = listDatasets(ensembl)
ensembl = useDataset("osativa_eg_gene", mart=ensembl)

att = listAttributes(ensembl)
att

ensIDs = c("OS01G0232100", "OS04G0691100", "OS02G0475400")
refseqPepIDs = c("XP_015633962.1", "XP_015635635.1", "XP_015623178.1", "XP_015614092.1", "XP_015614100.1")


myres1 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
               filters = 'ensembl_gene_id',
               values = ensIDs,
               mart = ensembl)

myres2 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
               filters = 'refseq_peptide',
               values = refseqPepIDs,
               mart = ensembl)

## userVisits.R
rm(list=ls())

##############################################################
## summarize how many visits does each user have each day

## simulate data
set.seed(1)
n = 800 # total number of requests in the data set
uid = sample(LETTERS[1:20], n, replace=TRUE) # user id
st = Sys.time(); st # pick a start time for the data set
et = st + as.difftime(10, units="days") # pick an end time for the data set
dt = et - st; #dt # overall duration of the dataset
ev = as.difftime(sort(runif(n, 0, dt)), units="days"); ev # generate random difftime
requesttime = st + ev; head(requesttime) # generate random start time for each request
requestDate = as.Date(requesttime, tz=Sys.timezone()); head(requestDate) # start date
df = data.frame(requestDate, uid, requesttime) # put them in one data frame
head(df); dim(df)
#df

## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
cutoff = as.difftime(1, units="hours")
## dataframe 'visits' summarize how many visits a user have on each day
visits = aggregate(df$requesttime, by=list(uid=df$uid, requestDate=df$requestDate), FUN=function(x){sum(diff(x) > cutoff) + 1}); head(visits)

#res
# plot(x~requestDate, data=res, type="l")
# plot
library(ggplot2)
(p1 <- ggplot(data = visits, aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())

## top frequent users
# mean2 = mean(visits$x) * 1.2; mean2
totalVisitsPerUser = aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){sum(x)})
# top 10% users
totalVisitsPerUser = totalVisitsPerUser[order(totalVisitsPerUser$x, decreasing=TRUE), ]
topUsers = totalVisitsPerUser[1:floor(nrow(totalVisitsPerUser) * 0.10), ]
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)})
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)>mean2})

(p1 <- ggplot(data = visits[visits$uid==topUsers$uid, ], aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())


######################################################
last_raw = read.table("F:/last/last_ln0", header=FALSE, stringsAsFactors=FALSE, nrows=13794)
last_raw$time = paste(last_raw$V4, last_raw$V5, last_raw$V6, last_raw$V7); #last_raw$time
last_raw$time = strptime(last_raw$time, format="%a %b %e %R")
last_raw$date = as.Date(last_raw$time, tz=Sys.timezone())

last = data.frame(time=last_raw$time, date=last_raw$date, user=last_raw$V1, tty=last_raw$V2, remote=last_raw$V3)
#last = last[rev(seq_len(nrow(last))), ]
last = last[order(last$time), ]
head(last); tail(last); dim(last)
## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
cutoff = as.difftime(30, units="mins")
## dataframe 'visits' summarize how many visits a user have on each day
visits = aggregate(last$time, by=list(uid=last$user, date=last$date), FUN=function(x){sum(abs(diff(x)) > cutoff) + 1}); head(visits)

plot(x~date, data=visits)
	library(zoo)
	# library("lubridate")
	collectl_fn = list.files("D:/tmp/abruijn/", patter=".*.p.txt", full.names = TRUE); collectl_fn


	zcolall = list()
	slurmout_all = list()
	for (i in seq_along(collectl_fn)) {
	jobid = gsub(".mon_(\\d+)-(cn\\d+)..p.txt", "\\1", collectl_fn[i])
	nodeid = gsub(".mon_(\\d+)-(cn\\d+)..p.txt", "\\2", collectl_fn[i])
	dcol = read.table(collectl_fn[i])
	colnames(dcol) = unlist(strsplit("Date Time [CPU]User% [CPU]Nice% [CPU]Sys% [CPU]Wait% [CPU]Irq% [CPU]Soft% [CPU]Steal% [CPU]Idle% [CPU]Totl% [CPU]Guest% [CPU]GuestN% [CPU]Intrpt/sec [CPU]Ctx/sec [CPU]Proc/sec [CPU]ProcQue [CPU]ProcRun [CPU]L-Avg1 [CPU]L-Avg5 [CPU]L-Avg15 [CPU]RunTot [CPU]BlkTot [MEM]Tot [MEM]Used [MEM]Free [MEM]Shared [MEM]Buf [MEM]Cached [MEM]Slab [MEM]Map [MEM]Anon [MEM]AnonH [MEM]Commit [MEM]Locked [MEM]SwapTot [MEM]SwapUsed [MEM]SwapFree [MEM]SwapIn [MEM]SwapOut [MEM]Dirty [MEM]Clean [MEM]Laundry [MEM]Inactive [MEM]PageIn [MEM]PageOut [MEM]PageFaults [MEM]PageMajFaults [MEM]HugeTotal [MEM]HugeFree [MEM]HugeRsvd [MEM]SUnreclaim [NET]RxPktTot [NET]TxPktTot [NET]RxKBTot [NET]TxKBTot [NET]RxCmpTot [NET]RxMltTot [NET]TxCmpTot [NET]RxErrsTot [NET]TxErrsTot [DSK]ReadTot [DSK]WriteTot [DSK]OpsTot [DSK]ReadKBTot [DSK]WriteKBTot [DSK]KbTot [DSK]ReadMrgTot [DSK]WriteMrgTot [DSK]MrgTot", " "))
	head(dcol)
	dcol$datetime = paste(dcol$Date, dcol$Time)
	dcol$datetime = strptime(dcol$datetime, "%Y%m%d %H:%M:%S", tz = "Hongkong")
	#dcol$datetime = with_tz(dcol$datetime, tzone = "GMT")

	x = dcol
	x[, c("Date", "Time", "datetime")] = NULL
	zcol = zoo(x, order.by=dcol$datetime)

	zcolall[[jobid]] = zcol

	## slurm-xxx.out
	slurmout_fn = list.files("D:/tmp/abruijn/", patter=paste0("slurm-", jobid, ".out"), full.names = TRUE); slurmout_fn
	if(length(slurmout_fn)==1) {
	library(stringr)
	op <- options(digits.secs=3)
	x = scan(slurmout_fn, what="", sep="\n")
	x = grep(pattern="INFO", x=x, value=TRUE)
	#x = data.frame(str_split_fixed(x, " ", 4), stringsAsFactors=FALSE)
	x = data.frame(str_split_fixed(x, "[\\[\\]] *", 3), stringsAsFactors=FALSE)
	x[, 1] = NULL
	colnames(x) = c("datetime", "event")
	x$datetime = paste0(x$datetime, ".", sprintf("%03d", seq_along(x$datetime))) # add milliseconds, to avoid duplicate timestamp
	tmp = strptime(x$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Hongkong")
	x$datetime = tmp
	x$event = gsub(pattern="INFO: ", replacement="", x=x$event)
	zevent = zoo(x[, "event", drop=FALSE], order.by=x$datetime)
	options(op)
	} else {
	stop("Need one slurm-xxx.out, but found ", length(slurmout_fn), ":\n", slurmout_fn)
	}

	slurmout_all[[jobid]] = zevent

	}

	head(zcol)
	head(zevent)


	##########################################################
	## plot CPU
	colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
	plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU usage %", xlab="min", xaxt="n", type="s")
	title(sub=paste0("job=", jobid, " node=", nodeid))
	legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))

	tick_interval = as.difftime("00:01:00") # 1 minutes
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )

	tick_interval = as.difftime("00:05:00") # 5 minutes
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)


	## plot memory
	#colsToPlot = c("[MEM]Tot", "[MEM]Used", "[MEM]Free", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
	colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
	plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), ylab="Memory /GB", xlab="min", xaxt="n", type="s")
	title(sub=paste0("job=", jobid, " node=", nodeid))
	legend(x="topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
	# legend(x="top", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot), ncol=ceiling(length(colsToPlot)/2), inset=c(0.0,-0.12), xpd=TRUE)

	tick_interval = as.difftime("00:01:00") # 1 minutes
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )

	tick_interval = as.difftime("00:05:00") # 5 minutes
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)



	##########################################################
	## plot multiple (cpu and mem) in one figure
	par(mar=c(5, 4, 5, 4)+0.1)
	colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
	plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s")
	axis(side=2)
	mtext(text="CPU %", side=2, line=2)
	legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))

	tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

	# tick_interval = as.difftime("01:00:00") # 1 hours
	# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )
	#
	# tick_interval = as.difftime("05:00:00") # 5 hours
	# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

	title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))

	colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
	par(new = TRUE)
	plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=2)
	legend(x = "topright", legend = colsToPlot, bty = "n", lty = 2, col = seq_along(colsToPlot))
	axis(side = 4)
	mtext(text="Memory /GB", side=4, line=2)


	##########################################################
	## milestone marks
	axis.POSIXct(time(zevent), side=3, at=time(zevent), labels=zevent$event, las=2, cex.axis=0.6)



	##########################################################
	## prefer simple plot
	##########################################################
	pdf("~/abruijn/abruijn_arabidopsis.pdf", width=10, height=7)
	par(mar=c(5, 4, 5, 4)+0.1)
	rowsToPlot = seq(1, nrow(zcol), by=12)
	colsToPlot = c("[CPU]Totl%")
	colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
	plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU %", xlab="", xaxt="n", type="s")
	title(sub=paste0("job=", jobid, " node=", nodeid))
	legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
	#
	# plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col="red", ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
	# axis(side=2)
	# mtext(text="CPU %", side=2, line=2)
	# legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = "red")

	tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
	tick_interval = as.difftime("06:00:00") # hours
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)

	title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))


	colsToPlot = c("[MEM]Used")
	colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
	#par(new = TRUE)
	plot.zoo(zcol[rowsToPlot, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
	legend(x = "topright", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))
	axis(side = 4)
	mtext(text="Memory /GB", side=4, line=2)

	tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
	tick_interval = as.difftime("06:00:00") # hours
	tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
	axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
	title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))

	dev.off()

	set.seed( 1) #设置随机种子
	dates<- as.Date( '2010-01-01')+ 1: 100#100个日期
	x<-round(rnorm( 100, 50, 40), 2) #随机生成X产品，100个正态分析的收盘价
	y<-round(rnorm( 100, 50, 40), 2) #随机生成Y产品，100个正态分析的收盘价
	df<-data.frame(dates,x,y)
	df
	library(ggplot2)
	library(scales)
	library(reshape2)
	df2<-melt(df,c('dates'))
	g<-ggplot(data=df2,aes(x=dates,y=value,colour=variable))
	g<-g+geom_line()
	g<-g+scale_x_date(date_breaks = "1 week",date_labels='%m-%d')
	g<-g+labs(x='date',y='Price')
	g
	# 计算差价
	df$diff<-df$x-df$y
	# 找到差价大于10时的点
	idx<-which(df$diff> 10)
	idx<-idx[-which(diff(idx)== 1)- 1]
	# 打印差价的索引值
	idx
	# 接下来，我们进行模拟交易，取第一个索引值的点，在2010-01-04时做空X，做多Y。当差价小于0在2010-01-06时，进行平仓。
	# 打印前20个数据
	head(df, 20)

	# 当差价大于10时，做空X，当差价小于0时，平仓。
	# 第4行做空，第6行平仓
	xprofit<- df$x[4]-df$x[6]; xprofit

	# 当差价大于10时，做多Y；当差价小于0时，平仓。
	# 第4行做空，第6行平仓
	yprofit<- df$y[6]-df$y[4]; yprofit

	# 这是为什么呢？
	# 根据配对交易的假设条件，如果两个金融产品的价差是收敛的，通过协整性检验的方法，我们可验证数据的收敛性。那么如果数据是收敛的，他还会具备均值回归的特性，请参考文章均值回归，逆市中的投资机会。

	# 画出X,Y的价差图，我们可以明显的看出，价差一直围绕着0上下波动，这是明显收敛的，同时符合均值回归的特性。
	plot(df$diff, type= 'l')


	x = data.frame(var01=c("十全大补丸[成分]甲、乙、丙", "含笑半步颠[成分]丁、戊、己"),
	var02=c("haha", "hehe"), stringsAsFactors=FALSE)

	gsub("\\[.*", "", x$var01)
	# converting rice gene ids

	library("biomaRt")
	listMarts(host="plants.ensembl.org")
	ensembl=useMart("plants_mart", host="plants.ensembl.org")
	x = listDatasets(ensembl)
	ensembl = useDataset("osativa_eg_gene", mart=ensembl)

	att = listAttributes(ensembl)
	att

	ensIDs = c("OS01G0232100", "OS04G0691100", "OS02G0475400")
	refseqPepIDs = c("XP_015633962.1", "XP_015635635.1", "XP_015623178.1", "XP_015614092.1", "XP_015614100.1")


	myres1 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
	filters = 'ensembl_gene_id',
	values = ensIDs,
	mart = ensembl)

	myres2 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
	filters = 'refseq_peptide',
	values = refseqPepIDs,
	mart = ensembl)
	rm(list=ls())

	##############################################################
	## summarize how many visits does each user have each day

	## simulate data
	set.seed(1)
	n = 800 # total number of requests in the data set
	uid = sample(LETTERS[1:20], n, replace=TRUE) # user id
	st = Sys.time(); st # pick a start time for the data set
	et = st + as.difftime(10, units="days") # pick an end time for the data set
	dt = et - st; #dt # overall duration of the dataset
	ev = as.difftime(sort(runif(n, 0, dt)), units="days"); ev # generate random difftime
	requesttime = st + ev; head(requesttime) # generate random start time for each request
	requestDate = as.Date(requesttime, tz=Sys.timezone()); head(requestDate) # start date
	df = data.frame(requestDate, uid, requesttime) # put them in one data frame
	head(df); dim(df)
	#df

	## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
	cutoff = as.difftime(1, units="hours")
	## dataframe 'visits' summarize how many visits a user have on each day
	visits = aggregate(df$requesttime, by=list(uid=df$uid, requestDate=df$requestDate), FUN=function(x){sum(diff(x) > cutoff) + 1}); head(visits)

	#res
	# plot(x~requestDate, data=res, type="l")
	# plot
	library(ggplot2)
	(p1 <- ggplot(data = visits, aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())

	## top frequent users
	# mean2 = mean(visits$x) * 1.2; mean2
	totalVisitsPerUser = aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){sum(x)})
	# top 10% users
	totalVisitsPerUser = totalVisitsPerUser[order(totalVisitsPerUser$x, decreasing=TRUE), ]
	topUsers = totalVisitsPerUser[1:floor(nrow(totalVisitsPerUser) * 0.10), ]
	# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)})
	# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)>mean2})

	(p1 <- ggplot(data = visits[visits$uid==topUsers$uid, ], aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())



	######################################################
	last_raw = read.table("F:/last/last_ln0", header=FALSE, stringsAsFactors=FALSE, nrows=13794)
	last_raw$time = paste(last_raw$V4, last_raw$V5, last_raw$V6, last_raw$V7); #last_raw$time
	last_raw$time = strptime(last_raw$time, format="%a %b %e %R")
	last_raw$date = as.Date(last_raw$time, tz=Sys.timezone())

	last = data.frame(time=last_raw$time, date=last_raw$date, user=last_raw$V1, tty=last_raw$V2, remote=last_raw$V3)
	#last = last[rev(seq_len(nrow(last))), ]
	last = last[order(last$time), ]
	head(last); tail(last); dim(last)
	## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
	cutoff = as.difftime(30, units="mins")
	## dataframe 'visits' summarize how many visits a user have on each day
	visits = aggregate(last$time, by=list(uid=last$user, date=last$date), FUN=function(x){sum(abs(diff(x)) > cutoff) + 1}); head(visits)

	plot(x~date, data=visits)