Skip to content

Instantly share code, notes, and snippets.

@hliang
Last active June 6, 2018 02:12
Show Gist options
  • Save hliang/d2aa8b6e07d136492c81b452cc02b795 to your computer and use it in GitHub Desktop.
Save hliang/d2aa8b6e07d136492c81b452cc02b795 to your computer and use it in GitHub Desktop.
misc R scripts
library(zoo)
# library("lubridate")
collectl_fn = list.files("D:/tmp/abruijn/", patter=".*.p.txt", full.names = TRUE); collectl_fn
zcolall = list()
slurmout_all = list()
for (i in seq_along(collectl_fn)) {
jobid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\1", collectl_fn[i])
nodeid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\2", collectl_fn[i])
dcol = read.table(collectl_fn[i])
colnames(dcol) = unlist(strsplit("Date Time [CPU]User% [CPU]Nice% [CPU]Sys% [CPU]Wait% [CPU]Irq% [CPU]Soft% [CPU]Steal% [CPU]Idle% [CPU]Totl% [CPU]Guest% [CPU]GuestN% [CPU]Intrpt/sec [CPU]Ctx/sec [CPU]Proc/sec [CPU]ProcQue [CPU]ProcRun [CPU]L-Avg1 [CPU]L-Avg5 [CPU]L-Avg15 [CPU]RunTot [CPU]BlkTot [MEM]Tot [MEM]Used [MEM]Free [MEM]Shared [MEM]Buf [MEM]Cached [MEM]Slab [MEM]Map [MEM]Anon [MEM]AnonH [MEM]Commit [MEM]Locked [MEM]SwapTot [MEM]SwapUsed [MEM]SwapFree [MEM]SwapIn [MEM]SwapOut [MEM]Dirty [MEM]Clean [MEM]Laundry [MEM]Inactive [MEM]PageIn [MEM]PageOut [MEM]PageFaults [MEM]PageMajFaults [MEM]HugeTotal [MEM]HugeFree [MEM]HugeRsvd [MEM]SUnreclaim [NET]RxPktTot [NET]TxPktTot [NET]RxKBTot [NET]TxKBTot [NET]RxCmpTot [NET]RxMltTot [NET]TxCmpTot [NET]RxErrsTot [NET]TxErrsTot [DSK]ReadTot [DSK]WriteTot [DSK]OpsTot [DSK]ReadKBTot [DSK]WriteKBTot [DSK]KbTot [DSK]ReadMrgTot [DSK]WriteMrgTot [DSK]MrgTot", " "))
head(dcol)
dcol$datetime = paste(dcol$Date, dcol$Time)
dcol$datetime = strptime(dcol$datetime, "%Y%m%d %H:%M:%S", tz = "Hongkong")
#dcol$datetime = with_tz(dcol$datetime, tzone = "GMT")
x = dcol
x[, c("Date", "Time", "datetime")] = NULL
zcol = zoo(x, order.by=dcol$datetime)
zcolall[[jobid]] = zcol
## slurm-xxx.out
slurmout_fn = list.files("D:/tmp/abruijn/", patter=paste0("slurm-", jobid, ".out"), full.names = TRUE); slurmout_fn
if(length(slurmout_fn)==1) {
library(stringr)
op <- options(digits.secs=3)
x = scan(slurmout_fn, what="", sep="\n")
x = grep(pattern="INFO", x=x, value=TRUE)
#x = data.frame(str_split_fixed(x, " ", 4), stringsAsFactors=FALSE)
x = data.frame(str_split_fixed(x, "[\\[\\]] *", 3), stringsAsFactors=FALSE)
x[, 1] = NULL
colnames(x) = c("datetime", "event")
x$datetime = paste0(x$datetime, ".", sprintf("%03d", seq_along(x$datetime))) # add milliseconds, to avoid duplicate timestamp
tmp = strptime(x$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Hongkong")
x$datetime = tmp
x$event = gsub(pattern="INFO: ", replacement="", x=x$event)
zevent = zoo(x[, "event", drop=FALSE], order.by=x$datetime)
options(op)
} else {
stop("Need one slurm-xxx.out, but found ", length(slurmout_fn), ":\n", slurmout_fn)
}
slurmout_all[[jobid]] = zevent
}
head(zcol)
head(zevent)
##########################################################
## plot CPU
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU usage %", xlab="min", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
tick_interval = as.difftime("00:01:00") # 1 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )
tick_interval = as.difftime("00:05:00") # 5 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
## plot memory
#colsToPlot = c("[MEM]Tot", "[MEM]Used", "[MEM]Free", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), ylab="Memory /GB", xlab="min", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x="topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
# legend(x="top", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot), ncol=ceiling(length(colsToPlot)/2), inset=c(0.0,-0.12), xpd=TRUE)
tick_interval = as.difftime("00:01:00") # 1 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )
tick_interval = as.difftime("00:05:00") # 5 minutes
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
##########################################################
## plot multiple (cpu and mem) in one figure
par(mar=c(5, 4, 5, 4)+0.1)
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s")
axis(side=2)
mtext(text="CPU %", side=2, line=2)
legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
# tick_interval = as.difftime("01:00:00") # 1 hours
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 )
#
# tick_interval = as.difftime("05:00:00") # 5 hours
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
par(new = TRUE)
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=2)
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 2, col = seq_along(colsToPlot))
axis(side = 4)
mtext(text="Memory /GB", side=4, line=2)
##########################################################
## milestone marks
axis.POSIXct(time(zevent), side=3, at=time(zevent), labels=zevent$event, las=2, cex.axis=0.6)
##########################################################
## prefer simple plot
##########################################################
pdf("~/abruijn/abruijn_arabidopsis.pdf", width=10, height=7)
par(mar=c(5, 4, 5, 4)+0.1)
rowsToPlot = seq(1, nrow(zcol), by=12)
colsToPlot = c("[CPU]Totl%")
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%")
plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU %", xlab="", xaxt="n", type="s")
title(sub=paste0("job=", jobid, " node=", nodeid))
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot))
#
# plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col="red", ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
# axis(side=2)
# mtext(text="CPU %", side=2, line=2)
# legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = "red")
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_interval = as.difftime("06:00:00") # hours
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))
colsToPlot = c("[MEM]Used")
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached")
#par(new = TRUE)
plot.zoo(zcol[rowsToPlot, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1)
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot))
axis(side = 4)
mtext(text="Memory /GB", side=4, line=2)
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units"))
tick_interval = as.difftime("06:00:00") # hours
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5)
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units")))
dev.off()
set.seed( 1) #设置随机种子
dates<- as.Date( '2010-01-01')+ 1: 100#100个日期
x<-round(rnorm( 100, 50, 40), 2) #随机生成X产品,100个正态分析的收盘价
y<-round(rnorm( 100, 50, 40), 2) #随机生成Y产品,100个正态分析的收盘价
df<-data.frame(dates,x,y)
df
library(ggplot2)
library(scales)
library(reshape2)
df2<-melt(df,c('dates'))
g<-ggplot(data=df2,aes(x=dates,y=value,colour=variable))
g<-g+geom_line()
g<-g+scale_x_date(date_breaks = "1 week",date_labels='%m-%d')
g<-g+labs(x='date',y='Price')
g
# 计算差价
df$diff<-df$x-df$y
# 找到差价大于10时的点
idx<-which(df$diff> 10)
idx<-idx[-which(diff(idx)== 1)- 1]
# 打印差价的索引值
idx
# 接下来,我们进行模拟交易,取第一个索引值的点,在2010-01-04时做空X,做多Y。当差价小于0在2010-01-06时,进行平仓。
# 打印前20个数据
head(df, 20)
# 当差价大于10时,做空X,当差价小于0时,平仓。
# 第4行做空,第6行平仓
xprofit<- df$x[4]-df$x[6]; xprofit
# 当差价大于10时,做多Y;当差价小于0时,平仓。
# 第4行做空,第6行平仓
yprofit<- df$y[6]-df$y[4]; yprofit
# 这是为什么呢?
# 根据配对交易的假设条件,如果两个金融产品的价差是收敛的,通过协整性检验的方法,我们可验证数据的收敛性。那么如果数据是收敛的,他还会具备均值回归的特性,请参考文章 均值回归,逆市中的投资机会。
# 画出X,Y的价差图,我们可以明显的看出,价差一直围绕着0上下波动,这是明显收敛的,同时符合均值回归的特性。
plot(df$diff, type= 'l')
x = data.frame(var01=c("十全大补丸[成分]甲、乙、丙", "含笑半步颠[成分]丁、戊、己"),
var02=c("haha", "hehe"), stringsAsFactors=FALSE)
gsub("\\[.*", "", x$var01)
# converting rice gene ids
library("biomaRt")
listMarts(host="plants.ensembl.org")
ensembl=useMart("plants_mart", host="plants.ensembl.org")
x = listDatasets(ensembl)
ensembl = useDataset("osativa_eg_gene", mart=ensembl)
att = listAttributes(ensembl)
att
ensIDs = c("OS01G0232100", "OS04G0691100", "OS02G0475400")
refseqPepIDs = c("XP_015633962.1", "XP_015635635.1", "XP_015623178.1", "XP_015614092.1", "XP_015614100.1")
myres1 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
filters = 'ensembl_gene_id',
values = ensIDs,
mart = ensembl)
myres2 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'),
filters = 'refseq_peptide',
values = refseqPepIDs,
mart = ensembl)
rm(list=ls())
##############################################################
## summarize how many visits does each user have each day
## simulate data
set.seed(1)
n = 800 # total number of requests in the data set
uid = sample(LETTERS[1:20], n, replace=TRUE) # user id
st = Sys.time(); st # pick a start time for the data set
et = st + as.difftime(10, units="days") # pick an end time for the data set
dt = et - st; #dt # overall duration of the dataset
ev = as.difftime(sort(runif(n, 0, dt)), units="days"); ev # generate random difftime
requesttime = st + ev; head(requesttime) # generate random start time for each request
requestDate = as.Date(requesttime, tz=Sys.timezone()); head(requestDate) # start date
df = data.frame(requestDate, uid, requesttime) # put them in one data frame
head(df); dim(df)
#df
## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
cutoff = as.difftime(1, units="hours")
## dataframe 'visits' summarize how many visits a user have on each day
visits = aggregate(df$requesttime, by=list(uid=df$uid, requestDate=df$requestDate), FUN=function(x){sum(diff(x) > cutoff) + 1}); head(visits)
#res
# plot(x~requestDate, data=res, type="l")
# plot
library(ggplot2)
(p1 <- ggplot(data = visits, aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())
## top frequent users
# mean2 = mean(visits$x) * 1.2; mean2
totalVisitsPerUser = aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){sum(x)})
# top 10% users
totalVisitsPerUser = totalVisitsPerUser[order(totalVisitsPerUser$x, decreasing=TRUE), ]
topUsers = totalVisitsPerUser[1:floor(nrow(totalVisitsPerUser) * 0.10), ]
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)})
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)>mean2})
(p1 <- ggplot(data = visits[visits$uid==topUsers$uid, ], aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point())
######################################################
last_raw = read.table("F:/last/last_ln0", header=FALSE, stringsAsFactors=FALSE, nrows=13794)
last_raw$time = paste(last_raw$V4, last_raw$V5, last_raw$V6, last_raw$V7); #last_raw$time
last_raw$time = strptime(last_raw$time, format="%a %b %e %R")
last_raw$date = as.Date(last_raw$time, tz=Sys.timezone())
last = data.frame(time=last_raw$time, date=last_raw$date, user=last_raw$V1, tty=last_raw$V2, remote=last_raw$V3)
#last = last[rev(seq_len(nrow(last))), ]
last = last[order(last$time), ]
head(last); tail(last); dim(last)
## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit
cutoff = as.difftime(30, units="mins")
## dataframe 'visits' summarize how many visits a user have on each day
visits = aggregate(last$time, by=list(uid=last$user, date=last$date), FUN=function(x){sum(abs(diff(x)) > cutoff) + 1}); head(visits)
plot(x~date, data=visits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment