hakyim/miscel.R

## miscel.R
## loglik test
logLik.test = function(fit0,fit)
{
  l0 = logLik(fit0)
  l1 = logLik(fit)
  lratio = abs(as.numeric(l0)-as.numeric(l1))
  df = abs( attr(l1,'df') - attr(l0,'df') )
  pval = 1-pchisq( 2*lratio, df)
  return(pval)
}


## inverse normalization
invnorm = function(x) {
  if(is.null(dim(x))) res = invnorm.vector(x) else
  res=apply(x,2,invnorm.vector)
  res
}
invnorm.vector = function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}

## impute with column means
impute.col.means = function(df)
{
  meany = apply(df,2,mean,na.rm=TRUE)
  for(col in 1:ncol(df)) df[,col][is.na(df[,col])] = meany[col]
  df
}

## drop repeated vars but keeping first row for given var
## may be better to use top_n()
## example: widetempo %>% group_by(gene_id) %>% top_n(1,rank1)
## but when there are ties, top_n() will take all rows rather than 1
droprep = function(df,repvar,sortvar,decreasing=FALSE)
{
  if(repvar %in% names(df))
  {
    if(sortvar %in% names(df))
    {
      tab = table(df[[repvar]])
      nametab = names(tab)
      uniquelist = nametab[tab==1]
      replist =  nametab[tab>1]
      ind = df[[repvar]] %in% uniquelist
      res = df[ind,]
      tempo = df[!ind,]
      tempo = tempo[order(tempo[[sortvar]],decreasing=decreasing),]
      for(vv in replist)
      {
        rowa = tempo[tempo[[repvar]]==vv,][1,]
        res = rbind(res,rowa)
      }
      return(res)
    } else {warning(sortvar %&% ' (sortvar) not in data frame')}
  } else {warning(repvar %&% ' (repvar) not in data frame')}
}

## HLA genes
## HLA Diversity in the 1000 Genomes Dataset
## http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0097282
## Table S1: MHC region definitions
## MHC Sub-region    Number of variants    % of  variants    SNPs    Gene    Position
## From     to    From    to    From    To
## Ext_CLASS_I    11,180    11%    rs3130838    rs1122947    TRIM27    MOG    28,866,528    29,638,434
## CLASS_I    42,718    42%    rs375984    rs3131630    ZFP57    MICB    29,644,502    31,485,354
## CLASS_III    8,033    8%    rs4959079    rs416352    MCCD1    NOTCH4    31,488,879    32,207,393
## CLASS_II    29,906    29%    rs424232    rs3129223    AK123889    HLA-DPB2    32,208,324    33,113,197
## Ext_CLASS_II    9,403    10%    rs1003979    rs1547668    COL11A2    MLN    33,114,171    33,775,446
## Total    103,310    100%    DB snp    UCSC    UCSC HG19/Build37
# hla.genelist = geneannot %>% filter(chr=='6') %>% filter(start > 28866528 & end < 33775446) %>% .[["ensgene"]]
	## loglik test
	logLik.test = function(fit0,fit)
	{
	l0 = logLik(fit0)
	l1 = logLik(fit)
	lratio = abs(as.numeric(l0)-as.numeric(l1))
	df = abs( attr(l1,'df') - attr(l0,'df') )
	pval = 1-pchisq( 2*lratio, df)
	return(pval)
	}



	## inverse normalization
	invnorm = function(x) {
	if(is.null(dim(x))) res = invnorm.vector(x) else
	res=apply(x,2,invnorm.vector)
	res
	}
	invnorm.vector = function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}

	## impute with column means
	impute.col.means = function(df)
	{
	meany = apply(df,2,mean,na.rm=TRUE)
	for(col in 1:ncol(df)) df[,col][is.na(df[,col])] = meany[col]
	df
	}

	## drop repeated vars but keeping first row for given var
	## may be better to use top_n()
	## example: widetempo %>% group_by(gene_id) %>% top_n(1,rank1)
	## but when there are ties, top_n() will take all rows rather than 1
	droprep = function(df,repvar,sortvar,decreasing=FALSE)
	{
	if(repvar %in% names(df))
	{
	if(sortvar %in% names(df))
	{
	tab = table(df[[repvar]])
	nametab = names(tab)
	uniquelist = nametab[tab==1]
	replist = nametab[tab>1]
	ind = df[[repvar]] %in% uniquelist
	res = df[ind,]
	tempo = df[!ind,]
	tempo = tempo[order(tempo[[sortvar]],decreasing=decreasing),]
	for(vv in replist)
	{
	rowa = tempo[tempo[[repvar]]==vv,][1,]
	res = rbind(res,rowa)
	}
	return(res)
	} else {warning(sortvar %&% ' (sortvar) not in data frame')}
	} else {warning(repvar %&% ' (repvar) not in data frame')}
	}

	## HLA genes
	## HLA Diversity in the 1000 Genomes Dataset
	## http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0097282
	## Table S1: MHC region definitions
	## MHC Sub-region Number of variants % of variants SNPs Gene Position
	## From to From to From To
	## Ext_CLASS_I 11,180 11% rs3130838 rs1122947 TRIM27 MOG 28,866,528 29,638,434
	## CLASS_I 42,718 42% rs375984 rs3131630 ZFP57 MICB 29,644,502 31,485,354
	## CLASS_III 8,033 8% rs4959079 rs416352 MCCD1 NOTCH4 31,488,879 32,207,393
	## CLASS_II 29,906 29% rs424232 rs3129223 AK123889 HLA-DPB2 32,208,324 33,113,197
	## Ext_CLASS_II 9,403 10% rs1003979 rs1547668 COL11A2 MLN 33,114,171 33,775,446
	## Total 103,310 100% DB snp UCSC UCSC HG19/Build37
	# hla.genelist = geneannot %>% filter(chr=='6') %>% filter(start > 28866528 & end < 33775446) %>% .[["ensgene"]]