Skip to content

Instantly share code, notes, and snippets.

View stephenturner's full-sized avatar

Stephen Turner stephenturner

View GitHub Profile
@stephenturner
stephenturner / 2011-04-12 union nomismatch priority.sql
Created April 13, 2011 00:21
2011-04-12 union nomismatch prioritypriority.sql
DROP TABLE IF EXISTS union_nomismatch;
CREATE TABLE union_nomismatch
SELECT "12" as chrom, pos18, pos19, if(alleles_study IS NOT NULL, alleles_study, alleles_1000g) as alleles, maf1000g, maxmaf, meanmaf, novel, offtarget, thresh, highqual, superhighqual,
CASE
WHEN meanmaf IS NULL THEN maf1000g
WHEN maf1000g IS NULL THEN meanmaf
WHEN (meanmaf IS NOT NULL and maf1000g IS NOT NULL) THEN GREATEST(meanmaf, maf1000g)
END AS bestmaf
FROM (
SELECT
@stephenturner
stephenturner / 2011-04-12 get flanking seq for igf1.r
Created April 13, 2011 01:24
2011-04-12 get flanking seq for igf1.r
## Load the BSgenome package and download the hg19 reference sequence
## http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
# source("http://www.bioconductor.org/biocLite.R")
# biocLite("BSgenome")
# biocLite("BSgenome.Hsapiens.UCSC.hg19")
library('BSgenome.Hsapiens.UCSC.hg19')
installed.genomes()
getflank <- function(position, alleles="[N/N]", chr="chr12", offset=10) {
@stephenturner
stephenturner / 2011-04-15 SNP Overlap.sql
Created April 15, 2011 22:33
2011-04-15 SNP Overlap.sql
#---------------
# Sample overlap
#---------------
# Samples where in_gwas=1. n=2503
SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;
# Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;
# Stephen Turner
# http://StephenTurner.us/
# http://GettingGeneticsDone.blogspot.com/
# See license at http://gettinggeneticsdone.blogspot.com/p/copyright.html
# Last updated: Tuesday, April 19, 2011
# R code for making manhattan plots and QQ plots from plink output files.
# manhattan() with GWAS data this can take a lot of memory, recommended for use on 64bit machines only, for now.
# Altnernatively, use bmanhattan() , i.e., base manhattan. uses base graphics. way faster.
@stephenturner
stephenturner / picking snps for pca.r
Created April 20, 2011 00:44
picking snps for pca.r
#-------------------------------------------------------------------------------------
# If you split the dataset (3195094 snps) into 320 files of 10,000 snps
# (the last file will have 5094 snps), then you can use this code to generate
# a "map" that will map the original column numbers (1:3195094) to a file number
# (1:1000) and a column number (1:10000).
#-------------------------------------------------------------------------------------
# nfiles <- 320
# ncolsperfile <- 10000
# ncols <- nfiles*ncolsperfile
# column <- 1:ncols
@stephenturner
stephenturner / PCA on 4k markers.r
Created April 20, 2011 00:49
PCA on 4k markers.r
#-----------------------------------------------------------------------------------------------
#
# Calculate PCs when the number of SNPs (m>4000) is large
# From G with SNPs on the columns, keep the first 20 PCs
# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
# X'X can't be handled directly.
# INSTEAD,
# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant
@stephenturner
stephenturner / pca plot results.r
Created April 20, 2011 00:50
pca plot results.r
load("D:/2011-02-09 MEC GWAS DATA/2011-03-23 PCA/pcs-2-without-d.RData")
eigenvalues <- pcs[[2]]
pcs <- data.frame(pcs[[1]])
pcs <- cbind(query("select * from mec_genotyped"),pcs)
ht(pcs)
dbWriteTable(con, "pcs", pcs, overwrite=T)
d<-read.csv("D:/2011-02-09 MEC GWAS DATA/PHENOTYPE DATA/2011-03-23 initial data from christian.csv",T)
dbWriteTable(con, "mec_pheno", d, overwrite=T)
-- Sample overlap
-- Samples where in_gwas=1. n=2503
SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;
-- Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;
-- What about the missing 373?
SELECT a.* FROM igf1.sample_aapc_gwas a LEFT JOIN mec.sample_genotyped b on a.m_id=b.m_id WHERE in_gwas=1 AND b.m_id IS NULL;
@stephenturner
stephenturner / load and count whr.sql
Created June 4, 2011 02:03
load and count whr.sql
drop table pheno4;
create table pheno4 (sampleindex int, m_id varchar(25), area varchar(1), q1_age_cohent int, q3prelim_waist int, q3prelim_hip int, age_calc float);
load data local infile 'C:/cygwin/home/sturner/mec_gwas/phenotypes/waisthip.csv'
into table pheno4
fields terminated by ','
ignore 1 lines
(sampleindex, m_id, area, q1_age_cohent, q3prelim_waist, q3prelim_hip, age_calc)
;
alter table pheno4 add column q3prelim_whr float after q3prelim_hip;
update pheno4 set q3prelim_whr=q3prelim_waist/q3prelim_hip;
@stephenturner
stephenturner / iona-pull-711-snps-priority-3.r
Created June 21, 2011 18:55
iona-pull-711-snps-priority-3.r
d <- query(" #new
SELECT
if(b.snp='.' OR b.snp IS NULL, CONCAT_WS('_','chr12',a.pos19), b.snp) as Locus_Name,
'SNP' as Target_Type,
NULL as Sequence,
a.chrom AS Chromosome,
a.pos19 as Coordinate,
'19' as Genome_Build_Version,
'HSapiensReferenceSequence' as Source,
'19' as Source_version,