Stephen Turner stephenturner

## 2011-04-12 union nomismatch priority.sql
DROP TABLE IF EXISTS union_nomismatch;
CREATE TABLE union_nomismatch
SELECT "12" as chrom, pos18, pos19, if(alleles_study IS NOT NULL, alleles_study, alleles_1000g) as alleles, maf1000g, maxmaf, meanmaf, novel, offtarget, thresh, highqual, superhighqual,
    CASE
        WHEN meanmaf IS NULL THEN maf1000g
        WHEN maf1000g IS NULL THEN meanmaf
        WHEN (meanmaf IS NOT NULL and maf1000g IS NOT NULL) THEN GREATEST(meanmaf, maf1000g)
    END AS bestmaf
FROM (
    SELECT

## 2011-04-12 get flanking seq for igf1.r
## Load the BSgenome package and download the hg19 reference sequence
## http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
# source("http://www.bioconductor.org/biocLite.R")
# biocLite("BSgenome")
# biocLite("BSgenome.Hsapiens.UCSC.hg19")

library('BSgenome.Hsapiens.UCSC.hg19')
installed.genomes()

getflank <- function(position, alleles="[N/N]", chr="chr12", offset=10) {

## 2011-04-15 SNP Overlap.sql
#---------------
# Sample overlap
#---------------

# Samples where in_gwas=1. n=2503
SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

# Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;

## qqman.r
# Stephen Turner
# http://StephenTurner.us/
# http://GettingGeneticsDone.blogspot.com/
# See license at http://gettinggeneticsdone.blogspot.com/p/copyright.html

# Last updated: Tuesday, April 19, 2011
# R code for making manhattan plots and QQ plots from plink output files.
# manhattan() with GWAS data this can take a lot of memory, recommended for use on 64bit machines only, for now.
# Altnernatively, use bmanhattan() , i.e., base manhattan. uses base graphics. way faster.

## picking snps for pca.r
#-------------------------------------------------------------------------------------
#   If you split the dataset (3195094 snps) into 320 files of 10,000 snps
#   (the last file will have 5094 snps), then you can use this code to generate
#   a "map" that will map the original column numbers (1:3195094) to a file number
#   (1:1000) and a column number (1:10000).
#-------------------------------------------------------------------------------------
# nfiles <- 320
# ncolsperfile <- 10000
# ncols <- nfiles*ncolsperfile
# column <- 1:ncols

## PCA on 4k markers.r
#-----------------------------------------------------------------------------------------------
#
#  	Calculate PCs when the number of SNPs (m>4000) is large
#  	From G with SNPs on the columns, keep the first 20 PCs
#	X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
#	X'X can't be handled directly.
#	INSTEAD,
#	     	If XX'v= lv, X'XX'v = lX'v  => if find eigenv for XX' then X'v is eigenv for X'X
#		SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
#			OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant

## pca plot results.r
load("D:/2011-02-09 MEC GWAS DATA/2011-03-23 PCA/pcs-2-without-d.RData")
eigenvalues <- pcs[[2]]
pcs <- data.frame(pcs[[1]])
pcs <- cbind(query("select * from mec_genotyped"),pcs)
ht(pcs)
dbWriteTable(con, "pcs", pcs, overwrite=T)

d<-read.csv("D:/2011-02-09 MEC GWAS DATA/PHENOTYPE DATA/2011-03-23 initial data from christian.csv",T)
dbWriteTable(con, "mec_pheno", d, overwrite=T)

## 2011-04-14 GWAS Overlap.sql2011-04-14 GWAS Overlap.sql
-- Sample overlap

--  Samples where in_gwas=1. n=2503
SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

-- Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;

-- What about the missing 373?
SELECT a.* FROM igf1.sample_aapc_gwas a LEFT JOIN mec.sample_genotyped b on a.m_id=b.m_id WHERE in_gwas=1 AND b.m_id IS NULL;

## load and count whr.sql
drop table pheno4;
create table pheno4 (sampleindex int, m_id varchar(25), area varchar(1), q1_age_cohent int, q3prelim_waist int, q3prelim_hip int, age_calc float);
load data local infile 'C:/cygwin/home/sturner/mec_gwas/phenotypes/waisthip.csv'
    into table pheno4
    fields terminated by ','
    ignore 1 lines
    (sampleindex, m_id, area, q1_age_cohent, q3prelim_waist, q3prelim_hip, age_calc)
;
alter table pheno4 add column q3prelim_whr float after q3prelim_hip;
update pheno4 set q3prelim_whr=q3prelim_waist/q3prelim_hip;

## iona-pull-711-snps-priority-3.r
d <- query(" #new
SELECT
    if(b.snp='.' OR b.snp IS NULL, CONCAT_WS('_','chr12',a.pos19), b.snp) as Locus_Name,
    'SNP' as Target_Type,
    NULL as Sequence,
    a.chrom AS Chromosome,
    a.pos19 as Coordinate,
    '19' as Genome_Build_Version,
    'HSapiensReferenceSequence' as Source,
    '19' as Source_version,
	DROP TABLE IF EXISTS union_nomismatch;
	CREATE TABLE union_nomismatch
	SELECT "12" as chrom, pos18, pos19, if(alleles_study IS NOT NULL, alleles_study, alleles_1000g) as alleles, maf1000g, maxmaf, meanmaf, novel, offtarget, thresh, highqual, superhighqual,
	CASE
	WHEN meanmaf IS NULL THEN maf1000g
	WHEN maf1000g IS NULL THEN meanmaf
	WHEN (meanmaf IS NOT NULL and maf1000g IS NOT NULL) THEN GREATEST(meanmaf, maf1000g)
	END AS bestmaf
	FROM (
	SELECT
	## Load the BSgenome package and download the hg19 reference sequence
	## http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
	# source("http://www.bioconductor.org/biocLite.R")
	# biocLite("BSgenome")
	# biocLite("BSgenome.Hsapiens.UCSC.hg19")

	library('BSgenome.Hsapiens.UCSC.hg19')
	installed.genomes()

	getflank <- function(position, alleles="[N/N]", chr="chr12", offset=10) {
	#---------------
	# Sample overlap
	#---------------

	# Samples where in_gwas=1. n=2503
	SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

	# Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
	SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;
	# Stephen Turner
	# http://StephenTurner.us/
	# http://GettingGeneticsDone.blogspot.com/
	# See license at http://gettinggeneticsdone.blogspot.com/p/copyright.html

	# Last updated: Tuesday, April 19, 2011
	# R code for making manhattan plots and QQ plots from plink output files.
	# manhattan() with GWAS data this can take a lot of memory, recommended for use on 64bit machines only, for now.
	# Altnernatively, use bmanhattan() , i.e., base manhattan. uses base graphics. way faster.
	#-------------------------------------------------------------------------------------
	# If you split the dataset (3195094 snps) into 320 files of 10,000 snps
	# (the last file will have 5094 snps), then you can use this code to generate
	# a "map" that will map the original column numbers (1:3195094) to a file number
	# (1:1000) and a column number (1:10000).
	#-------------------------------------------------------------------------------------
	# nfiles <- 320
	# ncolsperfile <- 10000
	# ncols <- nfiles*ncolsperfile
	# column <- 1:ncols
	#-----------------------------------------------------------------------------------------------
	#
	# Calculate PCs when the number of SNPs (m>4000) is large
	# From G with SNPs on the columns, keep the first 20 PCs
	# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
	# X'X can't be handled directly.
	# INSTEAD,
	# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
	# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
	# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant
	load("D:/2011-02-09 MEC GWAS DATA/2011-03-23 PCA/pcs-2-without-d.RData")
	eigenvalues <- pcs[[2]]
	pcs <- data.frame(pcs[[1]])
	pcs <- cbind(query("select * from mec_genotyped"),pcs)
	ht(pcs)
	dbWriteTable(con, "pcs", pcs, overwrite=T)

	d<-read.csv("D:/2011-02-09 MEC GWAS DATA/PHENOTYPE DATA/2011-03-23 initial data from christian.csv",T)
	dbWriteTable(con, "mec_pheno", d, overwrite=T)
	-- Sample overlap

	-- Samples where in_gwas=1. n=2503
	SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

	-- Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
	SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;

	-- What about the missing 373?
	SELECT a.* FROM igf1.sample_aapc_gwas a LEFT JOIN mec.sample_genotyped b on a.m_id=b.m_id WHERE in_gwas=1 AND b.m_id IS NULL;
	drop table pheno4;
	create table pheno4 (sampleindex int, m_id varchar(25), area varchar(1), q1_age_cohent int, q3prelim_waist int, q3prelim_hip int, age_calc float);
	load data local infile 'C:/cygwin/home/sturner/mec_gwas/phenotypes/waisthip.csv'
	into table pheno4
	fields terminated by ','
	ignore 1 lines
	(sampleindex, m_id, area, q1_age_cohent, q3prelim_waist, q3prelim_hip, age_calc)
	;
	alter table pheno4 add column q3prelim_whr float after q3prelim_hip;
	update pheno4 set q3prelim_whr=q3prelim_waist/q3prelim_hip;
	d <- query(" #new
	SELECT
	if(b.snp='.' OR b.snp IS NULL, CONCAT_WS('_','chr12',a.pos19), b.snp) as Locus_Name,
	'SNP' as Target_Type,
	NULL as Sequence,
	a.chrom AS Chromosome,
	a.pos19 as Coordinate,
	'19' as Genome_Build_Version,
	'HSapiensReferenceSequence' as Source,
	'19' as Source_version,