John McCallum 15 July 2016
- work out tidiest way to split up haplotypes from phased vcf in data.table for plotting haplotypes
library(data.table)
## Warning: package 'data.table' was built under R version 3.1.3
library(splitstackshape)
## Warning: package 'splitstackshape' was built under R version 3.1.2
DT <- fread('~/Dropbox/test.phased.vcf')
DT[1:5]
## #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CK51_02 CK51_05
## 1: CHR9 7 . G A . PASS . GT 0|1 0|0
## 2: CHR9 15 . AACCC AACA,CAGCA . PASS . GT 2|0 2|0
## 3: CHR9 20 . AG AA . PASS . GT 0|0 0|0
## 4: CHR9 27 . G C . PASS . GT 0|0 0|0
## 5: CHR9 36 . C T . PASS . GT 0|0 0|0
## CK51_06 CK51_09 CK51_11 Hort16A Hort22d Russell T03.51-11-28f
## 1: 0|0 0|0 0|0 0|0 0|0 0|0 0|0
## 2: 0|1 0|1 1|0 1|0 1|0 0|1 1|0
## 3: 0|1 0|1 1|0 1|0 1|0 0|1 0|0
## 4: 0|1 0|1 1|0 0|0 1|0 0|1 0|0
## 5: 0|1 1|0 1|0 0|1 1|0 1|0 0|0
## T94.30-03-10f T94.30-04-08b T94.30-04-08c T94.30-04-08d
## 1: 0|0 0|0 0|0 0|0
## 2: 0|0 1|0 0|1 1|0
## 3: 0|0 1|0 0|1 1|0
## 4: 0|0 1|0 0|0 1|0
## 5: 1|0 1|0 1|0 1|0
Want to use tstrsplit to split by pipe
DT[1:5,tstrsplit(Hort22d,"|",fixed=TRUE)]
## V1 V2
## 1: 0 0
## 2: 1 0
## 3: 1 0
## 4: 1 0
## 5: 1 0
- suss out how to use .SD & .SDcols using print
DT[1:5,print(.SD),.SDcols=c(10:ncol(DT))]
## CK51_02 CK51_05 CK51_06 CK51_09 CK51_11 Hort16A Hort22d Russell
## 1: 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0
## 2: 2|0 2|0 0|1 0|1 1|0 1|0 1|0 0|1
## 3: 0|0 0|0 0|1 0|1 1|0 1|0 1|0 0|1
## 4: 0|0 0|0 0|1 0|1 1|0 0|0 1|0 0|1
## 5: 0|0 0|0 0|1 1|0 1|0 0|1 1|0 1|0
## T03.51-11-28f T94.30-03-10f T94.30-04-08b T94.30-04-08c T94.30-04-08d
## 1: 0|0 0|0 0|0 0|0 0|0
## 2: 1|0 0|0 1|0 0|1 1|0
## 3: 0|0 0|0 1|0 0|1 1|0
## 4: 0|0 0|0 1|0 0|0 1|0
## 5: 0|0 1|0 1|0 1|0 1|0
## NULL
ncol(DT)
## [1] 22
- try this out then drop the new columns
DT[1:5,c('Hort22d_1','Hort22d_2') := tstrsplit(Hort22d,"|",fixed=TRUE)][1:5]
## #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CK51_02 CK51_05
## 1: CHR9 7 . G A . PASS . GT 0|1 0|0
## 2: CHR9 15 . AACCC AACA,CAGCA . PASS . GT 2|0 2|0
## 3: CHR9 20 . AG AA . PASS . GT 0|0 0|0
## 4: CHR9 27 . G C . PASS . GT 0|0 0|0
## 5: CHR9 36 . C T . PASS . GT 0|0 0|0
## CK51_06 CK51_09 CK51_11 Hort16A Hort22d Russell T03.51-11-28f
## 1: 0|0 0|0 0|0 0|0 0|0 0|0 0|0
## 2: 0|1 0|1 1|0 1|0 1|0 0|1 1|0
## 3: 0|1 0|1 1|0 1|0 1|0 0|1 0|0
## 4: 0|1 0|1 1|0 0|0 1|0 0|1 0|0
## 5: 0|1 1|0 1|0 0|1 1|0 1|0 0|0
## T94.30-03-10f T94.30-04-08b T94.30-04-08c T94.30-04-08d Hort22d_1
## 1: 0|0 0|0 0|0 0|0 0
## 2: 0|0 1|0 0|1 1|0 1
## 3: 0|0 1|0 0|1 1|0 1
## 4: 0|0 1|0 0|0 1|0 1
## 5: 1|0 1|0 1|0 1|0 1
## Hort22d_2
## 1: 0
## 2: 0
## 3: 0
## 4: 0
## 5: 0
DT[,c('Hort22d_1','Hort22d_2') := NULL]
- generate hap names like so..
samples <- names(DT)[10:ncol(DT)]
paste0(rep(samples, each=2), "_", c("1", "2"))
## [1] "CK51_02_1" "CK51_02_2" "CK51_05_1"
## [4] "CK51_05_2" "CK51_06_1" "CK51_06_2"
## [7] "CK51_09_1" "CK51_09_2" "CK51_11_1"
## [10] "CK51_11_2" "Hort16A_1" "Hort16A_2"
## [13] "Hort22d_1" "Hort22d_2" "Russell_1"
## [16] "Russell_2" "T03.51-11-28f_1" "T03.51-11-28f_2"
## [19] "T94.30-03-10f_1" "T94.30-03-10f_2" "T94.30-04-08b_1"
## [22] "T94.30-04-08b_2" "T94.30-04-08c_1" "T94.30-04-08c_2"
## [25] "T94.30-04-08d_1" "T94.30-04-08d_2"
DT[,paste0(rep(samples, each = 2),'_', 1:2) := sapply(.SD, tstrsplit, "|",fixed='TRUE'), .SDcols = samples]
DT[1:5,-samples,with=FALSE]
## #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CK51_02_1
## 1: CHR9 7 . G A . PASS . GT 0
## 2: CHR9 15 . AACCC AACA,CAGCA . PASS . GT 2
## 3: CHR9 20 . AG AA . PASS . GT 0
## 4: CHR9 27 . G C . PASS . GT 0
## 5: CHR9 36 . C T . PASS . GT 0
## CK51_02_2 CK51_05_1 CK51_05_2 CK51_06_1 CK51_06_2 CK51_09_1 CK51_09_2
## 1: 1 0 0 0 0 0 0
## 2: 0 2 0 0 1 0 1
## 3: 0 0 0 0 1 0 1
## 4: 0 0 0 0 1 0 1
## 5: 0 0 0 0 1 1 0
## CK51_11_1 CK51_11_2 Hort16A_1 Hort16A_2 Hort22d_1 Hort22d_2 Russell_1
## 1: 0 0 0 0 0 0 0
## 2: 1 0 1 0 1 0 0
## 3: 1 0 1 0 1 0 0
## 4: 1 0 0 0 1 0 0
## 5: 1 0 0 1 1 0 1
## Russell_2 T03.51-11-28f_1 T03.51-11-28f_2 T94.30-03-10f_1
## 1: 0 0 0 0
## 2: 1 1 0 0
## 3: 1 0 0 0
## 4: 1 0 0 0
## 5: 0 0 0 1
## T94.30-03-10f_2 T94.30-04-08b_1 T94.30-04-08b_2 T94.30-04-08c_1
## 1: 0 0 0 0
## 2: 0 1 0 0
## 3: 0 1 0 0
## 4: 0 1 0 0
## 5: 0 1 0 1
## T94.30-04-08c_2 T94.30-04-08d_1 T94.30-04-08d_2
## 1: 0 0 0
## 2: 1 1 0
## 3: 1 1 0
## 4: 0 1 0
## 5: 0 1 0
see http://stackoverflow.com/questions/31999002/split-string-in-each-column-for-several-columns
easy-peasy
cSplit(DT[1:5], samples, sep = "|", type.convert = FALSE)
## #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CK51_02_1
## 1: CHR9 7 . G A . PASS . GT 0
## 2: CHR9 15 . AACCC AACA,CAGCA . PASS . GT 2
## 3: CHR9 20 . AG AA . PASS . GT 0
## 4: CHR9 27 . G C . PASS . GT 0
## 5: CHR9 36 . C T . PASS . GT 0
## CK51_02_2 CK51_05_1 CK51_05_2 CK51_06_1 CK51_06_2 CK51_09_1 CK51_09_2
## 1: 1 0 0 0 0 0 0
## 2: 0 2 0 0 1 0 1
## 3: 0 0 0 0 1 0 1
## 4: 0 0 0 0 1 0 1
## 5: 0 0 0 0 1 1 0
## CK51_11_1 CK51_11_2 Hort16A_1 Hort16A_2 Hort22d_1 Hort22d_2 Russell_1
## 1: 0 0 0 0 0 0 0
## 2: 1 0 1 0 1 0 0
## 3: 1 0 1 0 1 0 0
## 4: 1 0 0 0 1 0 0
## 5: 1 0 0 1 1 0 1
## Russell_2 T03.51-11-28f_1 T03.51-11-28f_2 T94.30-03-10f_1
## 1: 0 0 0 0
## 2: 1 1 0 0
## 3: 1 0 0 0
## 4: 1 0 0 0
## 5: 0 0 0 1
## T94.30-03-10f_2 T94.30-04-08b_1 T94.30-04-08b_2 T94.30-04-08c_1
## 1: 0 0 0 0
## 2: 0 1 0 0
## 3: 0 1 0 0
## 4: 0 1 0 0
## 5: 0 1 0 1
## T94.30-04-08c_2 T94.30-04-08d_1 T94.30-04-08d_2 CK51_02_1 CK51_02_2
## 1: 0 0 0 0 1
## 2: 1 1 0 2 0
## 3: 1 1 0 0 0
## 4: 0 1 0 0 0
## 5: 0 1 0 0 0
## CK51_05_1 CK51_05_2 CK51_06_1 CK51_06_2 CK51_09_1 CK51_09_2 CK51_11_1
## 1: 0 0 0 0 0 0 0
## 2: 2 0 0 1 0 1 1
## 3: 0 0 0 1 0 1 1
## 4: 0 0 0 1 0 1 1
## 5: 0 0 0 1 1 0 1
## CK51_11_2 Hort16A_1 Hort16A_2 Hort22d_1 Hort22d_2 Russell_1 Russell_2
## 1: 0 0 0 0 0 0 0
## 2: 0 1 0 1 0 0 1
## 3: 0 1 0 1 0 0 1
## 4: 0 0 0 1 0 0 1
## 5: 0 0 1 1 0 1 0
## T03.51-11-28f_1 T03.51-11-28f_2 T94.30-03-10f_1 T94.30-03-10f_2
## 1: 0 0 0 0
## 2: 1 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 1 0
## T94.30-04-08b_1 T94.30-04-08b_2 T94.30-04-08c_1 T94.30-04-08c_2
## 1: 0 0 0 0
## 2: 1 0 0 1
## 3: 1 0 0 1
## 4: 1 0 0 0
## 5: 1 0 1 0
## T94.30-04-08d_1 T94.30-04-08d_2
## 1: 0 0
## 2: 1 0
## 3: 1 0
## 4: 1 0
## 5: 1 0