Last active
July 19, 2023 13:46
-
-
Save opplatek/a7fa48b68a42f7f88ded068937ef5b23 to your computer and use it in GitHub Desktop.
Extract column by name from another column split by separator using AWK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We have a file that has a column with multiple additional "subcolumns" separated by the ; character | |
# We want to extract a specific subcolumn from the "merged" column that starts with a specific word | |
# Get 1000 Genomes project VCF | |
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/integrated_sv_map/supporting/GRCh38_positions/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | |
# This VCF has a INFO column (column no. 8) that stores additonal information | |
zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | grep -v "^#"| cut -f8 | head | |
#AC=35;AF=0.00698882;AFR_AF=0;AMR_AF=0.0072;AN=5008;CS=ALU_umary;EAS_AF=0.0069;EUR_AF=0.0189;MEINFO=AluYa4_5,1,223,-;NS=2504;SAS_AF=0.0041;SITEPOST=0.9998;SVLEN=222;SVTYPE=ALU;TSD=null | |
#AC=64;AF=0.0127795;AFR_AF=0.0015;AMR_AF=0;AN=5008;CIEND=-150,150;CIPOS=-150,150;CS=DUP_delly;EAS_AF=0.0595;END=914824;EUR_AF=0.001;IMPRECISE;NS=2504;SAS_AF=0.001;SITEPOST=1;SVTYPE=DUP | |
#AC=3,206;AF=0.00059904,0.0411342;AFR_AF=0,0.0303;AMR_AF=0.0014,0.0259;AN=5008;CS=DUP_gs;EAS_AF=0.001,0.0615;END=820586;EUR_AF=0.001,0.0417;NS=2504;SAS_AF=0,0.045;SITEPOST=1;SVTYPE=CNV | |
#AC=1;AF=0.00019968;AFR_AF=0;AMR_AF=0;AN=5008;CIEND=0,354;CIPOS=-348,0;CS=DEL_union;EAS_AF=0.001;END=806640;EUR_AF=0;MC=BI_GS_DEL1_B4_P0001_3;NS=2504;SAS_AF=0;SVTYPE=DEL | |
#AC=188;AF=0.0375399;AFR_AF=0.0151;AMR_AF=0.062;AN=5008;CIEND=0,403;CIPOS=-385,0;CS=DEL_union;EAS_AF=0.0923;END=833732;EUR_AF=0.001;MC=EM_DL_DEL00011,YL_CN_CDX_8,BI_GS_DEL1_B1_P0001_18,SI_BD_2;NS=2504;SAS_AF=0.0317;SVTYPE=DEL | |
# As you can see, this column has many "subcolumns" separate by ";"; each of the subcolumn starts with some prefix (~name) | |
# These names are not on the same position for all the rows | |
# To get only subcolumn starting with "SVTYPE=", you can use AWK | |
zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | cut -f8 \ | |
| awk -F";" '{for(j=1;j<=NF;j++){if($j~/^SVTYPE+/){print $j}}}' | |
# You can also combine it with sort | uniq -c to get number of subcolumns with the same values | |
zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | cut -f8 \ | |
| awk -F";" '{for(j=1;j<=NF;j++){if($j~/^SVTYPE+/){print $j}}}' | sort | uniq -c | |
# 12743 SVTYPE=ALU | |
# 2899 SVTYPE=CNV | |
# 40922 SVTYPE=DEL | |
# 1231 SVTYPE=DEL_ALU | |
# 1 SVTYPE=DEL_HERV | |
# 56 SVTYPE=DEL_LINE1 | |
# 9 SVTYPE=DEL_SVA | |
# 6006 SVTYPE=DUP | |
# 162 SVTYPE=INS | |
# 786 SVTYPE=INV | |
# 3047 SVTYPE=LINE1 | |
# 835 SVTYPE=SVA | |
# You can also do this with a variable (and sort it) | |
field="SVTYPE" | |
zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | cut -f8 \ | |
| awk -F";" '{for(j=1;j<=NF;j++){if($j~/^'$field'+/){print $j}}}' | awk -F"=" '{print $2}' | sort | uniq -c | sort -k1,1r | |
# 40922 DEL | |
# 12743 ALU | |
# 6006 DUP | |
# 3047 LINE1 | |
# 2899 CNV | |
# 1231 DEL_ALU | |
# 835 SVA | |
# 786 INV | |
# 162 INS | |
# 56 DEL_LINE1 | |
# 9 DEL_SVA | |
# 1 DEL_HERV |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment