Skip to content

Instantly share code, notes, and snippets.

@dbolser-ebi
Last active May 17, 2017 16:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dbolser-ebi/21ca0a6ec094a6429ded698416d61895 to your computer and use it in GitHub Desktop.
Save dbolser-ebi/21ca0a6ec094a6429ded698416d61895 to your computer and use it in GitHub Desktop.
> desc variation_feature;
...
| consequence_types | set('intergenic_variant','splice_acceptor_variant','splice_donor_variant','stop_lost','coding_sequence_variant','missense_variant','stop_gained','synonymous_variant','frameshift_variant','non_coding_transcript_variant','non_coding_transcript_exon_variant','mature_miRNA_variant','NMD_transcript_variant','5_prime_UTR_variant','3_prime_UTR_variant','incomplete_terminal_codon_variant','intron_variant','splice_region_variant','downstream_gene_variant','upstream_gene_variant','start_lost','stop_retained_variant','inframe_insertion','inframe_deletion','transcript_ablation','transcript_fusion','transcript_amplification','transcript_translocation','TFBS_ablation','TFBS_fusion','TFBS_amplification','TFBS_translocation','regulatory_region_ablation','regulatory_region_fusion','regulatory_region_amplification','regulatory_region_translocation','feature_elongation','feature_truncation','regulatory_region_variant','TF_binding_site_variant','protein_altering_variant') | NO | MUL | intergenic_variant | |
...
## Parsing
DESC variation_feature;
## gives:
1 'intergenic_variant',
2 'splice_acceptor_variant',
3 'splice_donor_variant',
4 'stop_lost',
5 'coding_sequence_variant',
6 'missense_variant',
7 'stop_gained',
8 'synonymous_variant',
9 'frameshift_variant',
10 'non_coding_transcript_variant',
11 'non_coding_transcript_exon_variant',
12 'mature_miRNA_variant',
13 'NMD_transcript_variant',
14 '5_prime_UTR_variant',
...
My attempted query...
SELECT
COUNT(*) AS N,
COUNT(DISTINCT variation_feature_id) AS U,
SUM( IF(consequence_types & 1, 1, 0)) AS intergenic_variant,
SUM( IF(consequence_types & 2, 1, 0)) AS splice_acceptor_variant,
SUM( IF(consequence_types & 4, 1, 0)) AS splice_donor_variant
FROM
transcript_variation
Which gives:
+---------+---------+----------------------+---------------------------+------------------------+
| N | U | intergenic_variant | splice_acceptor_variant | splice_donor_variant |
|---------+---------+----------------------+---------------------------+------------------------|
| 1721493 | 1524687 | 1262 | 1321 | 879 |
+---------+---------+----------------------+---------------------------+------------------------+
1 row in set
Time: 6.353s
SELECT
COUNT(*) AS N,
COUNT(DISTINCT variation_feature_id) AS U,
COUNT(DISTINCT IF(consequence_types & 1, variation_feature_id, NULL)) AS intergenic_variant,
COUNT(DISTINCT IF(consequence_types & 2, variation_feature_id, NULL)) AS splice_acceptor_variant,
COUNT(DISTINCT IF(consequence_types & 4, variation_feature_id, NULL)) AS splice_donor_variant
FROM
transcript_variation;
+---------+---------+----------------------+---------------------------+------------------------+
| N | U | intergenic_variant | splice_acceptor_variant | splice_donor_variant |
|---------+---------+----------------------+---------------------------+------------------------|
| 1721493 | 1524687 | 1245 | 1310 | 867 |
+---------+---------+----------------------+---------------------------+------------------------+
1 row in set
Time: 4.389s
BUT:
> SELECT COUNT(*) AS N, COUNT(DISTINCT variation_feature_id) AS U FROM transcript_variation WHERE FIND_IN_SET('intergenic_variant', consequence_types);
+-----+-----+
| N | U |
|-----+-----|
| 0 | 0 |
+-----+-----+
1 row in set
Time: 2.069s
> SELECT COUNT(*) AS N, COUNT(DISTINCT variation_feature_id) AS U FROM transcript_variation WHERE FIND_IN_SET('splice_acceptor_variant', consequence_types);
+------+------+
| N | U |
|------+------|
| 1262 | 1245 |
+------+------+
1 row in set
Time: 2.227s
> SELECT COUNT(*) AS N, COUNT(DISTINCT variation_feature_id) AS U FROM transcript_variation WHERE FIND_IN_SET('splice_donor_variant', consequence_types);
+------+------+
| N | U |
|------+------|
| 1321 | 1310 |
+------+------+
1 row in set
Time: 2.257s
@dbolser-ebi
Copy link
Author

show create table variation_feature;
+-------------------+----------------+
| Table | Create Table |
|-------------------+----------------|
| variation_feature | CREATE TABLE variation_feature (
variation_feature_id int(10) unsigned NOT NULL AUTO_INCREMENT,
seq_region_id int(10) unsigned NOT NULL,
seq_region_start int(11) NOT NULL,
seq_region_end int(11) NOT NULL,
seq_region_strand tinyint(4) NOT NULL,
variation_id int(10) unsigned NOT NULL,
allele_string varchar(50000) DEFAULT NULL,
variation_name varchar(255) DEFAULT NULL,
map_weight int(11) NOT NULL,
flags set('genotyped') DEFAULT NULL,
source_id int(10) unsigned NOT NULL,
consequence_types set('intergenic_variant','splice_acceptor_variant','splice_donor_variant','stop_lost','coding_sequence_variant','missense_variant','stop_gained','synonymous_variant','frameshift_variant','non_coding_transcript_variant','non_coding_transcript_exon_variant','mature_miRNA_variant','NMD_transcript_variant','5_prime_UTR_variant','3_prime_UTR_variant','incomplete_terminal_codon_variant','intron_variant','splice_region_variant','downstream_gene_variant','upstream_gene_variant','start_lost','stop_retained_variant','inframe_insertion','inframe_deletion','transcript_ablation','transcript_fusion','transcript_amplification','transcript_translocation','TFBS_ablation','TFBS_fusion','TFBS_amplification','TFBS_translocation','regulatory_region_ablation','regulatory_region_fusion','regulatory_region_amplification','regulatory_region_translocation','feature_elongation','feature_truncation','regulatory_region_variant','TF_binding_site_variant','protein_altering_variant') NOT NULL DEFAULT 'intergenic_variant',
variation_set_id set('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64') NOT NULL DEFAULT '',
class_attrib_id int(10) unsigned DEFAULT '0',
somatic tinyint(1) NOT NULL DEFAULT '0',
minor_allele varchar(50) DEFAULT NULL,
minor_allele_freq float DEFAULT NULL,
minor_allele_count int(10) unsigned DEFAULT NULL,
alignment_quality double DEFAULT NULL,
clinical_significance set('uncertain significance','not provided','benign','likely benign','likely pathogenic','pathogenic','drug response','histocompatibility','other','confers sensitivity','risk factor','association','protective') DEFAULT NULL,
evidence_attribs set('367','368','369','370','371','372','418','421') DEFAULT NULL,
display int(1) DEFAULT '1',
PRIMARY KEY (variation_feature_id),
KEY variation_idx (variation_id),
KEY pos_idx (seq_region_id,seq_region_start,seq_region_end),
KEY variation_set_idx (variation_set_id),
KEY consequence_type_idx (consequence_types),
KEY source_idx (source_id)
) ENGINE=MyISAM AUTO_INCREMENT=5230594 DEFAULT CHARSET=latin1 CHECKSUM=1 |
+-------------------+----------------+
1 row in set
Time: 0.002s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment