Last active
August 29, 2015 14:05
-
-
Save hartzell/340be49b42e20a76459b to your computer and use it in GitHub Desktop.
Build a table of exons that belong to more than one mRNA (exon name, {list of mRNA names}). Written as an excuse to play with common table expressions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- An exercise in Common Table Expessions. There are probably | |
-- faster solutions.... | |
-- Run this against Flybase and discover the exons that belong | |
-- to more than one transcripts. | |
-- exon_name | transcripts | |
-- 14-3-3epsilon:1 | {14-3-3epsilon-RB,14-3-3epsilon-RD} | |
-- 14-3-3epsilon:2 | {14-3-3epsilon-RA,14-3-3epsilon-RC} | |
-- 14-3-3epsilon:3 | {14-3-3epsilon-RA,14-3-3epsilon-RB,14-3-3... | |
with | |
-- collect a bit of cvterm info | |
exon(cvterm_id, name) as | |
(select cvterm_id, name from cvterm where name = 'exon'), | |
mrna(cvterm_id, name) as | |
(select cvterm_id, name from cvterm where name = 'mRNA'), | |
partof(cvterm_id, name) as | |
(select cvterm_id, name from cvterm where name = 'partof'), | |
-- table full of (exon, partof, mrna)-ish tuples | |
features(e_feature_id, e_name, partof_term_name, m_feature_id, m_name) as | |
(select e.feature_id, e.name, | |
partof_term.name, | |
m.feature_id, m.name | |
from feature e, feature m, feature_relationship fr, | |
exon exon_term, mrna mrna_term, partof partof_term | |
where e.type_id = exon_term.cvterm_id | |
and m.type_id = mrna_term.cvterm_id | |
and fr.subject_id = e.feature_id | |
and fr.object_id = m.feature_id | |
and fr.type_id = partof_term.cvterm_id | |
), | |
-- collect exon feature_id's and count of mrna they're partof | |
exons(e_feature_id, count) as | |
(select e_feature_id, count(*) from features group by e_feature_id | |
), | |
-- snag the setset that are partof more than one mrna | |
reused_exons(e_feature_id, count) as | |
(select e_feature_id, count from exons where count > 1 | |
), | |
-- build a table the way we like it (aggresgate the mrna's) | |
output(exon_name, transcripts) as | |
(select f.e_name as exon_name, array_agg(f.m_name) as transcripts | |
from reused_exons r, features f | |
where r.e_feature_id = f.e_feature_id group by f.e_name | |
) | |
select * from output order by exon_name limit 100; | |
-- select count(*) from output; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment