hartzell/exons-and-their-mrnas.sql

## exons-and-their-mrnas.sql
-- An exercise in Common Table Expessions.  There are probably
-- faster solutions....
-- Run this against Flybase and discover the exons that belong
-- to more than one transcripts.
--    exon_name    |    transcripts
-- 14-3-3epsilon:1 | {14-3-3epsilon-RB,14-3-3epsilon-RD}
-- 14-3-3epsilon:2 | {14-3-3epsilon-RA,14-3-3epsilon-RC}
-- 14-3-3epsilon:3 | {14-3-3epsilon-RA,14-3-3epsilon-RB,14-3-3...
with
  -- collect a bit of cvterm info
  exon(cvterm_id, name) as
    (select cvterm_id, name from cvterm where name = 'exon'),
  mrna(cvterm_id, name) as
    (select cvterm_id, name from cvterm where name = 'mRNA'),
  partof(cvterm_id, name) as
    (select cvterm_id, name from cvterm where name = 'partof'),
  -- table full of (exon, partof, mrna)-ish tuples
  features(e_feature_id, e_name, partof_term_name, m_feature_id, m_name) as
    (select e.feature_id, e.name,
            partof_term.name,
            m.feature_id, m.name
     from feature e, feature m, feature_relationship fr,
          exon exon_term, mrna mrna_term, partof partof_term
     where e.type_id = exon_term.cvterm_id
     and m.type_id = mrna_term.cvterm_id
     and fr.subject_id = e.feature_id
     and fr.object_id = m.feature_id
     and fr.type_id = partof_term.cvterm_id
    ),
  -- collect exon feature_id's and count of mrna they're partof
  exons(e_feature_id, count) as
    (select e_feature_id, count(*) from features group by e_feature_id
    ),
  -- snag the setset that are partof more than one mrna
  reused_exons(e_feature_id, count) as
    (select e_feature_id, count from exons where count > 1
    ),
  -- build a table the way we like it (aggresgate the mrna's)
  output(exon_name, transcripts) as
    (select f.e_name as exon_name, array_agg(f.m_name) as transcripts
     from reused_exons r, features f
     where r.e_feature_id = f.e_feature_id group by f.e_name
    )
select * from output order by exon_name limit 100;
--  select count(*) from output;
	-- An exercise in Common Table Expessions. There are probably
	-- faster solutions....
	-- Run this against Flybase and discover the exons that belong
	-- to more than one transcripts.
	-- exon_name \| transcripts
	-- 14-3-3epsilon:1 \| {14-3-3epsilon-RB,14-3-3epsilon-RD}
	-- 14-3-3epsilon:2 \| {14-3-3epsilon-RA,14-3-3epsilon-RC}
	-- 14-3-3epsilon:3 \| {14-3-3epsilon-RA,14-3-3epsilon-RB,14-3-3...
	with
	-- collect a bit of cvterm info
	exon(cvterm_id, name) as
	(select cvterm_id, name from cvterm where name = 'exon'),
	mrna(cvterm_id, name) as
	(select cvterm_id, name from cvterm where name = 'mRNA'),
	partof(cvterm_id, name) as
	(select cvterm_id, name from cvterm where name = 'partof'),
	-- table full of (exon, partof, mrna)-ish tuples
	features(e_feature_id, e_name, partof_term_name, m_feature_id, m_name) as
	(select e.feature_id, e.name,
	partof_term.name,
	m.feature_id, m.name
	from feature e, feature m, feature_relationship fr,
	exon exon_term, mrna mrna_term, partof partof_term
	where e.type_id = exon_term.cvterm_id
	and m.type_id = mrna_term.cvterm_id
	and fr.subject_id = e.feature_id
	and fr.object_id = m.feature_id
	and fr.type_id = partof_term.cvterm_id
	),
	-- collect exon feature_id's and count of mrna they're partof
	exons(e_feature_id, count) as
	(select e_feature_id, count(*) from features group by e_feature_id
	),
	-- snag the setset that are partof more than one mrna
	reused_exons(e_feature_id, count) as
	(select e_feature_id, count from exons where count > 1
	),
	-- build a table the way we like it (aggresgate the mrna's)
	output(exon_name, transcripts) as
	(select f.e_name as exon_name, array_agg(f.m_name) as transcripts
	from reused_exons r, features f
	where r.e_feature_id = f.e_feature_id group by f.e_name
	)
	select * from output order by exon_name limit 100;
	-- select count(*) from output;