timyates/code.R Secret

## code.R
# Find the probe hit locations relative to the 5' end of a translated transcript
#
# Things to note:
#
#   1) This will not return probes which hit across exon junctions.
#   2) The 'sequence' column will always return the sequence of the
#      probe, not the transcript sequence.  This should especially be
#      when looking at the reverse strand.

transcript.to.translatedprobes = function( transcript.ids ) {
  # Get all the exons for this transcript
  exons = transcript.to.exon( transcript.ids, as.vector=F )
  sapply( split( exons[['stable_id']], exons[['IN1']] ), function( exons ) {
    exons = exon.details( exons )
    fwd.strand = all( exons$strand == 1 )
    translated.width = sum( width( exons ) )
    # Go along each exon
    probes = sapply( seq_along( exons$stable_id ), function( eidx ) {
      # Get the details for this exon
      exon = exons[ eidx, ]

      # Find the probes which hit the same region as the exon
      probes = probes.in.range( as.character( space( exon ) ),
                                start( exon ),
                                end( exon ),
                                exon$strand,
                                as.vector=F )

      if( !is.null( probes ) ) {
        # Remove those that hang off the ends of the exon
        probes = probes[ start( probes ) >= start( exon ), ]
        probes = probes[ end( probes ) <= end( exon ), ]
      }

      # Return the probes for this exon
      probes
    } )

    # Then, change these offsets so that they are relative to the start of the
    # transcript
    exon.offset = 0
    for( idx in seq_along( exons$stable_id ) ) {
      # For the reverse strand, go through the list backwards
      eidx = if( fwd.strand ) idx else length( exons$stable_id ) - idx + 1
      if( !is.null( probes[[ eidx ]] ) ) {
        ranges( probes[[ eidx ]] ) = shift( ranges( probes[[ eidx ]] ),
                                           -start( exons[ eidx, ] ) + exon.offset )
        # If on the reverse strand, change offsets to 5' from 3'
        if( !fwd.strand ) {
          ranges( probes[[ eidx ]] ) = IRanges( start=translated.width - end( probes[[ eidx ]] ),
                                                end=translated.width - start( probes[[ eidx ]] ) )
          # Re-sort the probes by start
          probes[[ eidx ]] = probes[[ eidx ]][ order( start( probes[[ eidx ]] ) ), ]
        }
      }
      exon.offset = exon.offset + width( exons[ eidx, ] )
    }

    # Remove all NULLs and combine
    probes = do.call( 'rbind', probes[ !sapply( probes, is.null ) ] )
  } )
}

## results.R
> transcript.to.translatedprobes( c( 'ENST00000371953', 'ENST00000462694', 'ENST00000329958' ) )
$ENST00000329958
RangedData with 8 rows and 3 value columns across 1 space
        space     ranges |                  sequence probe_hit_count    strand
  <character>  <IRanges> |               <character>       <numeric> <integer>
1          10 [ 69,  93] | CCTCCCGGGTTCAAGCAATTCTCCT               2        -1
2          10 [ 70,  94] | CTCCCGGGTTCAAGCAATTCTCCTG               2        -1
3          10 [ 71,  95] | TCCCGGGTTCAAGCAATTCTCCTGC               2        -1
4          10 [ 75,  99] | GGGTTCAAGCAATTCTCCTGCCTCA               2        -1
5          10 [ 87, 111] | TTCTCCTGCCTCAGCCTCCCGAGTA               2        -1
6          10 [ 88, 112] | TCTCCTGCCTCAGCCTCCCGAGTAG               2        -1
7          10 [ 90, 114] | TCCTGCCTCAGCCTCCCGAGTAGCT               2        -1
8          10 [199, 223] | TTGAACTCCTGACCTCAAGTGATCC               2        -1

$ENST00000371953
RangedData with 110 rows and 3 value columns across 1 space
          space       ranges   |                  sequence probe_hit_count    strand
    <character>    <IRanges>   |               <character>       <numeric> <integer>
1            10   [  7,  31]   | TCAGACTCGAGTCAGTGACACTGCT               1         1
2            10   [ 29,  53]   | GCTCAACGCACCCATCTCAGCTTTC               1         1
3            10   [ 46,  70]   | CAGCTTTCATCATCAGTCCTCCACC               1         1
4            10   [ 90, 114]   | CCCTGCCTCCGGCTGGGTTTCTGGG               1         1
5            10   [444, 468]   | TGATGTGGCGGGACTCTTTATGCGC               1         1
6            10   [463, 487]   | ATGCGCTGCGGCAGGATACGCGCTC               1         1
7            10   [492, 516]   | CTGGGACGCGACTGCGCTCAGTTCT               1         1
8            10   [545, 569]   | AAGTTTGAGAGTTGAGCCGCTGTGA               1         1
9            10   [714, 738]   | CTCCTCTTCGTCTTTTCTAACCGTG               1         1
...         ...          ... ...                       ...             ...       ...
102          10 [5453, 5477]   | TGGTTGCCACAAAGTGCCTCGTTTA               1         1
103          10 [6137, 6161]   | AGCGTTCTAGTCCTGAAAAAGTCCA               1         1
104          10 [7445, 7469]   | CCTGGAGACATAGCCATCGTTAATA               1         1
105          10 [8061, 8085]   | GCTCTGAAAATTACCGCGTTTAGTA               1         1
106          10 [8624, 8648]   | TTGCCCTCATGTCCTAATGTGCAGT               1         1
107          10 [8924, 8948]   | TATATCAAGTTCCATTTGGCTACTG               1         1
108          10 [8939, 8963]   | TTGGCTACTGATGGACAAAAAATAG               1         1
109          10 [8961, 8985]   | TAGAAATGCCTTCCTATGGAGAGTA               1         1
110          10 [8972, 8996]   | TCCTATGGAGAGTATTTTTCCTTTA               1         1

$ENST00000462694
RangedData with 11 rows and 3 value columns across 1 space
         space     ranges |                  sequence probe_hit_count    strand
   <character>  <IRanges> |               <character>       <numeric> <integer>
1           10 [ 19,  43] | AGAGATCGTTAGCAGAAACAAAAGG               2         1
2           10 [ 33,  57] | GAAACAAAAGGAGATATCAAGAGGA               2         1
3           10 [ 46,  70] | ATATCAAGAGGATGGATTCGACTTA               2         1
4           10 [ 50,  74] | CAAGAGGATGGATTCGACTTAGACT               2         1
5           10 [ 56,  80] | GATGGATTCGACTTAGACTTGACCT               2         1
6           10 [121, 145] | AAGACTTGAAGGCGTATACAGGAAC               2         1
7           10 [137, 161] | TACAGGAACAATATTGATGATGTAG               2         1
8           10 [255, 279] | AGATACTTTGTGATGTAAACTATTA               1         1
9           10 [290, 314] | CTATAATCATTTTTTGGCTTACCGT               1         1
10          10 [305, 329] | GGCTTACCGTACCTAATGGACTTCA               1         1
11          10 [331, 355] | GGGGATACAGTTCATTTGATAAGAA               1         1
	# Find the probe hit locations relative to the 5' end of a translated transcript
	#
	# Things to note:
	#
	# 1) This will not return probes which hit across exon junctions.
	# 2) The 'sequence' column will always return the sequence of the
	# probe, not the transcript sequence. This should especially be
	# when looking at the reverse strand.

	transcript.to.translatedprobes = function( transcript.ids ) {
	# Get all the exons for this transcript
	exons = transcript.to.exon( transcript.ids, as.vector=F )
	sapply( split( exons[['stable_id']], exons[['IN1']] ), function( exons ) {
	exons = exon.details( exons )
	fwd.strand = all( exons$strand == 1 )
	translated.width = sum( width( exons ) )
	# Go along each exon
	probes = sapply( seq_along( exons$stable_id ), function( eidx ) {
	# Get the details for this exon
	exon = exons[ eidx, ]

	# Find the probes which hit the same region as the exon
	probes = probes.in.range( as.character( space( exon ) ),
	start( exon ),
	end( exon ),
	exon$strand,
	as.vector=F )

	if( !is.null( probes ) ) {
	# Remove those that hang off the ends of the exon
	probes = probes[ start( probes ) >= start( exon ), ]
	probes = probes[ end( probes ) <= end( exon ), ]
	}

	# Return the probes for this exon
	probes
	} )

	# Then, change these offsets so that they are relative to the start of the
	# transcript
	exon.offset = 0
	for( idx in seq_along( exons$stable_id ) ) {
	# For the reverse strand, go through the list backwards
	eidx = if( fwd.strand ) idx else length( exons$stable_id ) - idx + 1
	if( !is.null( probes[[ eidx ]] ) ) {
	ranges( probes[[ eidx ]] ) = shift( ranges( probes[[ eidx ]] ),
	-start( exons[ eidx, ] ) + exon.offset )
	# If on the reverse strand, change offsets to 5' from 3'
	if( !fwd.strand ) {
	ranges( probes[[ eidx ]] ) = IRanges( start=translated.width - end( probes[[ eidx ]] ),
	end=translated.width - start( probes[[ eidx ]] ) )
	# Re-sort the probes by start
	probes[[ eidx ]] = probes[[ eidx ]][ order( start( probes[[ eidx ]] ) ), ]
	}
	}
	exon.offset = exon.offset + width( exons[ eidx, ] )
	}

	# Remove all NULLs and combine
	probes = do.call( 'rbind', probes[ !sapply( probes, is.null ) ] )
	} )
	}
	> transcript.to.translatedprobes( c( 'ENST00000371953', 'ENST00000462694', 'ENST00000329958' ) )
	$ENST00000329958
	RangedData with 8 rows and 3 value columns across 1 space
	space ranges \| sequence probe_hit_count strand
	<character> <IRanges> \| <character> <numeric> <integer>
	1 10 [ 69, 93] \| CCTCCCGGGTTCAAGCAATTCTCCT 2 -1
	2 10 [ 70, 94] \| CTCCCGGGTTCAAGCAATTCTCCTG 2 -1
	3 10 [ 71, 95] \| TCCCGGGTTCAAGCAATTCTCCTGC 2 -1
	4 10 [ 75, 99] \| GGGTTCAAGCAATTCTCCTGCCTCA 2 -1
	5 10 [ 87, 111] \| TTCTCCTGCCTCAGCCTCCCGAGTA 2 -1
	6 10 [ 88, 112] \| TCTCCTGCCTCAGCCTCCCGAGTAG 2 -1
	7 10 [ 90, 114] \| TCCTGCCTCAGCCTCCCGAGTAGCT 2 -1
	8 10 [199, 223] \| TTGAACTCCTGACCTCAAGTGATCC 2 -1

	$ENST00000371953
	RangedData with 110 rows and 3 value columns across 1 space
	space ranges \| sequence probe_hit_count strand
	<character> <IRanges> \| <character> <numeric> <integer>
	1 10 [ 7, 31] \| TCAGACTCGAGTCAGTGACACTGCT 1 1
	2 10 [ 29, 53] \| GCTCAACGCACCCATCTCAGCTTTC 1 1
	3 10 [ 46, 70] \| CAGCTTTCATCATCAGTCCTCCACC 1 1
	4 10 [ 90, 114] \| CCCTGCCTCCGGCTGGGTTTCTGGG 1 1
	5 10 [444, 468] \| TGATGTGGCGGGACTCTTTATGCGC 1 1
	6 10 [463, 487] \| ATGCGCTGCGGCAGGATACGCGCTC 1 1
	7 10 [492, 516] \| CTGGGACGCGACTGCGCTCAGTTCT 1 1
	8 10 [545, 569] \| AAGTTTGAGAGTTGAGCCGCTGTGA 1 1
	9 10 [714, 738] \| CTCCTCTTCGTCTTTTCTAACCGTG 1 1
	... ... ... ... ... ... ...
	102 10 [5453, 5477] \| TGGTTGCCACAAAGTGCCTCGTTTA 1 1
	103 10 [6137, 6161] \| AGCGTTCTAGTCCTGAAAAAGTCCA 1 1
	104 10 [7445, 7469] \| CCTGGAGACATAGCCATCGTTAATA 1 1
	105 10 [8061, 8085] \| GCTCTGAAAATTACCGCGTTTAGTA 1 1
	106 10 [8624, 8648] \| TTGCCCTCATGTCCTAATGTGCAGT 1 1
	107 10 [8924, 8948] \| TATATCAAGTTCCATTTGGCTACTG 1 1
	108 10 [8939, 8963] \| TTGGCTACTGATGGACAAAAAATAG 1 1
	109 10 [8961, 8985] \| TAGAAATGCCTTCCTATGGAGAGTA 1 1
	110 10 [8972, 8996] \| TCCTATGGAGAGTATTTTTCCTTTA 1 1

	$ENST00000462694
	RangedData with 11 rows and 3 value columns across 1 space
	space ranges \| sequence probe_hit_count strand
	<character> <IRanges> \| <character> <numeric> <integer>
	1 10 [ 19, 43] \| AGAGATCGTTAGCAGAAACAAAAGG 2 1
	2 10 [ 33, 57] \| GAAACAAAAGGAGATATCAAGAGGA 2 1
	3 10 [ 46, 70] \| ATATCAAGAGGATGGATTCGACTTA 2 1
	4 10 [ 50, 74] \| CAAGAGGATGGATTCGACTTAGACT 2 1
	5 10 [ 56, 80] \| GATGGATTCGACTTAGACTTGACCT 2 1
	6 10 [121, 145] \| AAGACTTGAAGGCGTATACAGGAAC 2 1
	7 10 [137, 161] \| TACAGGAACAATATTGATGATGTAG 2 1
	8 10 [255, 279] \| AGATACTTTGTGATGTAAACTATTA 1 1
	9 10 [290, 314] \| CTATAATCATTTTTTGGCTTACCGT 1 1
	10 10 [305, 329] \| GGCTTACCGTACCTAATGGACTTCA 1 1
	11 10 [331, 355] \| GGGGATACAGTTCATTTGATAAGAA 1 1