Skip to content

Instantly share code, notes, and snippets.

@tice0-2
Created March 23, 2020 04:15
Show Gist options
  • Save tice0-2/e5853ce3299ee5f2aa9a6c3ce7dc2760 to your computer and use it in GitHub Desktop.
Save tice0-2/e5853ce3299ee5f2aa9a6c3ce7dc2760 to your computer and use it in GitHub Desktop.

repeat_master (a play on the name RepeatMasker; I never thought anyone except me would use it, so sorry about the name, feel free to rename it...) is a simple wrapper for querying the mouse repeat database with RepeatMasker.

Simple Usage

from repeat_master import repeat_master
repeat_master(list of sequences) # returns an iterator with one result per sequence (wrap it in list() to get a list); each result is the highest-score entry from RepeatMasker for that sequence, or None if it has no hit

For example:

repeat_master(['AAAATACCTTGGCATGACTCTAACTAAGGAAGTGAAAGATCTGTA'])
# => [[25, 2.2, 0.0, 0.0, 0, 1, 45, '(0)', '+', 'L1MdF_III', 'LINE/L1', '4378', '4422', '(2154)', 1]]
from os import system
import os
import io
# Absolute path of the RepeatMasker executable that repeat_master_ launches.
# This is a machine-specific location; adjust it for the local installation.
RM_PATH = '/csbiohome02/mcmillan/Notebooks/Charles/yet-summer-again/repeat-masker/RepeatMasker/RepeatMasker'
# Names for the 15 fields of one parsed result row, in the same order as the
# 15 converters applied to each row in repeat_master_.
# NOTE(review): 'begin', 'end' and 'left' each appear twice -- presumably the
# first set refers to the query and the second to the repeat; confirm against
# the RepeatMasker output format before using these as dictionary keys.
COLUMNS = ['score','div', 'del', 'ins', 'query', 'begin', 'end', 'left', 'sgn', 'repeat', 'family', 'begin', 'end', 'left', 'id']
def parse_repeatmasker(fp):
    """Yield the whitespace-separated fields of each data row in ``fp``.

    ``fp`` is any iterable of text lines (typically an open file). The
    first three lines are treated as header and skipped. Lines that
    contain only whitespace are skipped as well, so callers never receive
    an empty field list.

    Yields:
        list of str: the tokens of one row, in file order.
    """
    for line_no, line in enumerate(fp):
        # Lines 0, 1 and 2 are the header block.
        if line_no <= 2:
            continue
        fields = line.split()
        # A blank line splits to [], which carries no data and would break
        # consumers that index into the row.
        if not fields:
            continue
        yield fields
def identity(x):
    """Return ``x`` unchanged."""
    return x
def apply_elements(f, coll):
    """Lazily apply the i-th callable of ``f`` to the i-th item of ``coll``.

    Returns a generator; pairing stops at the shorter of the two inputs.
    """
    paired = zip(f, coll)
    return (convert(value) for convert, value in paired)
def repeat_master_(seqs):
    """Run RepeatMasker (species ``mouse``) on one batch of sequences.

    The sequences are written to ``in.fasta`` in the current working
    directory, each under a header that is its zero-padded index in
    ``seqs``. RepeatMasker is then run on that file and its
    ``in.fasta.out`` report is parsed.

    Because fixed file names in the working directory are used, two calls
    must not run concurrently in the same directory.

    Args:
        seqs: a sized sequence (``len`` is called on it) of sequence strings.

    Returns:
        A list with one entry per input sequence, in input order. Each
        entry is the highest-score report row for that sequence converted
        to ``[int, float, float, float, int, int, int, str, str, str, str,
        str, str, str, int]``, or ``None`` when the report has no row for
        it. If several rows tie on score, the one appearing last in the
        report is kept.
    """
    import subprocess

    if len(seqs) == 0:
        # Nothing to mask; do not launch RepeatMasker on an empty FASTA file.
        return []

    # Remove any report left by a previous run so it cannot be mistaken for
    # the result of this one.
    try:
        os.remove('in.fasta.out')
    except OSError:
        pass
    system('rm -rf *.RMoutput')

    # The header of each record is the sequence's index in this batch; the
    # report's query column is converted back to that index below.
    fasta = ''.join('>%06d\n%s\n' % (i, s) for i, s in enumerate(seqs))
    with open('in.fasta', 'w') as fp:
        fp.write(fasta)

    # Pass the arguments as a list (no shell), so a RM_PATH containing
    # spaces or shell metacharacters is handled correctly. Only stdout is
    # discarded; stderr stays visible.
    with open(os.devnull, 'w') as devnull:
        subprocess.call([RM_PATH, '-q', '-species', 'mouse', 'in.fasta'],
                        stdout=devnull)

    dtypes = [int, float, float, float, int, int, int,
              str, str, str, str, str, str, str, int]
    res = [None] * len(seqs)
    with open('in.fasta.out') as fp:
        for toks in parse_repeatmasker(fp):
            # Skip anything that is not a full data row (blank lines,
            # informational messages).
            if len(toks) < len(dtypes):
                continue
            r = list(apply_elements(dtypes, toks))
            best = res[r[4]]
            # r[0] is the score column: keep the best-scoring row per query.
            if best is None or r[0] >= best[0]:
                res[r[4]] = r
    return res
from itertools import chain, imap
def flatmap(f, items):
    """Lazily apply ``f`` to each item and chain the resulting iterables.

    ``f`` must return an iterable for every item. Nothing is evaluated
    until the returned iterator is consumed. A generator expression is
    used for the mapping step so the function does not depend on the
    Python-2-only ``itertools.imap``.
    """
    return chain.from_iterable(f(item) for item in items)
def groups_in(it, n):
    """Yield consecutive lists of ``n`` items taken from ``it``.

    The final list may be shorter when the items do not divide evenly;
    an empty input yields nothing.
    """
    chunk = []
    for item in it:
        chunk.append(item)
        if len(chunk) != n:
            continue
        # Chunk is full: hand it out and start a fresh list.
        yield chunk
        chunk = []
    # Flush whatever is left over after the input is exhausted.
    if len(chunk) > 0:
        yield chunk
def batch_map(fn, seq, n):
    """Apply ``fn`` to ``seq`` in batches of ``n`` and chain the results.

    ``seq`` is cut into lists of up to ``n`` items, ``fn`` is called once
    per list, and the iterables it returns are concatenated.
    """
    batches = groups_in(seq, n)
    return flatmap(fn, batches)
def repeat_master(seqs, batch_size=5000):
    """Run RepeatMasker over ``seqs`` in batches and chain the results.

    Args:
        seqs: iterable of sequence strings.
        batch_size: maximum number of sequences handed to a single
            RepeatMasker run. Defaults to 5000.

    Returns:
        A lazy iterator with one result per input sequence, in input
        order (see ``repeat_master_`` for the shape of a result). Each
        batch is only processed once the iterator reaches it, so wrap
        the call in ``list()`` to run everything immediately.
    """
    return batch_map(repeat_master_, seqs, batch_size)
if __name__ == "__main__":
    # Smoke test: run the same sequence through the single-batch helper and
    # through the batching wrapper, printing both results.
    demo_seqs = ['AAAATACCTTGGCATGACTCTAACTAAGGAAGTGAAAGATCTGTA']
    print(repeat_master_(demo_seqs))
    # repeat_master returns an iterator, so materialize it before printing.
    print(list(repeat_master(demo_seqs)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment