Skip to content

Instantly share code, notes, and snippets.

@finswimmer
finswimmer / gff_subset.py
Last active May 16, 2018 12:31
Answer to Biostars question 314928
from BCBio import GFF
in_file = "test.gff"
out_file = "test.parse.gff3"
limit_info = {
'gff_id': ["1"], # chromosome 1
'gff_type': ["ncRNA"] # get non coding RNA
}
with open(out_file, "w") as out_handle, open(in_file) as in_handle:
import gzip
reads = 0
bases = 0
with gzip.open('your.fastq.gz', 'rb') as read:
for id in read:
seq = next(read)
reads += 1
bases += len(seq.strip())
import requests
import sys
import argparse
def set_args():
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-t", "--transcript", help="ensembl's transcript id")
group.add_argument("-l", "--list", help="file which contains list of transcript id's")
import sys
from pysam import VariantFile
vcf_in = VariantFile(sys.argv[1])
print(str(vcf_in.header).strip())
for rec in vcf_in:
for key, value in rec.info.items():
if vcf_in.header.info[key].number not in ("A", "R", "G", ".", "0"):
@finswimmer
finswimmer / one2three.py
Last active April 8, 2021 02:58
Replace amino acid 1-letter codes with 3-letter codes in ANNOVAR output
import sys
def replace(text, replacement):
new = ''
for c in text:
try:
new += replacement[c]
except KeyError:
@finswimmer
finswimmer / join.py
Last active June 16, 2018 15:03
Join multiple pandas dataframes
import glob
import sys
import pandas
def read_filenames(names):
for arg in names:
if "*" in arg:
for file in glob.glob(arg):
yield file
import sys
with open(sys.argv[1], "r") as gtf:
header = next(gtf)
last_gene = {
"gene": None,
"chr": None,
"end": None,
}
import sys
from Bio import SeqIO
def mismatch(seq1, seq2):
index = []
for i, (s1, s2) in enumerate(zip(seq1, seq2)):
if s1 != s2:
index.append(i)
import sys
from Bio import SeqIO
def mismatch(seq1, seq2):
index = []
for i, (s1, s2) in enumerate(zip(seq1, seq2)):
if s1 != s2:
index.append(i)
@finswimmer
finswimmer / prefixes.txt
Created June 30, 2018 05:35
Ensembl stable ID species prefixes
ENSPFO Poecilia formosa (Amazon molly)
ENSJJA Jaculus jaculus (Lesser Egyptian jerboa)
ENSPCO Propithecus coquereli (Coquerel's sifaka)
ENSNGA Nannospalax galili (Upper Galilee mountains blind mole rat)
ENSMFA Macaca fascicularis (Crab-eating macaque)
ENSMIC Microcebus murinus (Mouse Lemur)
MGP_CAROLIEiJ_ Mus caroli (Ryukyu mouse)
ENSFAL Ficedula albicollis (Flycatcher)
ENSCLA Chinchilla lanigera (Long-tailed chinchilla)
ENSPEM Peromyscus maniculatus bairdii (Northern American deer mouse)