Skip to content

Instantly share code, notes, and snippets.

@waldeyr
Last active December 12, 2023 09:56
Show Gist options
  • Save waldeyr/acd853b53ba238878ff935e5227a2430 to your computer and use it in GitHub Desktop.
Save waldeyr/acd853b53ba238878ff935e5227a2430 to your computer and use it in GitHub Desktop.
library(rtracklayer)
bed.ranges <- import.bed('file.bed')
export.gff3(bed.ranges,'file.gff3')
# Just some small codes to help myself and others...

Blast Runnig for selected taxonomy in NCBI databases

Download the database

Download the program "update_blastdb.pl"

wget https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/c%2B%2B/src/app/blast/update_blastdb.pl?view=co -O update_blastdb.pl

sudo mv update_blastdb.pl /usr/local/bin/

Download a blastdb from NCBI

Look at ftp://ftp.ncbi.nlm.nih.gov/blast/db/README to identify the database you want.

Let's say you are interested in the ITS_RefSeq_Fungi, which is the NCBI Internal transcribed spacer region (ITS) from Fungi type and reference material.

update_blastdb.pl ITS_RefSeq_Fungi

Limiting blast search to a taxonomic range

Install E-utilities

cd ~ && sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"

Locating the taxnonomic id

get_species_taxids.sh -n Pleurotus

  • It will show the NCBI taxonomy ID for the Pleurotus genus which is 5320.

Creating a file with the taxonomy of interest

get_species_taxids.sh -t 5320 > Pleurotus_5320.txids

Runnig Blast over ITS database limited to the Pleurotus taxonomy

blastn -db ITS_RefSeq_Fungi -query query.fasta -taxidlist Pleurotus_5320.txids -outfmt 7 -out OUTPUT.tab

import os
from Bio import SeqIO
import sys
'''
This script collapse reads in fastq format to fit the mirDeep2 input format.
It takes three arguments: the input reads, the output formated fasta file, a prefix of size=3 to name the experiment.
It has as dependencies the package Biopython version 1.78.
How touse it?
`python3 fastq_to_fasta.py reads.fastq read.fasta EXP`
@author Waldeyr Mendes Cordeiro da Silva Dez-2020
Live Long and Prosper!
'''
def clear_sequence(_sequence):
sequence = ''
_sequence = _sequence.upper()
for base in _sequence:
if base in 'ATCGN':
sequence += base
else:
sequence += 'N'
return sequence
fastqFile = sys.argv[1]
fastaCollapsedFile = sys.argv[2]
readPrefix = str(sys.argv[3])
try:
if os.path.exists(fastqFile):
print("Parsing the fastq file...")
records = SeqIO.parse(open(fastqFile), 'fastq')
list_read_seq = []
sequence_collapse = {}
print("Collapsing reads...")
for record in records:
list_read_seq.append(str(record.seq).strip())
if str(record.seq) not in sequence_collapse:
sequence_collapse[str(record.seq)] = 0
sequence_collapse[str(record.seq)] += 1
print("Writing the mirDeep2 input file...")
seqUniqueNumber = 1
with open(fastaCollapsedFile, 'w') as writer:
for seq, number in sequence_collapse.items():
if len(seq) > 17:
# id = getMD5(seq)
seq = clear_sequence(seq)
writer.write(f">{readPrefix}_{seqUniqueNumber}_x{number}\n{seq}\n")
seqUniqueNumber = seqUniqueNumber + 1
print("Done :-)")
else:
os.system(f'The file {fastqFile} was not found.')
except IOError:
print('An exception occurred')
conda env export | grep -v "^prefix: " > environment.yml
# Generating a Genbank File using Biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation
sequences = ["GGGGAAAATTTTAAAACCCCAAAA","AAAATTTTAAAACCCCAAAAGGGG"]
listOfseqrecords = []
for sequence in sequences:
seq = Seq(sequence, IUPAC.unambiguous_dna) # unambiguous_dna = only ACGT
seqRecord = SeqRecord(
seq,
id='123', # sequence ID
name='EXAMPLE', # Unique name (no blank spaces)
description='Annotation for the sequence', # anotacao da enzima
annotations= {"molecule_type":"cDNA", "date":"25-MAR-2019"}
)
qualifiers = {
"EC_number":"9.9.9.9", # enzyme EC
"info":"value" # other data
}
feature = SeqFeature(
FeatureLocation( start=0, end=len(sequence) ),
type='cDNA',
location_operator='',
strand=0, id=None,
qualifiers=qualifiers,
sub_features=None,
ref=None,
ref_db=None
)
seqRecord.features.append(feature)
listOfseqrecords.append(seqRecord)
#Open the output file to write
outputFile = open('example.gb', 'w')
for seqRecord in listOfseqrecords:
SeqIO.write(seqRecord, outputFile, 'genbank')
``grep ">" myfile.fasta | awk '{print substr($1,2)" "$2}' > temp.txt``
``grep -A 1 -e "derired_pattern" file.fa > desired_sequences.fa``
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import AlignIO
from Bio.Phylo.Consensus import *
# Ler o alinhamento
msa = AlignIO.read('meu_alinhamento_multiplo.phy', 'phylip')
print msa
#####################################
# Neighbor Joinning SEM Booststrap
#####################################
# Calcular a matriz de distancias para o alinhamento
calculator = DistanceCalculator('identity') #pode ser 'blosum62' depende do que vc alinhou
matriz_distancia = calculator.get_distance(msa)
print('\nMatriz de distancia\n')
print(matriz_distancia)
# Construir a arvore com Neighbor Joinning
constructor = DistanceTreeConstructor()
tree = constructor.nj(matriz_distancia)
# Imprimir a arvore no terminal
print('\nArvore filogenetica com NJ\n')
Phylo.draw_ascii(tree)
#####################################
# COM Booststrap
#####################################
#Bootstrap alinhamento
msa_bootstrap = bootstrap(msa, 100)
constructor = DistanceTreeConstructor(calculator)
trees = bootstrap_trees(msa, 100, constructor.nj(matriz_distancia))
consensus_tree = bootstrap_consensus(msa, 100, constructor, majority_consensus)
# Imprimir a arvore no terminal
print('\nArvore filogenetica com NJ e Booststrap\n')
Phylo.draw_ascii(consensus_tree)
from random import choice
DNA=""
length = 5000
for count in range(length):
DNA+=choice("ACGT")
print(DNA)
from Bio import SeqIO
import re
DDxxD = re.compile("(DD)[^D](Y|F)(D|R|G|H)")
NSE_DTE = re.compile("(D|N)D.{2}S.{3}E")
# 100% aa conservados
inter100 = re.compile(".{6}E.{3}F.{6}W.{8}P.{6,8}Y.{25,26}K.{12}E.{2}W.{10,13}E.{32,33}W.{4}P.{10}R.{2,3}")
path = "testFile.fasta"
# path = "betaCaryophyllene.fasta"
def getFastaFile(path: str):
handle = open(path, "rU")
return SeqIO.parse(handle, "fasta")
def betaCaryophyllenePredict(sequence: SeqIO):
for record in sequence:
matchDDxxD = DDxxD.search(str(record.seq))
matchNSE_DTE = NSE_DTE.search(str(record.seq))
if matchDDxxD is not None and matchNSE_DTE is not None:
inter = record.seq[matchDDxxD.end():matchNSE_DTE.start()]
if len(inter) >= 139 and len(inter) <= 144:
if inter100.match(str(inter)):
print(record.id, record.seq)
else:
print(record.id, " is NOT a beta-caryophyllene synthase.")
betaCaryophyllenePredict(getFastaFile(path))
# trimm fasta headers keeping only the ID
seqkit seq -i RprolixusV48.fa > out.fasta
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment