waldeyr/Bioinformatics tricks

## bed_to_gff3.R
library(rtracklayer)
bed.ranges <- import.bed('file.bed')
export.gff3(bed.ranges,'file.gff3')

## Bioinformatics tricks
# Just some small codes to help myself and others...

## blast.md

      
    Raw
  

              blast.md
            
          
    Blast Runnig for selected taxonomy in NCBI databases

Download the database

Download the program "update_blastdb.pl"

wget https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/c%2B%2B/src/app/blast/update_blastdb.pl?view=co -O update_blastdb.pl
sudo mv update_blastdb.pl /usr/local/bin/
Download a blastdb from NCBI

Look at ftp://ftp.ncbi.nlm.nih.gov/blast/db/README to identify the database you want.
Let's say you are interested in the ITS_RefSeq_Fungi, which is the NCBI Internal transcribed spacer region (ITS) from Fungi type and reference material.
update_blastdb.pl ITS_RefSeq_Fungi
Limiting blast search to a taxonomic range

Install E-utilities

cd ~ && sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"
Locating the taxnonomic id

get_species_taxids.sh -n Pleurotus

It will show the NCBI taxonomy ID for the Pleurotus genus which is 5320.

Creating a file with the taxonomy of interest

get_species_taxids.sh -t 5320 > Pleurotus_5320.txids
Runnig Blast over ITS database limited to the Pleurotus taxonomy

blastn -db ITS_RefSeq_Fungi -query query.fasta -taxidlist Pleurotus_5320.txids -outfmt 7 -out OUTPUT.tab

  
## collapse_reads_for_mirDeep2.py
import os
from Bio import SeqIO
import sys
'''
This script collapse reads in fastq format to fit the mirDeep2 input format.
It takes three arguments: the input reads, the output formated fasta file, a prefix of size=3 to name the experiment.
It has as dependencies the package Biopython version 1.78.
How touse it?
    `python3 fastq_to_fasta.py reads.fastq read.fasta EXP`
@author Waldeyr Mendes Cordeiro da Silva Dez-2020
Live Long and Prosper!
'''
def clear_sequence(_sequence):
    sequence = ''
    _sequence = _sequence.upper()
    for base in _sequence:
        if base in 'ATCGN':
            sequence += base
        else:
            sequence += 'N'
    return sequence

fastqFile = sys.argv[1]
fastaCollapsedFile = sys.argv[2]
readPrefix = str(sys.argv[3])
try:
    if os.path.exists(fastqFile):
        print("Parsing the fastq file...")
        records = SeqIO.parse(open(fastqFile), 'fastq')
        list_read_seq = []
        sequence_collapse = {}
        print("Collapsing reads...")
        for record in records:
            list_read_seq.append(str(record.seq).strip())
            if str(record.seq) not in sequence_collapse:
                sequence_collapse[str(record.seq)] = 0
            sequence_collapse[str(record.seq)] += 1
        print("Writing the mirDeep2 input file...")
        seqUniqueNumber = 1
        with open(fastaCollapsedFile, 'w') as writer:
            for seq, number in sequence_collapse.items():
                if len(seq) > 17:
                    # id = getMD5(seq)
                    seq = clear_sequence(seq)
                    writer.write(f">{readPrefix}_{seqUniqueNumber}_x{number}\n{seq}\n")
                    seqUniqueNumber = seqUniqueNumber + 1
        print("Done :-)")
    else:
        os.system(f'The file {fastqFile} was not found.')
except IOError:
    print('An exception occurred')

## conda_export_environment.sh
conda env export | grep -v "^prefix: " > environment.yml

## genbankGenerator.py
# Generating a Genbank File using Biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

sequences = ["GGGGAAAATTTTAAAACCCCAAAA","AAAATTTTAAAACCCCAAAAGGGG"]

listOfseqrecords = []

for sequence in sequences:
	seq = Seq(sequence, IUPAC.unambiguous_dna) # unambiguous_dna = only ACGT
	seqRecord = SeqRecord(
		seq,
		id='123', # sequence ID
		name='EXAMPLE', # Unique name (no blank spaces)
		description='Annotation for the sequence', # anotacao da enzima
		annotations= {"molecule_type":"cDNA", "date":"25-MAR-2019"}
		)
	qualifiers = {
		"EC_number":"9.9.9.9", # enzyme EC
		"info":"value" # other data
	}
	feature = SeqFeature(
		FeatureLocation( start=0, end=len(sequence) ),
		type='cDNA',
		location_operator='',
		strand=0, id=None,
		qualifiers=qualifiers,
		sub_features=None,
		ref=None,
		ref_db=None
	)
	seqRecord.features.append(feature)
	listOfseqrecords.append(seqRecord)

 	#Open the output file to write
	outputFile = open('example.gb', 'w')
	for seqRecord in listOfseqrecords:
		SeqIO.write(seqRecord, outputFile, 'genbank')

## Get fasta IDs
``grep ">" myfile.fasta | awk '{print substr($1,2)" "$2}' > temp.txt``

``grep -A 1 -e "derired_pattern" file.fa > desired_sequences.fa``

## phylogenetictree.py
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import AlignIO
from Bio.Phylo.Consensus import *

# Ler o alinhamento
msa = AlignIO.read('meu_alinhamento_multiplo.phy', 'phylip')
print msa

#####################################
# Neighbor Joinning SEM Booststrap
#####################################

# Calcular a matriz de distancias para o alinhamento
calculator = DistanceCalculator('identity') #pode ser 'blosum62' depende do que vc alinhou

matriz_distancia = calculator.get_distance(msa)
print('\nMatriz de distancia\n')
print(matriz_distancia)

# Construir a arvore com Neighbor Joinning
constructor = DistanceTreeConstructor()
tree = constructor.nj(matriz_distancia)

# Imprimir a arvore no terminal
print('\nArvore filogenetica com NJ\n')
Phylo.draw_ascii(tree)


#####################################
# COM Booststrap
#####################################

#Bootstrap alinhamento
msa_bootstrap = bootstrap(msa, 100)
constructor = DistanceTreeConstructor(calculator)
trees = bootstrap_trees(msa, 100, constructor.nj(matriz_distancia))
consensus_tree = bootstrap_consensus(msa, 100, constructor, majority_consensus)

# Imprimir a arvore no terminal
print('\nArvore filogenetica com NJ e Booststrap\n')
Phylo.draw_ascii(consensus_tree)

## randomDNA.py
from random import choice
DNA=""
length = 5000
for count in range(length):
	DNA+=choice("ACGT")
print(DNA)

## terpenSynthasesDomain.py
from Bio import SeqIO
import re

DDxxD = re.compile("(DD)[^D](Y|F)(D|R|G|H)")

NSE_DTE = re.compile("(D|N)D.{2}S.{3}E")

# 100% aa conservados
inter100 = re.compile(".{6}E.{3}F.{6}W.{8}P.{6,8}Y.{25,26}K.{12}E.{2}W.{10,13}E.{32,33}W.{4}P.{10}R.{2,3}")

path = "testFile.fasta"


# path = "betaCaryophyllene.fasta"

def getFastaFile(path: str):
    handle = open(path, "rU")
    return SeqIO.parse(handle, "fasta")


def betaCaryophyllenePredict(sequence: SeqIO):
    for record in sequence:
        matchDDxxD = DDxxD.search(str(record.seq))
        matchNSE_DTE = NSE_DTE.search(str(record.seq))
        if matchDDxxD is not None and matchNSE_DTE is not None:
            inter = record.seq[matchDDxxD.end():matchNSE_DTE.start()]
            if len(inter) >= 139 and len(inter) <= 144:
                if inter100.match(str(inter)):
                    print(record.id, record.seq)
        else:
            print(record.id, " is NOT a beta-caryophyllene synthase.")


betaCaryophyllenePredict(getFastaFile(path))

## Trimm fasta headers
# trimm fasta headers keeping only the ID
seqkit seq -i RprolixusV48.fa > out.fasta
	library(rtracklayer)
	bed.ranges <- import.bed('file.bed')
	export.gff3(bed.ranges,'file.gff3')
	import os
	from Bio import SeqIO
	import sys
	'''
	This script collapse reads in fastq format to fit the mirDeep2 input format.
	It takes three arguments: the input reads, the output formated fasta file, a prefix of size=3 to name the experiment.
	It has as dependencies the package Biopython version 1.78.
	How touse it?
	`python3 fastq_to_fasta.py reads.fastq read.fasta EXP`
	@author Waldeyr Mendes Cordeiro da Silva Dez-2020
	Live Long and Prosper!
	'''
	def clear_sequence(_sequence):
	sequence = ''
	_sequence = _sequence.upper()
	for base in _sequence:
	if base in 'ATCGN':
	sequence += base
	else:
	sequence += 'N'
	return sequence

	fastqFile = sys.argv[1]
	fastaCollapsedFile = sys.argv[2]
	readPrefix = str(sys.argv[3])
	try:
	if os.path.exists(fastqFile):
	print("Parsing the fastq file...")
	records = SeqIO.parse(open(fastqFile), 'fastq')
	list_read_seq = []
	sequence_collapse = {}
	print("Collapsing reads...")
	for record in records:
	list_read_seq.append(str(record.seq).strip())
	if str(record.seq) not in sequence_collapse:
	sequence_collapse[str(record.seq)] = 0
	sequence_collapse[str(record.seq)] += 1
	print("Writing the mirDeep2 input file...")
	seqUniqueNumber = 1
	with open(fastaCollapsedFile, 'w') as writer:
	for seq, number in sequence_collapse.items():
	if len(seq) > 17:
	# id = getMD5(seq)
	seq = clear_sequence(seq)
	writer.write(f">{readPrefix}_{seqUniqueNumber}_x{number}\n{seq}\n")
	seqUniqueNumber = seqUniqueNumber + 1
	print("Done :-)")
	else:
	os.system(f'The file {fastqFile} was not found.')
	except IOError:
	print('An exception occurred')
	# Generating a Genbank File using Biopython
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.Alphabet import IUPAC
	from Bio.SeqFeature import SeqFeature, FeatureLocation

	sequences = ["GGGGAAAATTTTAAAACCCCAAAA","AAAATTTTAAAACCCCAAAAGGGG"]

	listOfseqrecords = []

	for sequence in sequences:
	seq = Seq(sequence, IUPAC.unambiguous_dna) # unambiguous_dna = only ACGT
	seqRecord = SeqRecord(
	seq,
	id='123', # sequence ID
	name='EXAMPLE', # Unique name (no blank spaces)
	description='Annotation for the sequence', # anotacao da enzima
	annotations= {"molecule_type":"cDNA", "date":"25-MAR-2019"}
	)
	qualifiers = {
	"EC_number":"9.9.9.9", # enzyme EC
	"info":"value" # other data
	}
	feature = SeqFeature(
	FeatureLocation( start=0, end=len(sequence) ),
	type='cDNA',
	location_operator='',
	strand=0, id=None,
	qualifiers=qualifiers,
	sub_features=None,
	ref=None,
	ref_db=None
	)
	seqRecord.features.append(feature)
	listOfseqrecords.append(seqRecord)

	#Open the output file to write
	outputFile = open('example.gb', 'w')
	for seqRecord in listOfseqrecords:
	SeqIO.write(seqRecord, outputFile, 'genbank')
	``grep ">" myfile.fasta \| awk '{print substr($1,2)" "$2}' > temp.txt``

	``grep -A 1 -e "derired_pattern" file.fa > desired_sequences.fa``
	from Bio import Phylo
	from Bio.Phylo.TreeConstruction import DistanceCalculator
	from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
	from Bio import AlignIO
	from Bio.Phylo.Consensus import *

	# Ler o alinhamento
	msa = AlignIO.read('meu_alinhamento_multiplo.phy', 'phylip')
	print msa

	#####################################
	# Neighbor Joinning SEM Booststrap
	#####################################

	# Calcular a matriz de distancias para o alinhamento
	calculator = DistanceCalculator('identity') #pode ser 'blosum62' depende do que vc alinhou

	matriz_distancia = calculator.get_distance(msa)
	print('\nMatriz de distancia\n')
	print(matriz_distancia)

	# Construir a arvore com Neighbor Joinning
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(matriz_distancia)

	# Imprimir a arvore no terminal
	print('\nArvore filogenetica com NJ\n')
	Phylo.draw_ascii(tree)


	#####################################
	# COM Booststrap
	#####################################

	#Bootstrap alinhamento
	msa_bootstrap = bootstrap(msa, 100)
	constructor = DistanceTreeConstructor(calculator)
	trees = bootstrap_trees(msa, 100, constructor.nj(matriz_distancia))
	consensus_tree = bootstrap_consensus(msa, 100, constructor, majority_consensus)

	# Imprimir a arvore no terminal
	print('\nArvore filogenetica com NJ e Booststrap\n')
	Phylo.draw_ascii(consensus_tree)
	from random import choice
	DNA=""
	length = 5000
	for count in range(length):
	DNA+=choice("ACGT")
	print(DNA)
	from Bio import SeqIO
	import re

	DDxxD = re.compile("(DD)[^D](Y\|F)(D\|R\|G\|H)")

	NSE_DTE = re.compile("(D\|N)D.{2}S.{3}E")

	# 100% aa conservados
	inter100 = re.compile(".{6}E.{3}F.{6}W.{8}P.{6,8}Y.{25,26}K.{12}E.{2}W.{10,13}E.{32,33}W.{4}P.{10}R.{2,3}")

	path = "testFile.fasta"


	# path = "betaCaryophyllene.fasta"

	def getFastaFile(path: str):
	handle = open(path, "rU")
	return SeqIO.parse(handle, "fasta")


	def betaCaryophyllenePredict(sequence: SeqIO):
	for record in sequence:
	matchDDxxD = DDxxD.search(str(record.seq))
	matchNSE_DTE = NSE_DTE.search(str(record.seq))
	if matchDDxxD is not None and matchNSE_DTE is not None:
	inter = record.seq[matchDDxxD.end():matchNSE_DTE.start()]
	if len(inter) >= 139 and len(inter) <= 144:
	if inter100.match(str(inter)):
	print(record.id, record.seq)
	else:
	print(record.id, " is NOT a beta-caryophyllene synthase.")


	betaCaryophyllenePredict(getFastaFile(path))
	# trimm fasta headers keeping only the ID
	seqkit seq -i RprolixusV48.fa > out.fasta