Skip to content

Instantly share code, notes, and snippets.

@fomightez
Last active November 12, 2020 19:27
Show Gist options
  • Save fomightez/8cd6d9ba88f975b64e43eba562894dec to your computer and use it in GitHub Desktop.
Save fomightez/8cd6d9ba88f975b64e43eba562894dec to your computer and use it in GitHub Desktop.
snippets for dealing with FASTA
# This is not presently all encompassing as it was started well after my sequence work repo
# at https://github.com/fomightez/sequencework , where much of this related code is.
# For making FASTA files/entriees out of dataframes, see 'specific dataframe contents saved as formatted text file example'
# in my useful pandas snippets gist https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4
# For reading in FASTA files/entries
# -See `mine_mito_features` function in `mine_mito_features_from_transcriptome.py`
# https://github.com/fomightez/sequencework/blob/master/Extract_Details_or_Annotation/mine_mito_features_from_transcriptome.py
# -See `get_fasta_seq` and `get_seq_from_URL` functions in `find_sequence_element_occurrences_in_sequence.py` in
# the https://github.com/fomightez/sequencework/tree/master/FindSequence repo
# (direct url: https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py )
# From `patmatch-binder`, `specifically PatMatch nucleic handling flags demystified.ipynb`, and
# highly based on similar handling in `find_sequence_element_occurrences_in_sequence.py`
# (https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py) &
# now adapted into a proper (slightly better) script with command line provideable arguments at
# https://github.com/fomightez/sequencework/blob/master/ConvertSeq/convert_fasta_to_reverse_complement.py :
## READ IN FASTA FILE AND CONVERT TO REVERSE COMPLEMENT USING BIOPYTHON **
from Bio import SeqIO
from Bio.Seq import Seq # for reverse complement
def get_seq_from_file(file_name):
'''
takes a file name and gets the sequence RECORD.
'''
fasta_iterator = SeqIO.parse(file_name, "fasta")
# Use of `next()` on next line to get first FASTA -formatted sequence is
# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
# I think difference from `SeqIO.read()` in this approach is that it won't
# give an error if more than one entry is in the file.
record = next(fasta_iterator)
#return record.seq
#return record.id
return record
# Read FASTA file
input_file_name = "chr15.fsa"
sequence_record = get_seq_from_file(input_file_name)
#print(sequence_record) #FOR DEBUGGING
# Read multi FASTA file ((in other words single FASTA file with multiple sequences/records multiFASTA)
records = []
for record in SeqIO.parse("TAIR9_pep_20090619.fa", "fasta"):
records.append(record)
thale_cress_prot_length = []
for record in records:
#print (len(record.seq))
thale_cress_prot_length.append(len(record.seq))
# Get reverse complement
seq_rev_compl_record = sequence_record.reverse_complement()
seq_rev_compl_record.id = sequence_record.id #record needs id for writing FASTA
# or even easier, see https://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#reverse_complement
# and use argruments to keep id and description when reverse complementing
seq_rev_compl_record = sequence_record.reverse_complement(id=True,description=True)
#print(seq_rev_compl_record) #FOR DEBUGGING
# Save FASTA file for reverse complement
output_file_name = "chr15.revcompl.fa"
SeqIO.write(seq_rev_compl_record, output_file_name, "fasta");
# Save multi fasta (in other words single FASTA file with multiple sequences/records)
SeqIO.write(records,file_name, "fasta");
# based on https://www.biostars.org/p/48797/#48801 "If that's the case then
# note SeqIO.write() can take a list or a generator of SeqRecords so you should pass one of those"
# related to above , but dealing with URL source or file too:
def get_seq_from_URL(url):
'''
takes a URL and gets the sequence
'''
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
# Getting html originally for just Python 3, adapted from
# https://stackoverflow.com/a/17510727/8508004 and then updated from to
# handle Python 2 and 3 according to same link.
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen
html = urlopen(url)
fasta_iterator = SeqIO.parse(StringIO(
html.read().decode(encoding='UTF-8')), "fasta")
# Use of `next()` on next line to get first FASTA -formatted sequence is
# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
# I think difference from `SeqIO.read()` in this approach is that it won't
# give an error if more than one entry is in the html.
# I found I needed `StringIO()` or got issues with trying to handle long file name.
record = next(fasta_iterator)
return record.seq
def get_fasta_seq(source):
'''
Takes a source URL or filepath/ file name and gets sequence if it is in
FASTA format.
It won't return anything if what is provided isn't FASTA format because
Biopython handles both trying to extract FASTA from URL and from file.
See https://stackoverflow.com/a/44294079/8508004.
Placing it in a function it easy to then check and provide some feedback.
'''
if source.lower().startswith("http"):
return get_seq_from_URL(source)
else:
# Read sequence, treating source as a filepath.
# Use of `with` on next line based on http://biopython.org/wiki/SeqIO ,
# under "Sequence Input". Otherwise, backbone based on
# https://www.biostars.org/p/209383/, and fact `rU` mode depecated.
with open(source, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
# print(record.seq) # for debugging
return record.seq
# remove gaps and make a string into a SeqRecord that can be written to FASTA with `SeqIO.write(records,file_name, "fasta")`
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
new_record = SeqRecord(Seq(text_string, generic_dna).ungap("-"), id= id_, description=id_descript))# based
# on https://www.biostars.org/p/48797/ and `.ungap()` method, see
# https://github.com/biopython/biopython/issues/1511 , and `description`
# from what I've seen for `id` plus https://biopython.org/wiki/SeqIO
# Note that you can leave out `id=` part and just put the necessary string in that
# position in function call because it is a required positoinal argument
#If the 'sequence' is already a Bio.Seq.Seq object, it can be made into a record with an id, or optionnally also a description
new_record = SeqRecord(seq_obj, "XS56782"))
#-or-
new_record = SeqRecord(seq_obj, id= "XS56782", description=id_descript))
#note that a `record.decscription` will also contain the `record.id`; can be removed with `description_wo_id=record.description.split(record.id,1)[1]` so it won't show up twice
# Editing description line of FASTA from within a jupyter notebooks cell
# add identifiers to each `chr` so results for each strain clear later
chromosome_id_prefix = "chr"
def add_strain_id_to_description_line(file,strain_id):
'''
Takes a file and edits every description line to add
strain_id after the caret.
Saves the fixed file
'''
import sys
output_file_name = "temp.txt"
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
# read in the input file
with open(file, 'r') as input_handler:
# prepare to give feeback later or allow skipping to certain start
lines_processed = 0
for line in input_handler:
lines_processed += 1
if line.startswith(">"):
rest_o_line = line.split(">")
new_line = ">"+strain_id + rest_o_line[1]
else:
new_line = line
# Send text to output
output_file.write(new_line)
# replace the original file with edited
!mv temp.txt {file}
# Feedback
sys.stderr.write("\n{} chromosome identifiers tagged.".format(file))
for s in yue_et_al_strains:
add_strain_id_to_description_line(s+".genome.fa",s)
#function to get description line text from a FASTA file
def get_descriptionline(filename):
'''
Takes the name of file in FASTA format and gets the first description line,
i.e., the text after the caret on the first line.
Returns the text after the '>' symbol.
'''
import sys
output_file_name = "temp.txt"
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
# read in the input file
with open(filename, 'r') as input_handler:
# prepare to give feeback later or allow skipping to certain start
lines_processed = 0
for line in input_handler:
lines_processed += 1
if lines_process == 1 and line.startswith(">"):
return line.split(">")[0].strip()
# save the protein sequence as FASTA
chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
# Note that I generalized the chunking to a function that can handle any type of
# residue in `get_seq_from_multiFASTA_with_match_in_description.py`
prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
0, len(prot_seq),chunk_size)]
prot_seq_fa = ">" + prot_descr + "\n"+ "\n".join(prot_seq_chunks)
p_output_file_name = strain_id +"_" + gene_name + "_protein_ortholog.fa"
with open(p_output_file_name, 'w') as output:
output.write(prot_seq_fa)
prot_seqs_fn_list.append(p_output_file_name)
sys.stderr.write("\n\nProtein sequence saved as "
"`{}`.".format(p_output_file_name))
# above code made into function for working with pyfaidx; used in `blast-binder` repo to make it easier for users
# who already had a multi-FASTA sequence file of sequeneces of interest to plug into a workflow at a later
# point
def seq_to_file(prot_seq, description, name_suffix_to_use):
'''
function to save a sequence as a file in FASTA-format
'''
# save the protein sequence as FASTA
chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
# Note that I generalized the chunking to a function that can handle any type of
# residue in `get_seq_from_multiFASTA_with_match_in_description.py`
prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
0, len(prot_seq),chunk_size)]
prot_seq_fa = ">" + description + "\n"+ "\n".join(prot_seq_chunks)
p_output_file_name = description.split()[0] + name_suffix_to_use
with open(p_output_file_name, 'w') as output:
output.write(prot_seq_fa)
sys.stderr.write("\n\nProtein sequence saved as "
"`{}`.".format(p_output_file_name))
from pyfaidx import Fasta
records = Fasta(sequence)
for seq in records:
# make individual file (also see `!faidx --split-files {seq_file}` below)
seq_to_file(str(seq), seq.long_name, "_protein_ortholog.fa") # I am using `seq.long_name` here because it give more options for adapting the code to make a file name one prefers; however, in developing some other code I became aware that if the FASTA files are non-standard and have an empty line above the description line, that `seq.long_name` will included the caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, just change here to `seq.name`
# do SOMETHING WITH THAT FILE
'''
BLOCK TO DO SOMETHING HERE
'''
# One metric to assess is count all the letters present (because shows number of Ns too)
# [BASIC VERSION]
from pyfaidx import Fasta
import collections
for g in genomes:
concatenated_seqs = ""
chrs = Fasta(g)
for x in chrs:
#print(x.name)
concatenated_seqs += str(x)
print(g,":",collections.Counter(concatenated_seqs))
# ['EXPANDED' VERSION]
from pyfaidx import Fasta
import pandas as pd
import collections
nt_counts = {}
for g in genomes:
strain_id = g.split(".genome.fa")[0]
concatenated_seqs = ""
chrs = Fasta(g)
for x in chrs:
#print(x.name)
concatenated_seqs += str(x)
nt_counts[strain_id] = collections.Counter(concatenated_seqs)
nt_count_df = pd.DataFrame.from_dict(nt_counts, orient='index').fillna(0)
nt_count_df["Total_nts"] = nt_count_df.sum(1)
def percent_calc(items):
'''
takes a list of two items and calculates percentage of first item
within total (second item)
'''
return items[0]/items[1]
nt_count_df['% N'] = nt_count_df[['N','Total_nts']].apply(percent_calc, axis=1)
nt_count_df = nt_count_df.style.format({'Total_nts':'{:.2E}','% N':'{:.2%}'})
# Alternatives for handling description lines, i.e.,lines before sequence beginning with `>`, with pyfaidx:
# By default, `.name` attribute just goes to first encountered space in entry description line. You can access full
# description with `.long_name` attribute. <===NOTE while developing some other code I became aware that if the FASTA
# files are non-standard and have an empty line above the description line, that `seq.long_name` will included the
# caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name,
# it may be easiest to just change to `seq.name` or strip off `>` if the starts with `>` because the first one won't have it.
from pyfaidx import Fasta
genes = Fasta('tests/data/genes.fasta')
for record in genes:
print(record.name)
print(record.long_name)
# new in v0.4.9, you can specify the `.name` attribute be the full description line if you specify `read_long_names=True`.
from pyfaidx import Fasta
genes = Fasta('tests/data/genes.fasta', read_long_names=True)
for record in genes:
print(record.name)
# You can also filter out based on the contents of the description line using 'filt_function'. The have
# simple example in the documentation now that filters on first letter in that line. Below is fancier
# where filters records where description line ends in patterns like '0-0' or '4182-4182'.
seqrecords = Fasta(fn, read_long_names=True,
filt_function = lambda x: x.split(
"extracted_coordinates: ")[1].split("-")[0] != x.split(
"extracted_coordinates: ")[1].split("-")[1])
# Note that in this case I had details on the sequence in the description line. This won't always
# be the case so I could just use an if statement after that to conditionally skip those where
# sequence in the record isn't greater than 1, example `if len(record) > 1:`. I wanted to avoid
# adding yet another if conditional to the processing I was doing where I chose to use `filt_function` to
# filter. It seems I could also interate on the `Fasta`
# object only collecting to a new list the records where the length of the sequence was greater than 1 and then
# iterate over that list of records. Those should be fine for iterating even if I haven't figured out
# how to cast the list of records (each `<class 'pyfaidx.FastaRecord'>`) back to the the `class 'pyfaidx.Fasta'`
# count frequency of blocks of Ns in a string (presumably sequence)
import re
from collections import defaultdict
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adpated from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
len_match_dict[len(m.group())] += 1
matches.append(m.group())
print(len_match_dict)
print(collections.Counter(matches))
# Split a file using pyfaidx command line utility that is available in Jupyter when install pyfaidx module
seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa"
!faidx --split-files {seq_file}
# Package up a lot of sequence files (FASTA) into one archive for easy downloading , PLUS DEMONSTRATES
# merging a lot of inidvidual sequence files into one FASTA file (for Jupyter session)
#Archive the sequences (FASTA format) collected and any dataframes made
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []
def pickle_df_and_store_as_table(dataframe, prefix):
'''
Take a dataframe and a filename prefix and save a pickled form of that
dataframe and a text tablular data version (tab-sepearated values).
Returns the name of the pickled and text file.
'''
dataframe.to_pickle(prefix + ".pkl")
dataframe.to_csv(prefix + ".tsv", sep='\t',index = False)
return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"CTD_seq_of_protein_orthologs": CTD_seq_df,
"first_heptad_of_protein_orthologs": first_7_df,
"heptads_ofCTD_seq_of_protein_orthologs": repeat_df,
"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df,
"fraction_matching_consensus_per_CTD": fraction_consensus_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json"
import json
with open(CTD_seqs_fn_list_storedfn, 'w') as f:
json.dump(CTD_seqs_fn_list, f)
#for ease in aligning or other uses later save the all the CTDs as a concatenated file
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
!cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn}
archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn]
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected CTD sequences"
" and tables of details gathered and saved as "
"`{}`.".format(archive_file_name))
# Generate / create/ make multi-record FASTA file (multiFASTA. i.e., single FASTA file with multiple sequence entries) with mock / fake / simulate/ random sequence
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/nullseq.py
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/NullSeq_Functions.py
#%run nullseq.py -l 436
!pip install fire
!curl -O https://raw.githubusercontent.com/mauriceling/bactome/master/randomseq.py
#!python randomseq.py FLS -- --help #???<-- That didn't match typical Python help syntax I was familar with but it is noted at https://github.com/google/python-fire
!python randomseq.py FLS --length=706 --n=3 > mock_seqs.fa
# store FASTA seqeunces as strings within Python scripts but get the sequence back for length
# or sequence as a string
def stringFASTA2seq(s):
'''
Takes a FASTA file contents that are currently as a Python string and
converts it to the sequence string. In other words, it removes the
description line and the line breaks and just makes a sequence string.
'''
l = s.split("\n")
return "".join(l[1:])
example_sequence = '''>OMEGAminusHEG S.cerevisiae omega with homing endonuclease gene and associated DNA removedSPECULATIVE
ATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAATTGA
AGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAGCAGC
GAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGACTAG
ATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAAAATA
TTAAATAAATTTTTATTTGATAATGATATAGTCTGAACAATATAGTAATATATTGAAGTA
ATTATTTAAATGTAATTACGATAACAAAAAATTTGA
'''
length_example_sequence = len(stringFASTA2seq(example_sequence))
# see my file `about parsing fasta formatted sequence data in biopython.md`
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment