fomightez/useful_FASTA_handling.py

## useful_FASTA_handling.py
# This is not presently all encompassing as it was started well after my sequence work repo
# at https://github.com/fomightez/sequencework , where much of this related code is.


# For making FASTA files/entriees out of dataframes, see 'specific dataframe contents saved as formatted text file example'
# in my useful pandas snippets gist https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4


# For reading in FASTA files/entries
# -See `mine_mito_features` function in `mine_mito_features_from_transcriptome.py`
# https://github.com/fomightez/sequencework/blob/master/Extract_Details_or_Annotation/mine_mito_features_from_transcriptome.py
# -See `get_fasta_seq` and `get_seq_from_URL` functions in `find_sequence_element_occurrences_in_sequence.py` in
# the https://github.com/fomightez/sequencework/tree/master/FindSequence repo
# (direct url: https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py )


# From `patmatch-binder`, `specifically PatMatch nucleic handling flags demystified.ipynb`, and
# highly based on similar handling in `find_sequence_element_occurrences_in_sequence.py`
# (https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py) &
# now adapted into a proper (slightly better) script with command line provideable arguments at
# https://github.com/fomightez/sequencework/blob/master/ConvertSeq/convert_fasta_to_reverse_complement.py :
## READ IN FASTA FILE AND CONVERT TO REVERSE COMPLEMENT USING BIOPYTHON **
from Bio import SeqIO
from Bio.Seq import Seq # for reverse complement

def get_seq_from_file(file_name):
    '''
    takes a file name and gets the sequence RECORD.
    '''
    fasta_iterator = SeqIO.parse(file_name, "fasta")
    # Use of `next()` on next line to get first FASTA -formatted sequence is
    # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
    # I think difference from `SeqIO.read()` in this approach is that it won't
    # give an error if more than one entry is in the file.
    record = next(fasta_iterator)
    #return record.seq
    #return record.id
    return record

# Read FASTA file
input_file_name = "chr15.fsa"
sequence_record = get_seq_from_file(input_file_name)
#print(sequence_record) #FOR DEBUGGING

# Read multi FASTA file ((in other words single FASTA file with multiple sequences/records multiFASTA)
records = []
for record in SeqIO.parse("TAIR9_pep_20090619.fa", "fasta"):
    records.append(record)
thale_cress_prot_length = []
for record in records:
    #print (len(record.seq))
    thale_cress_prot_length.append(len(record.seq))

# Get reverse complement
seq_rev_compl_record = sequence_record.reverse_complement()
seq_rev_compl_record.id = sequence_record.id #record needs id for writing FASTA
# or even easier, see https://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#reverse_complement
# and use argruments to keep id and description when reverse complementing
seq_rev_compl_record = sequence_record.reverse_complement(id=True,description=True)
#print(seq_rev_compl_record) #FOR DEBUGGING

# Save FASTA file for reverse complement
output_file_name = "chr15.revcompl.fa"
SeqIO.write(seq_rev_compl_record, output_file_name, "fasta");


# Save multi fasta (in other words single FASTA file with multiple sequences/records)
SeqIO.write(records,file_name, "fasta");
# based on https://www.biostars.org/p/48797/#48801 "If that's the case then
# note SeqIO.write() can take a list or a generator of SeqRecords so you should pass one of those"


# related to above , but dealing with URL source or file too:
def get_seq_from_URL(url):
    '''
    takes a URL and gets the sequence
    '''
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated from to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    html = urlopen(url)
    fasta_iterator = SeqIO.parse(StringIO(
        html.read().decode(encoding='UTF-8')), "fasta")
    # Use of `next()` on next line to get first FASTA -formatted sequence is
    # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
    # I think difference from `SeqIO.read()` in this approach is that it won't
    # give an error if more than one entry is in the html.
    # I found I needed `StringIO()` or got issues with trying to handle long file name.
    record = next(fasta_iterator)
    return record.seq

def get_fasta_seq(source):
    '''
    Takes a source URL or filepath/ file name and gets sequence if it is in
    FASTA format.
    It won't return anything if what is provided isn't FASTA format because
    Biopython handles both trying to extract FASTA from URL and from file.
    See https://stackoverflow.com/a/44294079/8508004.
    Placing it in a function it easy to then check and provide some feedback.
    '''
    if source.lower().startswith("http"):
        return get_seq_from_URL(source)
    else:
        # Read sequence, treating source as a filepath.
        # Use of `with` on next line based on http://biopython.org/wiki/SeqIO ,
        # under "Sequence Input". Otherwise, backbone based on
        # https://www.biostars.org/p/209383/, and fact `rU` mode depecated.
        with open(source, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                # print(record.seq) # for debugging
                return record.seq


# remove gaps and make a string into a SeqRecord that can be written to FASTA with `SeqIO.write(records,file_name, "fasta")`
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
new_record = SeqRecord(Seq(text_string, generic_dna).ungap("-"), id= id_, description=id_descript))# based
# on https://www.biostars.org/p/48797/ and `.ungap()` method, see
# https://github.com/biopython/biopython/issues/1511 , and `description`
# from what I've seen for `id` plus https://biopython.org/wiki/SeqIO
# Note that you can leave out `id=` part and just put the necessary string in that
# position in function call because it is a required positoinal argument

#If the 'sequence' is already a Bio.Seq.Seq object, it can be made into a record with an id, or optionnally also a description
new_record = SeqRecord(seq_obj, "XS56782"))
#-or-
new_record = SeqRecord(seq_obj, id= "XS56782", description=id_descript))
#note that a `record.decscription` will also contain the `record.id`; can be removed with `description_wo_id=record.description.split(record.id,1)[1]` so it won't show up twice


# Editing description line of FASTA from within a jupyter notebooks cell
# add identifiers to each `chr` so results for each strain clear later
chromosome_id_prefix = "chr"
def add_strain_id_to_description_line(file,strain_id):
    '''
    Takes a file and edits every description line to add
    strain_id after the caret.

    Saves the fixed file
    '''
    import sys
    output_file_name = "temp.txt"
    # prepare output file for saving so it will be open and ready
    with open(output_file_name, 'w') as output_file:

        # read in the input file
        with open(file, 'r') as input_handler:
            # prepare to give feeback later or allow skipping to certain start
            lines_processed = 0

            for line in input_handler:
                lines_processed += 1
                if line.startswith(">"):
                    rest_o_line = line.split(">")
                    new_line = ">"+strain_id + rest_o_line[1]
                else:
                    new_line = line

                # Send text to output
                output_file.write(new_line)


    # replace the original file with edited
    !mv temp.txt {file}
    # Feedback
    sys.stderr.write("\n{} chromosome identifiers tagged.".format(file))

for s in yue_et_al_strains:
    add_strain_id_to_description_line(s+".genome.fa",s)


#function to get description line text from a FASTA file
def get_descriptionline(filename):
    '''
    Takes the name of file in FASTA format and gets the first description line,
    i.e., the text after the caret on the first line.

    Returns the text after the '>' symbol.
    '''
    import sys
    output_file_name = "temp.txt"
    # prepare output file for saving so it will be open and ready
    with open(output_file_name, 'w') as output_file:

        # read in the input file
        with open(filename, 'r') as input_handler:
            # prepare to give feeback later or allow skipping to certain start
            lines_processed = 0

            for line in input_handler:
                lines_processed += 1
                if lines_process == 1 and line.startswith(">"):
                    return line.split(">")[0].strip()


# save the protein sequence as FASTA
chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
# Note that I generalized the chunking to a function that can handle any type of
# residue in `get_seq_from_multiFASTA_with_match_in_description.py`
prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
    0, len(prot_seq),chunk_size)]
prot_seq_fa = ">" + prot_descr + "\n"+ "\n".join(prot_seq_chunks)
p_output_file_name = strain_id +"_" + gene_name + "_protein_ortholog.fa"
with open(p_output_file_name, 'w') as output:
    output.write(prot_seq_fa)
prot_seqs_fn_list.append(p_output_file_name)
sys.stderr.write("\n\nProtein sequence saved as "
	"`{}`.".format(p_output_file_name))

# above code made into function for working with pyfaidx; used in `blast-binder` repo to make it easier for users
# who already had a multi-FASTA sequence file of sequeneces of interest to plug into a workflow at a later
# point
def seq_to_file(prot_seq, description, name_suffix_to_use):
    '''
    function to save a sequence as a file in FASTA-format
    '''
    # save the protein sequence as FASTA
    chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
    # multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
    # Note that I generalized the chunking to a function that can handle any type of
    # residue in `get_seq_from_multiFASTA_with_match_in_description.py`
    prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
        0, len(prot_seq),chunk_size)]
    prot_seq_fa = ">" + description + "\n"+ "\n".join(prot_seq_chunks)
    p_output_file_name = description.split()[0] + name_suffix_to_use
    with open(p_output_file_name, 'w') as output:
        output.write(prot_seq_fa)
    sys.stderr.write("\n\nProtein sequence saved as "
        "`{}`.".format(p_output_file_name))
from pyfaidx import Fasta
records = Fasta(sequence)
for seq in records:
    # make individual file (also see `!faidx --split-files {seq_file}` below)
    seq_to_file(str(seq), seq.long_name, "_protein_ortholog.fa") # I am using `seq.long_name` here because it give more options for adapting the code to make a file name one prefers; however, in developing some other code I became aware that if the FASTA files are non-standard and have an empty line above the description line, that `seq.long_name` will included the caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, just change here to `seq.name`
    # do SOMETHING WITH THAT FILE
    '''
    BLOCK TO DO SOMETHING HERE
    '''


# One metric to assess is count all the letters present (because shows number of Ns too)
# [BASIC VERSION]
from pyfaidx import Fasta
import collections
for g in genomes:
    concatenated_seqs = ""
    chrs = Fasta(g)
    for x in chrs:
        #print(x.name)
        concatenated_seqs += str(x)
    print(g,":",collections.Counter(concatenated_seqs))

# ['EXPANDED' VERSION]
from pyfaidx import Fasta
import pandas as pd
import collections
nt_counts = {}
for g in genomes:
    strain_id = g.split(".genome.fa")[0]
    concatenated_seqs = ""
    chrs = Fasta(g)
    for x in chrs:
        #print(x.name)
        concatenated_seqs += str(x)
    nt_counts[strain_id] = collections.Counter(concatenated_seqs)
nt_count_df = pd.DataFrame.from_dict(nt_counts, orient='index').fillna(0)
nt_count_df["Total_nts"] = nt_count_df.sum(1)
def percent_calc(items):
    '''
    takes a list of two items and calculates percentage of first item
    within total (second item)
    '''
    return items[0]/items[1]
nt_count_df['% N'] = nt_count_df[['N','Total_nts']].apply(percent_calc, axis=1)
nt_count_df = nt_count_df.style.format({'Total_nts':'{:.2E}','% N':'{:.2%}'})

# Alternatives for handling description lines, i.e.,lines before sequence beginning with `>`, with pyfaidx:
# By default, `.name` attribute just goes to first encountered space in entry description line. You can access full
# description with `.long_name` attribute.    <===NOTE while developing some other code I became aware that if the FASTA
# files are non-standard and have an empty line above the description line, that `seq.long_name` will included the
# caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name,
# it may be easiest to just change to `seq.name` or strip off `>` if the starts with `>` because the first one won't have it.

from pyfaidx import Fasta
genes = Fasta('tests/data/genes.fasta')
for record in genes:
   print(record.name)
   print(record.long_name)


# new in v0.4.9, you can specify the `.name` attribute be the full description line if you specify `read_long_names=True`.
from pyfaidx import Fasta
genes = Fasta('tests/data/genes.fasta', read_long_names=True)
for record in genes:
   print(record.name)

# You can also filter out based on the contents of the description line using 'filt_function'. The have
# simple example in the documentation now that filters on first letter in that line. Below is fancier
# where filters records where description line ends in patterns like '0-0' or '4182-4182'.
seqrecords = Fasta(fn, read_long_names=True,
        filt_function = lambda x: x.split(
        "extracted_coordinates: ")[1].split("-")[0] != x.split(
        "extracted_coordinates: ")[1].split("-")[1])
# Note that in this case I had details on the sequence in the description line. This won't always
# be the case so I could just use an if statement after that to conditionally skip those where
# sequence in the record isn't greater than 1, example `if len(record) > 1:`. I wanted to avoid
# adding yet another if conditional to the processing I was doing where I chose to use `filt_function` to
# filter. It seems I could also interate on the `Fasta`
# object only collecting to a new list the records where the length of the sequence was greater than 1 and then
# iterate over that list of records. Those should be fine for iterating even if I haven't figured out
# how to cast the list of records (each `<class 'pyfaidx.FastaRecord'>`) back to the the `class 'pyfaidx.Fasta'`


# count frequency of blocks of Ns in a string (presumably sequence)
import re
from collections import defaultdict
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I)  # adpated from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
    len_match_dict[len(m.group())] += 1
    matches.append(m.group())
print(len_match_dict)
print(collections.Counter(matches))


# Split a file using pyfaidx command line utility that is available in Jupyter when install pyfaidx module
seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa"
!faidx --split-files {seq_file}


# Package up a lot of sequence files (FASTA) into one archive for easy downloading , PLUS DEMONSTRATES
# merging a lot of inidvidual sequence files into one FASTA file (for Jupyter session)
#Archive the sequences (FASTA format) collected and any dataframes made
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []

def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tablular data version (tab-sepearated values).

    Returns the name of the pickled and text file.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t',index = False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
    "CTD_seq_of_protein_orthologs": CTD_seq_df,
    "first_heptad_of_protein_orthologs": first_7_df,
    "heptads_ofCTD_seq_of_protein_orthologs": repeat_df,
    "main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df,
    "fraction_matching_consensus_per_CTD": fraction_consensus_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))

# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json"
import json
with open(CTD_seqs_fn_list_storedfn, 'w') as f:
    json.dump(CTD_seqs_fn_list, f)

#for ease in aligning or other uses later save the all the CTDs as a concatenated file
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
!cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn}


archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn]
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected CTD sequences"
                 " and tables of details gathered and saved as "
                 "`{}`.".format(archive_file_name))


# Generate / create/ make multi-record FASTA file (multiFASTA. i.e., single FASTA file with multiple sequence entries) with mock / fake / simulate/ random sequence
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/nullseq.py
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/NullSeq_Functions.py
#%run nullseq.py -l 436
!pip install fire
!curl -O https://raw.githubusercontent.com/mauriceling/bactome/master/randomseq.py
#!python randomseq.py FLS -- --help #???<-- That didn't match typical Python help syntax I was familar with but it is noted at https://github.com/google/python-fire
!python randomseq.py FLS --length=706 --n=3 > mock_seqs.fa


# store FASTA seqeunces as strings within Python scripts but get the sequence back for length
# or sequence as a string
def stringFASTA2seq(s):
    '''
    Takes a FASTA file contents that are currently as a Python string and
    converts it to the sequence string. In other words, it removes the
    description line and the line breaks and just makes a sequence string.
    '''
    l = s.split("\n")
    return "".join(l[1:])

example_sequence  = '''>OMEGAminusHEG S.cerevisiae omega with homing endonuclease gene and associated DNA removedSPECULATIVE
ATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAATTGA
AGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAGCAGC
GAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGACTAG
ATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAAAATA
TTAAATAAATTTTTATTTGATAATGATATAGTCTGAACAATATAGTAATATATTGAAGTA
ATTATTTAAATGTAATTACGATAACAAAAAATTTGA
'''
length_example_sequence = len(stringFASTA2seq(example_sequence))


# see my file `about parsing fasta formatted sequence data in biopython.md`
	# This is not presently all encompassing as it was started well after my sequence work repo
	# at https://github.com/fomightez/sequencework , where much of this related code is.



	# For making FASTA files/entriees out of dataframes, see 'specific dataframe contents saved as formatted text file example'
	# in my useful pandas snippets gist https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4




	# For reading in FASTA files/entries
	# -See `mine_mito_features` function in `mine_mito_features_from_transcriptome.py`
	# https://github.com/fomightez/sequencework/blob/master/Extract_Details_or_Annotation/mine_mito_features_from_transcriptome.py
	# -See `get_fasta_seq` and `get_seq_from_URL` functions in `find_sequence_element_occurrences_in_sequence.py` in
	# the https://github.com/fomightez/sequencework/tree/master/FindSequence repo
	# (direct url: https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py )




	# From `patmatch-binder`, `specifically PatMatch nucleic handling flags demystified.ipynb`, and
	# highly based on similar handling in `find_sequence_element_occurrences_in_sequence.py`
	# (https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py) &
	# now adapted into a proper (slightly better) script with command line provideable arguments at
	# https://github.com/fomightez/sequencework/blob/master/ConvertSeq/convert_fasta_to_reverse_complement.py :
	## READ IN FASTA FILE AND CONVERT TO REVERSE COMPLEMENT USING BIOPYTHON **
	from Bio import SeqIO
	from Bio.Seq import Seq # for reverse complement

	def get_seq_from_file(file_name):
	'''
	takes a file name and gets the sequence RECORD.
	'''
	fasta_iterator = SeqIO.parse(file_name, "fasta")
	# Use of `next()` on next line to get first FASTA -formatted sequence is
	# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
	# I think difference from `SeqIO.read()` in this approach is that it won't
	# give an error if more than one entry is in the file.
	record = next(fasta_iterator)
	#return record.seq
	#return record.id
	return record

	# Read FASTA file
	input_file_name = "chr15.fsa"
	sequence_record = get_seq_from_file(input_file_name)
	#print(sequence_record) #FOR DEBUGGING

	# Read multi FASTA file ((in other words single FASTA file with multiple sequences/records multiFASTA)
	records = []
	for record in SeqIO.parse("TAIR9_pep_20090619.fa", "fasta"):
	records.append(record)
	thale_cress_prot_length = []
	for record in records:
	#print (len(record.seq))
	thale_cress_prot_length.append(len(record.seq))

	# Get reverse complement
	seq_rev_compl_record = sequence_record.reverse_complement()
	seq_rev_compl_record.id = sequence_record.id #record needs id for writing FASTA
	# or even easier, see https://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#reverse_complement
	# and use argruments to keep id and description when reverse complementing
	seq_rev_compl_record = sequence_record.reverse_complement(id=True,description=True)
	#print(seq_rev_compl_record) #FOR DEBUGGING

	# Save FASTA file for reverse complement
	output_file_name = "chr15.revcompl.fa"
	SeqIO.write(seq_rev_compl_record, output_file_name, "fasta");


	# Save multi fasta (in other words single FASTA file with multiple sequences/records)
	SeqIO.write(records,file_name, "fasta");
	# based on https://www.biostars.org/p/48797/#48801 "If that's the case then
	# note SeqIO.write() can take a list or a generator of SeqRecords so you should pass one of those"


	# related to above , but dealing with URL source or file too:
	def get_seq_from_URL(url):
	'''
	takes a URL and gets the sequence
	'''
	try:
	from StringIO import StringIO
	except ImportError:
	from io import StringIO

	# Getting html originally for just Python 3, adapted from
	# https://stackoverflow.com/a/17510727/8508004 and then updated from to
	# handle Python 2 and 3 according to same link.
	try:
	# For Python 3.0 and later
	from urllib.request import urlopen
	except ImportError:
	# Fall back to Python 2's urllib2
	from urllib2 import urlopen
	html = urlopen(url)
	fasta_iterator = SeqIO.parse(StringIO(
	html.read().decode(encoding='UTF-8')), "fasta")
	# Use of `next()` on next line to get first FASTA -formatted sequence is
	# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
	# I think difference from `SeqIO.read()` in this approach is that it won't
	# give an error if more than one entry is in the html.
	# I found I needed `StringIO()` or got issues with trying to handle long file name.
	record = next(fasta_iterator)
	return record.seq

	def get_fasta_seq(source):
	'''
	Takes a source URL or filepath/ file name and gets sequence if it is in
	FASTA format.
	It won't return anything if what is provided isn't FASTA format because
	Biopython handles both trying to extract FASTA from URL and from file.
	See https://stackoverflow.com/a/44294079/8508004.
	Placing it in a function it easy to then check and provide some feedback.
	'''
	if source.lower().startswith("http"):
	return get_seq_from_URL(source)
	else:
	# Read sequence, treating source as a filepath.
	# Use of `with` on next line based on http://biopython.org/wiki/SeqIO ,
	# under "Sequence Input". Otherwise, backbone based on
	# https://www.biostars.org/p/209383/, and fact `rU` mode depecated.
	with open(source, "r") as handle:
	for record in SeqIO.parse(handle, "fasta"):
	# print(record.seq) # for debugging
	return record.seq


	# remove gaps and make a string into a SeqRecord that can be written to FASTA with `SeqIO.write(records,file_name, "fasta")`
	from Bio.Alphabet import generic_dna
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	new_record = SeqRecord(Seq(text_string, generic_dna).ungap("-"), id= id_, description=id_descript))# based
	# on https://www.biostars.org/p/48797/ and `.ungap()` method, see
	# https://github.com/biopython/biopython/issues/1511 , and `description`
	# from what I've seen for `id` plus https://biopython.org/wiki/SeqIO
	# Note that you can leave out `id=` part and just put the necessary string in that
	# position in function call because it is a required positoinal argument

	#If the 'sequence' is already a Bio.Seq.Seq object, it can be made into a record with an id, or optionnally also a description
	new_record = SeqRecord(seq_obj, "XS56782"))
	#-or-
	new_record = SeqRecord(seq_obj, id= "XS56782", description=id_descript))
	#note that a `record.decscription` will also contain the `record.id`; can be removed with `description_wo_id=record.description.split(record.id,1)[1]` so it won't show up twice


	# Editing description line of FASTA from within a jupyter notebooks cell
	# add identifiers to each `chr` so results for each strain clear later
	chromosome_id_prefix = "chr"
	def add_strain_id_to_description_line(file,strain_id):
	'''
	Takes a file and edits every description line to add
	strain_id after the caret.

	Saves the fixed file
	'''
	import sys
	output_file_name = "temp.txt"
	# prepare output file for saving so it will be open and ready
	with open(output_file_name, 'w') as output_file:

	# read in the input file
	with open(file, 'r') as input_handler:
	# prepare to give feeback later or allow skipping to certain start
	lines_processed = 0

	for line in input_handler:
	lines_processed += 1
	if line.startswith(">"):
	rest_o_line = line.split(">")
	new_line = ">"+strain_id + rest_o_line[1]
	else:
	new_line = line

	# Send text to output
	output_file.write(new_line)


	# replace the original file with edited
	!mv temp.txt {file}
	# Feedback
	sys.stderr.write("\n{} chromosome identifiers tagged.".format(file))

	for s in yue_et_al_strains:
	add_strain_id_to_description_line(s+".genome.fa",s)


	#function to get description line text from a FASTA file
	def get_descriptionline(filename):
	'''
	Takes the name of file in FASTA format and gets the first description line,
	i.e., the text after the caret on the first line.

	Returns the text after the '>' symbol.
	'''
	import sys
	output_file_name = "temp.txt"
	# prepare output file for saving so it will be open and ready
	with open(output_file_name, 'w') as output_file:

	# read in the input file
	with open(filename, 'r') as input_handler:
	# prepare to give feeback later or allow skipping to certain start
	lines_processed = 0

	for line in input_handler:
	lines_processed += 1
	if lines_process == 1 and line.startswith(">"):
	return line.split(">")[0].strip()



	# save the protein sequence as FASTA
	chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
	# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
	# Note that I generalized the chunking to a function that can handle any type of
	# residue in `get_seq_from_multiFASTA_with_match_in_description.py`
	prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
	0, len(prot_seq),chunk_size)]
	prot_seq_fa = ">" + prot_descr + "\n"+ "\n".join(prot_seq_chunks)
	p_output_file_name = strain_id +"_" + gene_name + "_protein_ortholog.fa"
	with open(p_output_file_name, 'w') as output:
	output.write(prot_seq_fa)
	prot_seqs_fn_list.append(p_output_file_name)
	sys.stderr.write("\n\nProtein sequence saved as "
	"`{}`.".format(p_output_file_name))

	# above code made into function for working with pyfaidx; used in `blast-binder` repo to make it easier for users
	# who already had a multi-FASTA sequence file of sequeneces of interest to plug into a workflow at a later
	# point
	def seq_to_file(prot_seq, description, name_suffix_to_use):
	'''
	function to save a sequence as a file in FASTA-format
	'''
	# save the protein sequence as FASTA
	chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to
	# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does)
	# Note that I generalized the chunking to a function that can handle any type of
	# residue in `get_seq_from_multiFASTA_with_match_in_description.py`
	prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range(
	0, len(prot_seq),chunk_size)]
	prot_seq_fa = ">" + description + "\n"+ "\n".join(prot_seq_chunks)
	p_output_file_name = description.split()[0] + name_suffix_to_use
	with open(p_output_file_name, 'w') as output:
	output.write(prot_seq_fa)
	sys.stderr.write("\n\nProtein sequence saved as "
	"`{}`.".format(p_output_file_name))
	from pyfaidx import Fasta
	records = Fasta(sequence)
	for seq in records:
	# make individual file (also see `!faidx --split-files {seq_file}` below)
	seq_to_file(str(seq), seq.long_name, "_protein_ortholog.fa") # I am using `seq.long_name` here because it give more options for adapting the code to make a file name one prefers; however, in developing some other code I became aware that if the FASTA files are non-standard and have an empty line above the description line, that `seq.long_name` will included the caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, just change here to `seq.name`
	# do SOMETHING WITH THAT FILE
	'''
	BLOCK TO DO SOMETHING HERE
	'''


	# One metric to assess is count all the letters present (because shows number of Ns too)
	# [BASIC VERSION]
	from pyfaidx import Fasta
	import collections
	for g in genomes:
	concatenated_seqs = ""
	chrs = Fasta(g)
	for x in chrs:
	#print(x.name)
	concatenated_seqs += str(x)
	print(g,":",collections.Counter(concatenated_seqs))

	# ['EXPANDED' VERSION]
	from pyfaidx import Fasta
	import pandas as pd
	import collections
	nt_counts = {}
	for g in genomes:
	strain_id = g.split(".genome.fa")[0]
	concatenated_seqs = ""
	chrs = Fasta(g)
	for x in chrs:
	#print(x.name)
	concatenated_seqs += str(x)
	nt_counts[strain_id] = collections.Counter(concatenated_seqs)
	nt_count_df = pd.DataFrame.from_dict(nt_counts, orient='index').fillna(0)
	nt_count_df["Total_nts"] = nt_count_df.sum(1)
	def percent_calc(items):
	'''
	takes a list of two items and calculates percentage of first item
	within total (second item)
	'''
	return items[0]/items[1]
	nt_count_df['% N'] = nt_count_df[['N','Total_nts']].apply(percent_calc, axis=1)
	nt_count_df = nt_count_df.style.format({'Total_nts':'{:.2E}','% N':'{:.2%}'})

	# Alternatives for handling description lines, i.e.,lines before sequence beginning with `>`, with pyfaidx:
	# By default, `.name` attribute just goes to first encountered space in entry description line. You can access full
	# description with `.long_name` attribute. <===NOTE while developing some other code I became aware that if the FASTA
	# files are non-standard and have an empty line above the description line, that `seq.long_name` will included the
	# caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name,
	# it may be easiest to just change to `seq.name` or strip off `>` if the starts with `>` because the first one won't have it.

	from pyfaidx import Fasta
	genes = Fasta('tests/data/genes.fasta')
	for record in genes:
	print(record.name)
	print(record.long_name)


	# new in v0.4.9, you can specify the `.name` attribute be the full description line if you specify `read_long_names=True`.
	from pyfaidx import Fasta
	genes = Fasta('tests/data/genes.fasta', read_long_names=True)
	for record in genes:
	print(record.name)

	# You can also filter out based on the contents of the description line using 'filt_function'. The have
	# simple example in the documentation now that filters on first letter in that line. Below is fancier
	# where filters records where description line ends in patterns like '0-0' or '4182-4182'.
	seqrecords = Fasta(fn, read_long_names=True,
	filt_function = lambda x: x.split(
	"extracted_coordinates: ")[1].split("-")[0] != x.split(
	"extracted_coordinates: ")[1].split("-")[1])
	# Note that in this case I had details on the sequence in the description line. This won't always
	# be the case so I could just use an if statement after that to conditionally skip those where
	# sequence in the record isn't greater than 1, example `if len(record) > 1:`. I wanted to avoid
	# adding yet another if conditional to the processing I was doing where I chose to use `filt_function` to
	# filter. It seems I could also interate on the `Fasta`
	# object only collecting to a new list the records where the length of the sequence was greater than 1 and then
	# iterate over that list of records. Those should be fine for iterating even if I haven't figured out
	# how to cast the list of records (each `<class 'pyfaidx.FastaRecord'>`) back to the the `class 'pyfaidx.Fasta'`


	# count frequency of blocks of Ns in a string (presumably sequence)
	import re
	from collections import defaultdict
	t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
	matches = []
	len_match_dict = defaultdict(int)
	min_number_Ns_in_row_to_collect = 1
	pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adpated from
	# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
	# https://stackoverflow.com/a/250306/8508004
	for m in pattern_obj.finditer(t):
	len_match_dict[len(m.group())] += 1
	matches.append(m.group())
	print(len_match_dict)
	print(collections.Counter(matches))


	# Split a file using pyfaidx command line utility that is available in Jupyter when install pyfaidx module
	seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa"
	!faidx --split-files {seq_file}




	# Package up a lot of sequence files (FASTA) into one archive for easy downloading , PLUS DEMONSTRATES
	# merging a lot of inidvidual sequence files into one FASTA file (for Jupyter session)
	#Archive the sequences (FASTA format) collected and any dataframes made
	# Pickle each dataframe and also save as `tsv` for possible use elsewhere
	strd_dataframes_fn_list = []

	def pickle_df_and_store_as_table(dataframe, prefix):
	'''
	Take a dataframe and a filename prefix and save a pickled form of that
	dataframe and a text tablular data version (tab-sepearated values).

	Returns the name of the pickled and text file.
	'''
	dataframe.to_pickle(prefix + ".pkl")
	dataframe.to_csv(prefix + ".tsv", sep='\t',index = False)
	return prefix + ".pkl", prefix + ".tsv"
	# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
	# associated as the value
	df_n_fn_dict = {
	"CTD_seq_of_protein_orthologs": CTD_seq_df,
	"first_heptad_of_protein_orthologs": first_7_df,
	"heptads_ofCTD_seq_of_protein_orthologs": repeat_df,
	"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df,
	"fraction_matching_consensus_per_CTD": fraction_consensus_df,
	}
	import pandas as pd
	for prefix, dataframe in df_n_fn_dict.items():
	#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
	strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))

	# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling
	CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json"
	import json
	with open(CTD_seqs_fn_list_storedfn, 'w') as f:
	json.dump(CTD_seqs_fn_list, f)

	#for ease in aligning or other uses later save the all the CTDs as a concatenated file
	cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
	!cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn}


	archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn]
	archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz"
	!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
	sys.stderr.write("\nCollected CTD sequences"
	" and tables of details gathered and saved as "
	"`{}`.".format(archive_file_name))


	# Generate / create/ make multi-record FASTA file (multiFASTA. i.e., single FASTA file with multiple sequence entries) with mock / fake / simulate/ random sequence
	#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/nullseq.py
	#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/NullSeq_Functions.py
	#%run nullseq.py -l 436
	!pip install fire
	!curl -O https://raw.githubusercontent.com/mauriceling/bactome/master/randomseq.py
	#!python randomseq.py FLS -- --help #???<-- That didn't match typical Python help syntax I was familar with but it is noted at https://github.com/google/python-fire
	!python randomseq.py FLS --length=706 --n=3 > mock_seqs.fa



	# store FASTA seqeunces as strings within Python scripts but get the sequence back for length
	# or sequence as a string
	def stringFASTA2seq(s):
	'''
	Takes a FASTA file contents that are currently as a Python string and
	converts it to the sequence string. In other words, it removes the
	description line and the line breaks and just makes a sequence string.
	'''
	l = s.split("\n")
	return "".join(l[1:])

	example_sequence = '''>OMEGAminusHEG S.cerevisiae omega with homing endonuclease gene and associated DNA removedSPECULATIVE
	ATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAATTGA
	AGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAGCAGC
	GAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGACTAG
	ATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAAAATA
	TTAAATAAATTTTTATTTGATAATGATATAGTCTGAACAATATAGTAATATATTGAAGTA
	ATTATTTAAATGTAATTACGATAACAAAAAATTTGA
	'''
	length_example_sequence = len(stringFASTA2seq(example_sequence))





	# see my file `about parsing fasta formatted sequence data in biopython.md`