Useful snippets for Jupyter notebooks
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get the `%%time` result afterwards
%%capture out_stream
%%time
---rest of a cell that does something with LOTS of output--
# In a cell after that one, put the following to get the time it took to run the cell above:
for x in out_stream.stdout.split("\n")[-3:]:
    print(x)
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get certain output after
%%capture out_stream
---rest of a cell that does something with LOTS of output with anything to keep tagged with `#x#x#x#x#x` at start of every line --
# In a cell after that one, put the following to get the filtered output:
# output the specifically tagged stderr lines captured from the above cell
import sys
tag_used = "#x#x#x#x#x"
filtered_out = ""
for x in out_stream.stderr.split("\n"):
    if x.startswith(tag_used):
        filtered_out += x[len(tag_used):] + "\n"
# Feedback
sys.stderr.write("{}".format(filtered_out))
# Use `%%capture` to capture stdout and stderr streams and send the output to a file
%%capture out_stream
---rest of a cell that does something with output--
#In cell after, put following:
%store out_stream.stdout >output_from_cell.txt #based on https://stackoverflow.com/a/32731418/8508004
# (In an answer to a Jupyter Discourse post, I added more background on using this and options for how
# you could add showing the captured text in the notebook, too. See
# https://discourse.jupyter.org/t/how-to-write-the-output-from-previous-cell-to-a-csv-file/10319/2?u=fomightez )
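# If you also want to see the captured text in the notebook (not just send it to a file), a
# minimal sketch, assuming `out_stream` was filled by `%%capture out_stream` above:
print(out_stream.stdout)   # print just the captured stdout text
out_stream.show()          # or replay the captured stdout/stderr (and rich output) in the cell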
# Use `with io.capture_output() as captured:` to suppress output from only what is in the `with` block
# This comes from down below on that same page as the `%%capture` cell magic (https://stackoverflow.com/a/52559560/8508004)
from IPython.utils import io
with io.capture_output() as captured:
    MyFunction()
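# A small follow-up sketch (assuming `MyFunction()` printed something): the `captured` object
# holds the suppressed text, much like the `%%capture` result above:
text_printed = captured.stdout   # the suppressed stdout text; stderr is in `captured.stderr`
captured.show()                  # or replay the suppressed output on demand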
# Use %store and a triple-quoted string to write multi-line text to a file
s='''#!/bin/bash
pdb=$1
for chain in $(grep "^ATOM" $pdb | cut -b 22 | sort -u)
do
sed -n "/^.\{21\}$chain/p" $pdb > ${pdb%.pdb}_$chain.pdb
done'''
%store s >split_into_chains.sh
# clean out directory of all but one file
from shlex import quote
pathname_of_file_to_keep = quote("notebooks/Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb")
name_of_file_to_keep = quote("Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb")
# based on Olivier Dulac's comment at https://unix.stackexchange.com/questions/153862/remove-all-files-directories-except-for-one-file
%cd ..
!cp $pathname_of_file_to_keep .
!rm -rf notebooks
!mkdir notebooks
!mv $name_of_file_to_keep notebooks/
%cd notebooks
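# A pure-Python sketch of the same clean-out using shutil instead of shell commands (assumes it is
# run from the directory above `notebooks/`, with the same file names as above; shutil sidesteps
# the quoting needed for spaces in names):
import os
import shutil
dir_to_clean = "notebooks"
file_to_keep = "Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb"
shutil.copy2(os.path.join(dir_to_clean, file_to_keep), file_to_keep)  # stash a copy one level up
shutil.rmtree(dir_to_clean)                                           # remove the directory and all contents
os.mkdir(dir_to_clean)                                                # recreate it empty
shutil.move(file_to_keep, os.path.join(dir_to_clean, file_to_keep))   # move the kept file back in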
# Make a directory (folder) if it doesn't already exist
import os
directory_for_archive = "original_html"
if not os.path.isdir(directory_for_archive):
    !mkdir {directory_for_archive}
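# Alternatively, a one-liner that is a no-op if the directory already exists:
import os
os.makedirs(directory_for_archive, exist_ok=True)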
# Get a file if not yet retrieved / check if file exists
import os
file_needed = "get_seq_from_multiFASTA_with_match_in_description.py"
if not os.path.isfile(file_needed):
    #!curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed}
    os.system(f"curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed}")
# Get a list of files if not yet retrieved, checking if file exists already first
import os
files_needed = ["hhsearch_q9bsu1_uniclust_w_ss_pfamA_30.hhr",
"2uvo_hhblits.hhr",
"2uvo_hhsearch.hhr",
"hhpred_9590198.hhr"]
url_prefix = "https://raw.githubusercontent.com/biopython/biopython/master/Tests/HHsuite/"
for file_needed in files_needed:
    if not os.path.isfile(file_needed):
        !curl -OL {url_prefix+file_needed}
# Check if list of file already uploaded and if not, prompt for them
import os
import sys
files_needed = ["genome_1.fa","genome_2.fa","genome_3.fa","genome_4.fa",]
files_still_needed = []
for fn in files_needed:
    if not os.path.isfile(fn):
        files_still_needed.append(fn)
if files_still_needed:
    sys.stderr.write("\nThe following sequences still need uploading:\n - {}\n".format("\n - ".join(files_still_needed)))
    sys.exit(1)
else:
    sys.stderr.write("\nSequences needed all present.")
# Check for a file that is in an archive and then ask for archive if file not there
# and don't find the archive. Particularly useful in Binder sessions to make sure
# needed files are around and ready to run cells. HALTS NOTEBOOK CELL PROCESSING IF NOT.
# first check `an_archive_example.tar.gz` uploaded if it wasn't already extracted
import os
unpacked_example = os.path.join("directory_containing_file_when_unpacked","your_alignment_file.clustal")
file_needed = "an_archive_example.tar.gz"
import sys
if os.path.isfile(unpacked_example):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# Ask for an archive and unpack and extract enclosed dataframe
file_required = "collected_candidate_21S-containing_seqs.tar.gz"
dataframe_to_read = "extracted_21S-containing_seq_info_df.pkl"
import os
import sys
import pandas as pd
if os.path.isfile(file_required):
    !tar -xzf {file_required}
    mitolsu_frag_df = pd.read_pickle(dataframe_to_read)
    sys.stderr.write("\nFile with sequences ('{}') observed and"
        " unpacked.".format(file_required))
    sys.stderr.write("\nDataframe '{}' read in"
        ".".format(dataframe_to_read))
else:
    sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
# for when that archive to check for contains a dataframe to bring into memory in the notebook:
import os
file_needed = "an_archive_example.tar.gz"
unpacked_goal = "info_df.pkl"
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the details in
    try:
        len(previous_details_df) > 2
    except NameError as e:
        previous_details_df = pd.read_pickle(unpacked_goal)
        sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal))
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    previous_details_df = pd.read_pickle(unpacked_goal)
    sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal))
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# for when that archive to check for contains several dataframes and a list to bring into memory in the notebook (SEE JUST BELOW FOR MORE GENERAL / ONLY DATAFRAMES):
import os
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl"
df_n_fnstr_dict = {
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df",
"mito_promoter_matches_df": "df",
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df",
"disruptor_matches_df": "disrupt_df",
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df",
"grich_matches_df": "grich_df",
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df",
"endgrich_matches_df": "end_grich_df",
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df",
"twenty_nineATrich_seq_matches_df": "twenty_nine_df",
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df",
}
def read_in_data(df_n_fnstr_dict):
    #df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()]
    df_n_fnstr_dict = {"{}.pkl".format(k): v for k, v in df_n_fnstr_dict.items()}
    g = globals() # based on `how to use a string to make a python variable.md`
    for k, v in df_n_fnstr_dict.items():
        g[v] = pd.read_pickle(k)
        sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k, v))
    import json
    with open('genomes_list.json', 'r') as f:
        g["genomes"] = json.load(f)
    sys.stderr.write("\nGenomes list read back in as `genomes`.")
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the data into memory, if it isn't already
    try:
        len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2
    except (NameError, KeyError) as e:
        read_in_data(df_n_fnstr_dict)
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    read_in_data(df_n_fnstr_dict)
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
## MORE GENERAL VERSION OF THAT LAST ONE THAT DOESN'T INCLUDE ANY LIST TO READ IN
import os
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl"
df_n_fnstr_dict = {
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df",
"mito_promoter_matches_df": "df",
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df",
"disruptor_matches_df": "disrupt_df",
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df",
"grich_matches_df": "grich_df",
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df",
"endgrich_matches_df": "end_grich_df",
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df",
"twenty_nineATrich_seq_matches_df": "twenty_nine_df",
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df",
}
def read_in_pickles(df_n_fnstr_dict):
    #df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()]
    df_n_fnstr_dict = {"{}.pkl".format(k): v for k, v in df_n_fnstr_dict.items()}
    g = globals() # based on `how to use a string to make a python variable.md`
    for k, v in df_n_fnstr_dict.items():
        g[v] = pd.read_pickle(k)
        sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k, v))
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the data into memory, if it isn't already
    try:
        len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2
    except (NameError, KeyError) as e:
        read_in_pickles(df_n_fnstr_dict)
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    read_in_pickles(df_n_fnstr_dict)
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# check single file uploaded
file_required = "collected_seqs.tar.gz" # usually in another cell
import os
import sys
try:
    os.path.isfile(file_required)
except NameError:
    file_required = "collected_seqs.tar.gz"
if os.path.isfile(file_required):
    !tar -xzf collected_seqs.tar.gz
    !mv collected_seqs/* .
    !rm -rf collected_seqs
    sys.stderr.write("\nFile with sequences ('{}') observed and"
        " unpacked.".format(file_required))
else:
    sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
# Check single file uploaded with check on size
file_required = ""0_332yeast_genomesFROMshenETal2018.zip" # usually in another cell
size_expected = 2.902e+09 # in bytes # usually in another cell
# Upload the file prior to running this cell
import os
import sys
try:
    os.path.isfile(file_required)
except NameError:
    file_required = "0_332yeast_genomesFROMshenETal2018.zip"
if os.path.isfile(file_required):
    # make sure it is as large as it should be since it takes so long to upload
    f_size = os.path.getsize(file_required) # based on https://stackoverflow.com/a/2104083/8508004
    if f_size >= size_expected:
        !mkdir genomes
        !unzip -q 0_332yeast_genomesFROMshenETal2018.zip
        !unzip -q 0_332yeast_genomes/332_genome_assemblies.zip
        !mv *.fas genomes/.
        sys.stderr.write("\nGenomes archive ('{}') observed and"
            " unpacked.".format(file_required))
    else:
        sys.stderr.write("\nGenomes archive ('{}') observed but is not"
            " fully uploaded.\nWait and run this cell again.".format(file_required))
else:
    sys.stderr.write("\nGenomes archive '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
#someone else's take on some of these concepts is in post at https://twitter.com/radekosmulski/status/1129116929589940232
# check multiple files uploaded
import os
import sys
import pandas as pd
try:
    type(files_required)
except NameError:
    print("Setting `files_required`")
    files_required = ["PB_n_1011_collection_df.pkl", "other_all_stretchesN_df.pkl"]
for file_required in files_required:
    if os.path.isfile(file_required):
        if file_required == files_required[0]:
            all_df = pd.read_pickle(file_required)
        else:
            other_df = pd.read_pickle(file_required)
        sys.stderr.write("\nFile '{}' observed and"
            " unpickled.".format(file_required))
    else:
        sys.stderr.write("\nFile '{}' not seen and so nothing done"
            ".\nSeems wrong!??!\n\n".format(file_required))
        sys.exit(1)
# Check if a large remote archive has already been retrieved and unpacked. If not,
# take care of whatever is left to do to use the result. (For example, if the directory was
# set up via CyVerse to already have the archive, there is no need to retrieve it now,
# but it still needs to be unpacked.)
import os
import sys
archive_fn = "1011Assemblies.tar.gz"
archive_url = "http://1002genomes.u-strasbg.fr/files/1011Assemblies.tar.gz"
num_files_in_archive = 1011
genomes_dir = 'GENOMES_ASSEMBLED'
expected_unpacked_fn = genomes_dir+"/"+"YBV.re.fa"
def unpack_and_delete_lrg_archive(archive_fn):
    !tar -xzf {archive_fn}
    if len(os.listdir(genomes_dir)) >= num_files_in_archive:
        !rm {archive_fn}
    sys.stderr.write("\nFile with genomes ('{}') observed and"
        " unpacked.".format(archive_fn))
if os.path.isfile(expected_unpacked_fn):
    sys.stderr.write("\n**Nothing Done. Genomes from '{}' already obtained &"
        " unpacked.**".format(archive_fn))
else:
    if os.path.isfile(
        archive_fn) and not os.path.isfile(expected_unpacked_fn):
        unpack_and_delete_lrg_archive(archive_fn)
    elif not os.path.isfile(
        archive_fn) and not os.path.isfile(expected_unpacked_fn):
        sys.stderr.write("\nGenome sequences not seen, and so obtaining"
            " '{}'".format(archive_fn))
        #!curl -O {archive_url}
        os.system(f"curl -O {archive_url}")
        unpack_and_delete_lrg_archive(archive_fn)
    else:
        sys.stderr.write("\nSomething seems wrong.")
        sys.exit(1)
# Manage files with `fnmatch` (see just above about whether file uploaded, too)
# Basic fnmatch use
import os
import fnmatch
for file in os.listdir(genomes_dir):
    if fnmatch.fnmatch(file, '*.re.fa'):
        !perl patmatch_1.2/unjustify_fasta.pl {genomes_dir}/{file}
        #os.remove(os.path.join(genomes_dir, file)) #left over from development
        output = !perl patmatch_1.2/patmatch.pl -c {promoter_pattern} {genomes_dir}/{file}.prepared
        os.remove(os.path.join(genomes_dir, file + ".prepared")) # delete file made for PatMatch
        df = patmatch_results_to_df(output.n, pattern=promoter_pattern, name="promoter")
# more fnmatch basic use
tag_to_add ="1G03"
import os
import sys
import fnmatch
model_pattern = "model_*.pdb"
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, model_pattern):
        os.rename(file, tag_to_add + file)
# fnmatch use combined with checking if a related file exists yet
# categorize those annotated already and those missed
import os
import sys
import fnmatch
extension_to_check = ".fa"
extension_to_see_if_exists = ".new"
num_checked = 0
not_annotated = []
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + extension_to_check):
        num_checked += 1
        #print(file)
        first_part_filen = file.rsplit(extension_to_check, 1)[0]
        # check if corresponding `.new` file exists
        annotated_file = file + extension_to_see_if_exists
        #print(annotated_file)
        if os.path.isfile(annotated_file):
            pass
        else:
            not_annotated.append(file)
            print("No {} file?".format(annotated_file))
# Feedback
sys.stderr.write("{} sequence files checked; {} lack corresponding, "
    "\nannotated `.new` files.".format(num_checked, len(not_annotated)))
sys.stderr.write("\nThe variable `not_annotated` lists the sequences missing annotated files.")
# fnmatch to make a list of files and then do something with related files (see below for how to use
# `glob.glob()` if you just need a list and aren't doing something with the names of the files as encountered).
# (I added an example considering both, for when you do need to iterate on many files and rename, at
# https://discourse.jupyter.org/t/rename-files-using-a-for/17144/2?u=fomightez ) (An example using glob or
# fnmatch to get base file names is at https://www.biostars.org/p/9539595/#9548023 ; in relation to basename,
# keep in mind `.stem` from Path, see https://stackoverflow.com/a/47496703/8508004 )
import os
import sys
import fnmatch
extension_to_handle = ".gff3"
name_part_to_match = "mito.gff3"
associated_mito_noms= []
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        #print (file)
        first_part_filen = file.rsplit(extension_to_handle, 1)[0]
        associated_mito_noms.append(first_part_filen)
# Now delete any files that end in `mito.fa` that are not in the list of the annotation files
extension_to_handle = ".fa"
name_part_to_match = "_mito.fa"
removed = 0
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        first_part_filen = file.rsplit(extension_to_handle, 1)[0]
        if first_part_filen not in associated_mito_noms:
            os.remove(file)
            removed += 1
sys.stderr.write("\n{} files ending in `{}` removed"
    ".".format(removed, name_part_to_match))
# use fnmatch and glob in a notebook to iterate on all `.py` Python script files in a directory and run them, even subsequent ones made by the
# scripts 'dynamically' in the course of running (was to answer a StackOverflow question, see https://stackoverflow.com/a/75087369/8508004 )
import os
import fnmatch
import glob
executed_scripts = []
extension_to_match = ".py"
def execute_script(s):
    %run {s}
while set(executed_scripts) != set(glob.glob(f"*{extension_to_match}")):
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + extension_to_match):
            if file not in executed_scripts:
                execute_script(file)
                executed_scripts.append(file)
# glob use to just get list of file pathnames like part of what is done just above:
import glob
name_part_to_match = "mito.gff3"
associated_files = glob.glob(f"*.{name_part_to_match}")
# recursive search for CSV files in the current directory or subdirectories
csv_files = glob.glob("**/*.csv", recursive=True)
# I didn't find the explanation of `**` in the Python documentation (https://docs.python.org/3/library/glob.html) very clear until I read
# https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/ ,
# but for files with matches to an extension in current directory or sub directories, the example
# code in the documentation is more concise and results in easier to read and use paths.
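# The same recursive search sketched with pathlib, if Path objects are preferred over strings
# (`rglob` matches in the current directory and everything below it):
from pathlib import Path
csv_paths = list(Path(".").rglob("*.csv"))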
# fnmatch use Example with checking for a file it will produce and then running if not there. It will unpack
# a multi-entry FASTA file into a single file for each entry and rename them all to have `.mito.fa` at the end,
# leaving any original FASTA file already there at the start ending in ".fa" alone.
import os
import sys
import fnmatch
example_produced_file = "NCYC3594.mito.fa"
if not os.path.isfile(example_produced_file): # so it won't run again if it already ran
    name_part_to_match = ".fa"
    name_part_to_expand_to = ".mito.fa"
    old_files_with_ext = []
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + name_part_to_match):
            old_files_with_ext.append(file)
    files_to_not_touch_despite_match = old_files_with_ext
    seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa"
    !faidx --split-files {seq_file}
    new_fasta = []
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + name_part_to_match) and file not in files_to_not_touch_despite_match:
            new_fasta.append(file)
    # fix name if it needs fixing
    for file in new_fasta:
        if not fnmatch.fnmatch(file, '*' + name_part_to_expand_to):
            new_file_name = file.split(".fa")[0] + name_part_to_expand_to
            !mv {file} {new_file_name}
# fnmatch use Example with reading and modifying the current matched file
import os
import sys
import fnmatch
from Bio import SeqIO # used for reading/writing the FASTA records below
# (assumes `blast_to_df`, `generate_new_name`, and `start_loc` are defined in the parts elided at `#...` / elsewhere in the original notebook)
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*mt.fsa'):
        print(file)
        # !blastn -query {file} -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -out {file[:-9]}x.chrmt.comp.txt
        blast_result = !blastn -query {file} -db S288c.mt.genome.fa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq"
        blast_df = blast_to_df(blast_result.n, pickle_df=False)
        #...
        print(start_loc)
        with open(file) as handle:
            mito_seq = SeqIO.read(handle, "fasta")
        # fix, based on where it says "i.e. shift the starting point on this plasmid," @
        # http://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html
        left = mito_seq[:start_loc] # use one less than what matches '1' in
        # those cases because of zero indexing in Python; gets handled by that
        # subtraction above where `start_loc` was defined
        right = mito_seq[start_loc:]
        adj_mito_seq = right + left
        # write result after fix
        sys.stderr.write("\n\nFile with adjusted 'start' saved as "
            "'{}'.".format(generate_new_name(file),))
        SeqIO.write(
            adj_mito_seq, generate_new_name(file), "fasta");
# Use fnmatch to skip if extension has more beyond what is searched and exclude a specific file
fn_to_check = "pep.fa"
sequences = ""
import os
import fnmatch
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + fn_to_check):
        if not file.endswith(".fai") and file != "DBVPG6044.mt.pep.fa":
            sequences += get_seq_from_multiFASTA_with_match_in_description(
                file, gene_to_match, return_record_as_string=True)
# Use fnmatch to change the names of files with specific extensions (in a subdirectory) to different extensions
#Change name of genome files from ending in `.fas` to ending in `.genome.fa` ; rename files, renaming files
genomes_dir = "genomes"
old_extension = ".fas"
new_extension = ".genome.fa"
import os
import fnmatch
for file in os.listdir(genomes_dir):
    if fnmatch.fnmatch(file, '*' + old_extension):
        !mv {genomes_dir}/{file} {genomes_dir}/{file.split(old_extension)[0]+new_extension}
# Package up a lot of various data sources and output streams (if trying to pack up
# just files, keep in mind using nbzip module or if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
python_data_to_grab = {
'FILE_NAME_TO_GENERATE_A.md':(name_of_listA,"DESCRIPTION_TAG_to_put_on_top_line_of_file:"),
'FILE_NAME_TO_GENERATE_B.md': (name_of_listB,"TAG_to_put_on_top_line_of_file:"),
'FILE_NAME_TO_GENERATE_C.md':(out_stream.stderr,"TAG_to_put_on_top_line_of_file:"),
}
# PYTHON 2.7 VERSION
import sys
import contextlib
data_tag = "some_descriptive_string_here_about_info"
# fix for python 2 based on https://stackoverflow.com/a/44226422/8508004
@contextlib.contextmanager
def redirect_stdout(target):
    original = sys.stdout
    sys.stdout = target
    yield
    sys.stdout = original
for file_name_to_use, py_obj_info in python_data_to_grab.items():
    py_obj, data_name = py_obj_info
    with open(file_name_to_use, 'w') as f:
        with redirect_stdout(f):
            print(data_name + " =")
            print(str(py_obj))
# package up the files
!mkdir pertinent_data
for each_file in python_data_to_grab.keys():
    !mv {each_file} pertinent_data/.
!tar czf pertinent_data_for{set_designation}.tar.gz pertinent_data/
sys.stderr.write("Useful information for the set saved as "
    "`pertinent_data_for{}.tar.gz`".format(set_designation))
# IT'D BE BETTER TO incorporate `%store` in above, I think. see https://stackoverflow.com/a/32731418/8508004 & above here
# identify several files via file names and fnmatch and package up without placing in a directory first
# (see under 'Collecting all the results' in `Annotating mito sequences extracted from XXXX collection with MFannot and converting annotation file to gff3.ipynb` if need example with putting into directory first just using bash shell commands or
# search `!mv {each_file} pertinent_data/.` here for something similar)
archive_file_name = "annotations_for_four_putative_mitos_from332.tar.gz"
import os
import sys
import fnmatch
dl_files = []
name_part_to_match = ".new"
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        #print (file)
        #first_part_filen = file.rsplit(extension_to_handle,1)[0]
        dl_files.append(file)
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("***************************DONE***********************************\n"
    "'{}' generated. Download it.\n"
    "***************************DONE***********************************".format(archive_file_name))
#Note that when I added the `--transform` flag into making a tar, it disrupted the passing of Python into shell commands, and so even after consulting Claude and trying some things,
#I found it easier to just hardcode things in, even if blatantly redundant and breaking DRY, like so:
archive_file_name = "results_for_merged_set.tar.gz"
!tar czf {archive_file_name} --transform 's/^\./results_for_merged_set/' {" ".join(list_of_files)}
#package up several files for download (if trying to pack up just files, keep in mind using nbzip module as alternative if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
# make one file for downloading
archive_file_name = "collected_files.tar.gz"
dl_files = [x + "_tag.fa" for f in file_list]
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("*****************DONE***********************************\n"
"'{}' generated. Download it.\n"
"*****************DONE***********************************".format(archive_file_name))
#package up several files and files made from captured output stream for download, where (keep in mind using nbzip module as alternative if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
# make one file for downloading
archive_file_name = "collected_files.tar.gz"
dl_files = [x + "_tag.fa" for f in file_list]
# save & add the additional information files to collect
output_txt_filename_a = 'seqs_filtered_info.txt'
output_txt_filename_b = 'seqs_filtered.txt'
%store captured_stream_a.stderr >{output_txt_filename_a} #based on https://stackoverflow.com/a/32731418/8508004
%store captured_stream_b.stdout >{output_txt_filename_b} #based on https://stackoverflow.com/a/32731418/8508004
dl_files += [output_txt_filename_a, output_txt_filename_b] # or if really only one, `dl_files.append(filtered_out)`
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("*****************DONE***********************************\n"
"{} generated. Download it.\n"
"*****************DONE***********************************".format(archive_file_name))
# note: based on https://stackoverflow.com/a/32731418/8508004, `%store` was used above to replace the following:
with open(output_txt_filename_a, 'w') as output_handler:
    output_handler.write(captured_stream_a.stderr)
with open(output_txt_filename_b, 'w') as output_handler:
    output_handler.write(captured_stream_b.stdout)
# Package up several dataframes and sequences
#Archive the CTD sequences (FASTA format) collected and any dataframes made
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"CTD_seq_of_protein_orthologs": CTD_seq_df,
"first_heptad_of_protein_orthologs": first_7_df,
"heptads_ofCTD_seq_of_protein_orthologs": repeat_df,
"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df,
"fraction_matching_consensus_per_CTD": fraction_consensus_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json"
import json
with open(CTD_seqs_fn_list_storedfn, 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
# see my useful python snippets for reading json back in
#for ease in aligning or other uses later save the all the CTDs as a concatenated file
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
# !cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} # faster but not as good as awk if files don't already have newlines at end;
# just results in the lines of the files as one long run on that won't work for aligning
!awk 1 {" ".join(ortholog_prot_seqs)} > {cat_fasta_fn} #based on https://stackoverflow.com/a/25030513/8508004
archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn]
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected CTD sequences"
" and tables of details gathered and saved as "
"`{}`.".format(archive_file_name))
# Package up several dataframes and a list of genomes (see just below for only with dataframes)
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []
# store `genomes` as json since lighter-weight and more portable than pickling
# for easy json dumping for many list use when archiving:
file_names_for_lists_dict = {
"genomes_list.json":genomes,
}
import json
for fn, lizt in file_names_for_lists_dict.items():
    with open(fn, 'w') as f:
        json.dump(lizt, f)
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"mito_promoter_matches_df": df,
"mito_promoter_hit_num_tallies_by_id_df": largest_hit_num_by_id_df,
"disruptor_matches_df": disrupt_df,
"disruptor_hit_num_tallies_by_id_df": largest_disr_num_by_id_df,
"grich_matches_df": grich_df,
"grich_hit_num_tallies_by_id_df": largest_grich_num_by_id_df,
"endgrich_matches_df": end_grich_df,
"endgrich_hit_num_tallies_by_id_df": largest_endgrich_num_by_id_df,
"twenty_nineATrich_seq_matches_df": twenty_nine_df,
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": largest_ATrich_num_by_id_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
archiving_fn_list = strd_dataframes_fn_list + list(file_names_for_lists_dict.keys())
archive_file_name = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected dataframes"
" and tables of details gathered and saved as "
"`{}`.".format(archive_file_name))
# for archiving just several dataframes with automated handling
archive_file_name = "dataframes_archived.tar.gz"
strd_dataframes_fn_list = []
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"df": df,
"another_df": another_df,
"yet_another_df": yet_another_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
archiving_fn_list = strd_dataframes_fn_list
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected dataframes"
" gathered and saved as "
"`{}`.".format(archive_file_name))
# for easy json dumping for many list use when archiving:
file_names_for_lists_dict = {
"annotation_fns.json":annot_fns,
"genome_fnss.json":genomes_for_anot_fns,
"fn_pairings.json":file_pairs,
}
import json
for fn, lizt in file_names_for_lists_dict.items():
    with open(fn, 'w') as f:
        json.dump(lizt, f)
# Use curl to get a FASTA file from OCA and remove the HTML tags (may need `!pip install BS4` first)
# Get FASTA file for the non yeast one
import os
#!curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A
os.system("curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A")
# remove HTML to leave actual FASTA
# based on https://stackoverflow.com/a/21577649/8508004 and https://unix.stackexchange.com/a/64747
import sys
from bs4 import BeautifulSoup
oca_file_to_fix = "1x0t_A.fa"
soup = BeautifulSoup(open(oca_file_to_fix), "html.parser")
for pre in soup.findAll("pre"):
    fasta = pre.contents
%store fasta[0] >{oca_file_to_fix}
# NOTE ABOUT THE READING PART OF THIS NEXT BLOCK: it seems the more modern Pythonic way
# is to leave out the `,'r'` part. See https://stackabuse.com/read-a-file-line-by-line-in-python/ under
# 'Read a File Line-by-Line with a for Loop - Most Pythonic Approach'. Note also that it is
# best to use `.strip()` (or possibly slice `[:-1]`) to remove the line ending if you are going to
# rearrange lines, because you can get a weird merge if you alter the order, since the last line
# usually will not have a newline character.
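# A minimal sketch of the pattern that note describes, before the full example below
# ("example_input.txt" is just a placeholder name):
lines = []
with open("example_input.txt") as input_handler:  # no `,'r'` needed
    for line in input_handler:
        lines.append(line.strip())                # drop the line ending before any reordering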
# add identifiers to each `chr` so results for each strain clear later
chromosome_id_prefix = "chr"
def add_strain_id_to_description_line(file, strain_id):
    '''
    Takes a file and edits every description line to add
    strain_id after the caret.
    Saves the fixed file.
    '''
    import sys
    output_file_name = "temp.txt"
    # prepare output file for saving so it will be open and ready
    with open(output_file_name, 'w') as output_file:
        # read in the input file
        with open(file, 'r') as input_handler: # OR SEE NOTE ABOVE THIS CODE BLOCK ON WHY `, 'r'` ISN'T NEEDED ANYMORE.
            # prepare to give feedback later or allow skipping to a certain start
            lines_processed = 0
            for line in input_handler:
                lines_processed += 1
                if line.startswith(">"):
                    rest_o_line = line.split(">")
                    new_line = ">" + strain_id + rest_o_line[1]
                else:
                    new_line = line
                # Send text to output
                output_file.write(new_line)
    # replace the original file with the edited version
    !mv temp.txt {file}
    # Feedback
    sys.stderr.write("\n{} chromosome identifiers tagged.".format(file))
for s in yue_et_al_strains:
add_strain_id_to_description_line(s+".genome.fa",s)
# A find / replace similar to last example but pure Python (no-IPython magics or shell use)
# (See `testing_repeat_number_by_looping_bendit_analysis.ipynb` for practical use of this to change a script on a loop to monitor effect on outcome)
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s):
    '''
    Change the plot title to the provided text.
    '''
    with open(script_name, 'r') as thefile:
        script = thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE")
# Note for making substitutions: Python now allows you to use f-strings (formatted string literals) to substitute
# variables into strings by name, but Python strings also have 'Template strings' built in (Ex. `import string; t = string.Template('Hello, $name!'); print(t.substitute(name='World'))`)
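# Quick illustration of both of those substitution options:
import string
name = "World"
print(f"Hello, {name}!")                       # f-string (Python 3.6+)
t = string.Template('Hello, $name!')
print(t.substitute(name=name))                 # Template string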
# Collect list of image files in a directory
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs_in_directory = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs_in_directory.append(file)
len(list_imgs_in_directory)
#Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list final file extension , see 'Path.suffix' at
#https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
#https://docs.python.org/3/library/pathlib.html
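# e.g., for a name like `archive.tar.gz` this gives ['.tar', '.gz'] rather than just '.gz'
[item.suffixes for item in Path('.').glob('*')]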
# Collect list of image files in a directory and display them in a Jupyter
# notebook cell
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
from IPython.display import Image
from IPython.display import display
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs.append(Path(img_folder, file))
imgl = [Image(filename=str(x)) for x in list_imgs] #had to cast the
# path object to a string or else `display.py` was giving error
# `'PosixPath' object has no attribute 'split'`;seems `display.py` not able to
# handle path objects yet.
display(*imgl)
# Collect list of image files in a directory and display them in a Jupyter
# notebook cell WITH FILE NAMES SHOWN BELOW EACH
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
from IPython.display import Image
from IPython.display import display
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs.append(Path(img_folder, file))
for i in list_imgs:
    display(Image(filename=str(i)))
    print("ABOVE: {}".format(i.name))
#slide carousel-like example to show a subset of images that changes every five seconds (from `demo_palette.ipynb` in pymol-binder) with HTML labels for each image to make the text stand out:
import IPython.display as ipd
import time
import os
import sys
import random
def display_subset():
    img = {}
    for x in random.sample(range(shuffles_to_do), 3):
        img[x] = ipd.Image(filename="img_{}.png".format(x))
        ipd.display(img[x])
        ipd.display(ipd.HTML('ABOVE:&nbsp;<font size=5><b>img_{}.png</b></font>'.format(x)))
    time.sleep(5)
    ipd.clear_output(wait=True)
while True:
    display_subset()
# Subset / restrict to a random sampling of items in a list , based on https://pynative.com/python-random-sample/
# Good for doing right before EVERYTHING GETS PROCESSED to pick a subset for testing, instead
# of defining specifically
import random
genomes = random.sample(population=genomes, k=15)
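# If the test subset should be reproducible across runs, seed the generator first
# (a small sketch; the seed value is arbitrary):
import random
random.seed(42)
genomes = random.sample(population=genomes, k=15)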
# Run a function every 8 minutes
%load https://gist.githubusercontent.com/fomightez/b012e51ebef6ec58c1515df3ee0c850a/raw/300da6c67ceeaf5384a3e500648b993345c361cb/run_every_eight_mins.py
# RELOAD for when you are using `from python_file_containing_function import a_function` (Python 3)
# Reload a function into a notebook after editing the script file in editor of running session;
# this allows calling the function in the notebook whereas if just reload the script won't
import importlib
import python_file_containing_function; importlib.reload(python_file_containing_function); from python_file_containing_function import a_function
# above line from https://stackoverflow.com/a/11724154/8508004
# RELOAD for when you are using `import python_file` (Python 3)
# Reload a script into a notebook after editing the script file in editor of running session;
# note it is much more easily done then the case where using `from foo import foo`, but
# `from foo import foo` makes it easier to work in a notebook in many ways.
import importlib; import python_file; importlib.reload(python_file)
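# A related option is IPython's autoreload extension, which re-imports edited modules
# automatically before each cell runs (run once near the top of the notebook):
%load_ext autoreload
%autoreload 2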
# Create a download link in Jupyter notebook; from
# https://medium.com/ibm-data-science-experience/how-to-upload-download-files-to-from-notebook-in-my-local-machine-6a4e65a15767
# <-- Haven't tried it yet but it might be handy
# for an idea I am working on for making animations from pymol files using jmol, or anywhere I suggest
# downloading an archive of results
import base64
from IPython.display import HTML
def create_download_link(df, title="Download CSV file", filename="data.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)
create_download_link(df)
# For handling archive files to make a clickable download link, I found the section 'Create and download CSV/zip file' at https://blog.softhints.com/jupyter-ipython-download-files/ ; however, the code seems incomplete as I don't see how they make the zip file in conjunction with sending it through as the payload. (I assume `create_download_files()` was triggered elsewhere already.) And a minor thing: why not return `HTML(html)` in that code block?
# Maybe Some of the answers here might help me reverse that Zipfile approach so it works to download to local?
# https://stackoverflow.com/questions/5710867/downloading-and-unzipping-a-zip-file-without-writing-to-disk
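# One way to sketch that idea without writing the archive to disk first: build the zip in memory
# and embed it in a data-URI link (untested sketch; "data.csv" / "data.zip" are placeholder names):
import base64
import io
import zipfile
from IPython.display import HTML
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write("data.csv")   # add whatever files belong in the archive
payload = base64.b64encode(buffer.getvalue()).decode()
HTML('<a download="data.zip" href="data:application/zip;base64,{}" target="_blank">Download zip</a>'.format(payload))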
# Related to the topic of making downloadable links from Jupyter pages, I found https://stackoverflow.com/questions/26497912/trigger-file-download-within-ipython-notebook
# and
# https://stackoverflow.com/questions/24437661/retrieving-files-from-remote-ipython-notebook-server/24439480#24439480 about
# FileLink / FileLinks; however, in JupyterLab if it is a gif or png that JupyterLab renders, it opens it in the application
# instead of allowing download. And if it is a tarball that it doesn't render and you click on it, instead of offering to download
# it, it says it isn't UTF-8 encoded.
# Fortunately when in Voila apps, you can list the files with the following:
from IPython.display import FileLink, FileLinks
FileLinks(".")
# And in VOILA those can be right clicked on and downloaded to local drive from those links using `Save link as..`.
# However, a better, related solution for in Voila, because it makes a pop-up automatically without needing the user to use `Save link as..`, is:
%%html
<a href="SVM_Confusion_Matrix.jpg" download="SVM_Confusion_Matrix.jpg">Click HERE to Download SVM image</a>
# Using Panel (installable via pip) in a notebook (NOT VOILA) you can make a download file, too:
import panel as pn
pn.extension()
# Create option to download SVM Confusion Matrix Graphic
pn.widgets.FileDownload(
    file="SVM_Confusion_Matrix.jpg",
    embed=False,
    name="Save SVM Confusion Matrix image"
)