Useful snippets for Jupyter notebooks
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get the `%%time` result afterwards
%%capture out_stream
%%time
---rest of a cell that does something with LOTS of output--
# In a cell after that one, put the following to get the time it took to run the cell above:
for x in out_stream.stdout.split("\n")[-3:]:
    print(x)
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get certain output after
%%capture out_stream
---rest of a cell that does something with LOTS of output with anything to keep tagged with `#x#x#x#x#x` at start of every line --
# In a cell after that one, put the following to get the filtered output:
# output the specifically tagged stderr lines captured from the above cell
import sys
tag_used = "#x#x#x#x#x"
filtered_out = ""
for x in out_stream.stderr.split("\n"):
    if x.startswith(tag_used):
        filtered_out += x[len(tag_used):] + "\n"
# Feedback
sys.stderr.write("{}".format(filtered_out))
# Use `%%capture` to capture stdout and stderr streams and send the output to a file
%%capture out_stream
---rest of a cell that does something with output--
#In cell after, put following:
%store out_stream.stdout >output_from_cell.txt #based on https://stackoverflow.com/a/32731418/8508004
# (In an answer to a Jupyter Discourse post, I added more background on using this and options for how
# you could add showing the captured text in the notebook, too. See
# https://discourse.jupyter.org/t/how-to-write-the-output-from-previous-cell-to-a-csv-file/10319/2?u=fomightez )
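# If you also want to see the captured text in the notebook (not just send it to a file), a
# minimal sketch, assuming `out_stream` was filled by `%%capture out_stream` above:
print(out_stream.stdout)   # print just the captured stdout text
out_stream.show()          # or replay the captured stdout/stderr (and rich output) in the cell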
# Use `with io.capture_output() as captured:` to suppress output from only what is in the `with` block
# This comes from down below on that same page as the `%%capture` cell magic (https://stackoverflow.com/a/52559560/8508004)
from IPython.utils import io
with io.capture_output() as captured:
    MyFunction()
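# A small follow-up sketch (assuming `MyFunction()` printed something): the `captured` object
# holds the suppressed text, much like the `%%capture` result above:
text_printed = captured.stdout   # the suppressed stdout text; stderr is in `captured.stderr`
captured.show()                  # or replay the suppressed output on demand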
# Use %store and a triple-quoted string to write multi-line text to a file
s='''#!/bin/bash
pdb=$1
for chain in $(grep "^ATOM" $pdb | cut -b 22 | sort -u)
do
sed -n "/^.\{21\}$chain/p" $pdb > ${pdb%.pdb}_$chain.pdb
done'''
%store s >split_into_chains.sh
# clean out directory of all but one file
from shlex import quote
pathname_of_file_to_keep = quote("notebooks/Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb")
name_of_file_to_keep = quote("Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb")
# based on Olivier Dulac's comment at https://unix.stackexchange.com/questions/153862/remove-all-files-directories-except-for-one-file
%cd ..
!cp $pathname_of_file_to_keep .
!rm -rf notebooks
!mkdir notebooks
!mv $name_of_file_to_keep notebooks/
%cd notebooks
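# A pure-Python sketch of the same clean-out using shutil instead of shell commands (assumes it is
# run from the directory above `notebooks/`, with the same file names as above; shutil sidesteps
# the quoting needed for spaces in names):
import os
import shutil
dir_to_clean = "notebooks"
file_to_keep = "Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb"
shutil.copy2(os.path.join(dir_to_clean, file_to_keep), file_to_keep)  # stash a copy one level up
shutil.rmtree(dir_to_clean)                                           # remove the directory and all contents
os.mkdir(dir_to_clean)                                                # recreate it empty
shutil.move(file_to_keep, os.path.join(dir_to_clean, file_to_keep))   # move the kept file back in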
# Make a directory (folder) if it doesn't already exist
import os
directory_for_archive = "original_html"
if not os.path.isdir(directory_for_archive):
    !mkdir {directory_for_archive}
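# Alternatively, a one-liner that is a no-op if the directory already exists:
import os
os.makedirs(directory_for_archive, exist_ok=True)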
# Get a file if not yet retrieved / check if file exists
import os
file_needed = "get_seq_from_multiFASTA_with_match_in_description.py"
if not os.path.isfile(file_needed):
    #!curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed}
    os.system(f"curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed}")
# Get a list of files if not yet retrieved, checking if file exists already first
import os
files_needed = ["hhsearch_q9bsu1_uniclust_w_ss_pfamA_30.hhr",
"2uvo_hhblits.hhr",
"2uvo_hhsearch.hhr",
"hhpred_9590198.hhr"]
url_prefix = "https://raw.githubusercontent.com/biopython/biopython/master/Tests/HHsuite/"
for file_needed in files_needed:
    if not os.path.isfile(file_needed):
        !curl -OL {url_prefix+file_needed}
# Check if list of file already uploaded and if not, prompt for them
import os
import sys
files_needed = ["genome_1.fa","genome_2.fa","genome_3.fa","genome_4.fa",]
files_still_needed = []
for fn in files_needed:
    if not os.path.isfile(fn):
        files_still_needed.append(fn)
if files_still_needed:
    sys.stderr.write("\nThe following sequences still need uploading:\n - {}\n".format("\n - ".join(files_still_needed)))
    sys.exit(1)
else:
    sys.stderr.write("\nSequences needed all present.")
# Check for a file that is in an archive and then ask for archive if file not there
# and don't find the archive. Particularly useful in Binder sessions to make sure
# needed files are around and ready to run cells. HALTS NOTEBOOK CELL PROCESSING IF NOT.
# first check `an_archive_example.tar.gz` uploaded if it wasn't already extracted
import os
unpacked_example = os.path.join("directory_containing_file_when_unpacked","your_alignment_file.clustal")
file_needed = "an_archive_example.tar.gz"
import sys
if os.path.isfile(unpacked_example):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# Ask for an archive and unpack and extract enclosed dataframe
file_required = "collected_candidate_21S-containing_seqs.tar.gz"
dataframe_to_read = "extracted_21S-containing_seq_info_df.pkl"
import os
import sys
import pandas as pd
if os.path.isfile(file_required):
    !tar -xzf {file_required}
    mitolsu_frag_df = pd.read_pickle(dataframe_to_read)
    sys.stderr.write("\nFile with sequences ('{}') observed and"
        " unpacked.".format(file_required))
    sys.stderr.write("\nDataframe '{}' read in"
        ".".format(dataframe_to_read))
else:
    sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
# for when that archive to check for contains a dataframe to bring into memory in the notebook:
import os
file_needed = "an_archive_example.tar.gz"
unpacked_goal = "info_df.pkl"
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the details in
    try:
        len(previous_details_df) > 2
    except NameError as e:
        previous_details_df = pd.read_pickle(unpacked_goal)
        sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal))
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    previous_details_df = pd.read_pickle(unpacked_goal)
    sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal))
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# for when that archive to check for contains several dataframes and a list to bring into memory in the notebook (SEE JUST BELOW FOR MORE GENERAL / ONLY DATAFRAMES):
import os
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl"
df_n_fnstr_dict = {
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df",
"mito_promoter_matches_df": "df",
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df",
"disruptor_matches_df": "disrupt_df",
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df",
"grich_matches_df": "grich_df",
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df",
"endgrich_matches_df": "end_grich_df",
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df",
"twenty_nineATrich_seq_matches_df": "twenty_nine_df",
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df",
}
def read_in_data(df_n_fnstr_dict):
    #df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()]
    df_n_fnstr_dict = {"{}.pkl".format(k): v for k, v in df_n_fnstr_dict.items()}
    g = globals() # based on `how to use a string to make a python variable.md`
    for k, v in df_n_fnstr_dict.items():
        g[v] = pd.read_pickle(k)
        sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k, v))
    import json
    with open('genomes_list.json', 'r') as f:
        g["genomes"] = json.load(f)
    sys.stderr.write("\nGenomes list read back in as `genomes`.")
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the data into memory, if it isn't already
    try:
        len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2
    except (NameError, KeyError) as e:
        read_in_data(df_n_fnstr_dict)
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    read_in_data(df_n_fnstr_dict)
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
## MORE GENERAL VERSION OF THAT LAST ONE THAT DOESN'T INCLUDE ANY LIST TO READ IN
import os
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl"
df_n_fnstr_dict = {
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df",
"mito_promoter_matches_df": "df",
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df",
"disruptor_matches_df": "disrupt_df",
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df",
"grich_matches_df": "grich_df",
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df",
"endgrich_matches_df": "end_grich_df",
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df",
"twenty_nineATrich_seq_matches_df": "twenty_nine_df",
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df",
}
def read_in_pickles(df_n_fnstr_dict):
    #df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()]
    df_n_fnstr_dict = {"{}.pkl".format(k): v for k, v in df_n_fnstr_dict.items()}
    g = globals() # based on `how to use a string to make a python variable.md`
    for k, v in df_n_fnstr_dict.items():
        g[v] = pd.read_pickle(k)
        sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k, v))
import sys
import pandas as pd
if os.path.isfile(unpacked_goal):
    sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed))
    # bring the data into memory, if it isn't already
    try:
        len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2
    except (NameError, KeyError) as e:
        read_in_pickles(df_n_fnstr_dict)
elif os.path.isfile(file_needed):
    !tar xzf {file_needed}
    read_in_pickles(df_n_fnstr_dict)
else:
    sys.stderr.write("\n\n*****************ERROR**************************\n"
        "The file '{0}' is needed.\n"
        "Upload '{0}' to this Jupyter session and re-run this cell.\n"
        "*****************ERROR**************************\n".format(file_needed))
    sys.exit(1)
# check single file uploaded
file_required = "collected_seqs.tar.gz" # usually in another cell
import os
import sys
try:
    os.path.isfile(file_required)
except NameError:
    file_required = "collected_seqs.tar.gz"
if os.path.isfile(file_required):
    !tar -xzf collected_seqs.tar.gz
    !mv collected_seqs/* .
    !rm -rf collected_seqs
    sys.stderr.write("\nFile with sequences ('{}') observed and"
        " unpacked.".format(file_required))
else:
    sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
# Check single file uploaded with check on size
file_required = ""0_332yeast_genomesFROMshenETal2018.zip" # usually in another cell
size_expected = 2.902e+09 # in bytes # usually in another cell
# Upload the file prior to running this cell
import os
import sys
try:
    os.path.isfile(file_required)
except NameError:
    file_required = "0_332yeast_genomesFROMshenETal2018.zip"
if os.path.isfile(file_required):
    # make sure it is as large as it should be since it takes so long to upload
    f_size = os.path.getsize(file_required) # based on https://stackoverflow.com/a/2104083/8508004
    if f_size >= size_expected:
        !mkdir genomes
        !unzip -q 0_332yeast_genomesFROMshenETal2018.zip
        !unzip -q 0_332yeast_genomes/332_genome_assemblies.zip
        !mv *.fas genomes/.
        sys.stderr.write("\nGenomes archive ('{}') observed and"
            " unpacked.".format(file_required))
    else:
        sys.stderr.write("\nGenomes archive ('{}') observed but is not"
            " fully uploaded.\nWait and run this cell again.".format(file_required))
else:
    sys.stderr.write("\nGenomes archive '{}' not seen and so nothing done"
        ". Seems wrong.".format(file_required))
    sys.exit(1)
#someone else's take on some of these concepts is in post at https://twitter.com/radekosmulski/status/1129116929589940232
# check multiple files uploaded
import os
import sys
import pandas as pd
try:
    type(files_required)
except NameError:
    print("Setting `files_required`")
    files_required = ["PB_n_1011_collection_df.pkl", "other_all_stretchesN_df.pkl"]
for file_required in files_required:
    if os.path.isfile(file_required):
        if file_required == files_required[0]:
            all_df = pd.read_pickle(file_required)
        else:
            other_df = pd.read_pickle(file_required)
        sys.stderr.write("\nFile '{}' observed and"
            " unpickled.".format(file_required))
    else:
        sys.stderr.write("\nFile '{}' not seen and so nothing done"
            ".\nSeems wrong!??!\n\n".format(file_required))
        sys.exit(1)
# Check if a large remote archive has already been retrieved and unpacked. If not,
# take care of whatever is left to do to use the result. (For example, if the directory was
# set up via CyVerse to already have the archive, there is no need to retrieve it now,
# but it still needs to be unpacked.)
import os
import sys
archive_fn = "1011Assemblies.tar.gz"
archive_url = "http://1002genomes.u-strasbg.fr/files/1011Assemblies.tar.gz"
num_files_in_archive = 1011
genomes_dir = 'GENOMES_ASSEMBLED'
expected_unpacked_fn = genomes_dir+"/"+"YBV.re.fa"
def unpack_and_delete_lrg_archive(archive_fn):
    !tar -xzf {archive_fn}
    if len(os.listdir(genomes_dir)) >= num_files_in_archive:
        !rm {archive_fn}
    sys.stderr.write("\nFile with genomes ('{}') observed and"
        " unpacked.".format(archive_fn))
if os.path.isfile(expected_unpacked_fn):
    sys.stderr.write("\n**Nothing Done. Genomes from '{}' already obtained &"
        " unpacked.**".format(archive_fn))
else:
    if os.path.isfile(
        archive_fn) and not os.path.isfile(expected_unpacked_fn):
        unpack_and_delete_lrg_archive(archive_fn)
    elif not os.path.isfile(
        archive_fn) and not os.path.isfile(expected_unpacked_fn):
        sys.stderr.write("\nGenome sequences not seen, and so obtaining"
            " '{}'".format(archive_fn))
        #!curl -O {archive_url}
        os.system(f"curl -O {archive_url}")
        unpack_and_delete_lrg_archive(archive_fn)
    else:
        sys.stderr.write("\nSomething seems wrong.")
        sys.exit(1)
# Manage files with `fnmatch` (see just above about whether file uploaded, too)
# Basic fnmatch use
import os
import fnmatch
for file in os.listdir(genomes_dir):
    if fnmatch.fnmatch(file, '*.re.fa'):
        !perl patmatch_1.2/unjustify_fasta.pl {genomes_dir}/{file}
        #os.remove(os.path.join(genomes_dir, file)) #left over from development
        output = !perl patmatch_1.2/patmatch.pl -c {promoter_pattern} {genomes_dir}/{file}.prepared
        os.remove(os.path.join(genomes_dir, file + ".prepared")) # delete file made for PatMatch
        df = patmatch_results_to_df(output.n, pattern=promoter_pattern, name="promoter")
# more fnmatch basic use
tag_to_add ="1G03"
import os
import sys
import fnmatch
model_pattern = "model_*.pdb"
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, model_pattern):
        os.rename(file, tag_to_add + file)
# fnmatch use combined with checking if a related file exists yet
# categorize those annotated already and those missed
import os
import sys
import fnmatch
extension_to_check = ".fa"
extension_to_see_if_exists = ".new"
num_checked = 0
not_annotated = []
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + extension_to_check):
        num_checked += 1
        #print(file)
        first_part_filen = file.rsplit(extension_to_check, 1)[0]
        # check if corresponding `.new` file exists
        annotated_file = file + extension_to_see_if_exists
        #print(annotated_file)
        if os.path.isfile(annotated_file):
            pass
        else:
            not_annotated.append(file)
            print("No {} file?".format(annotated_file))
# Feedback
sys.stderr.write("{} sequence files checked; {} lack corresponding, "
    "\nannotated `.new` files.".format(num_checked, len(not_annotated)))
sys.stderr.write("\nThe variable `not_annotated` lists the sequences missing annotated files.")
# fnmatch to make a list of files and then do something with related files (see below for how to use
# `glob.glob()` if you just need a list and aren't doing something with the names of the files as encountered).
# (I added an example considering both, for when you do need to iterate on many files and rename, at
# https://discourse.jupyter.org/t/rename-files-using-a-for/17144/2?u=fomightez ) (An example using glob or
# fnmatch to get base file names is at https://www.biostars.org/p/9539595/#9548023 ; in relation to basename,
# keep in mind `.stem` from Path, see https://stackoverflow.com/a/47496703/8508004 )
import os
import sys
import fnmatch
extension_to_handle = ".gff3"
name_part_to_match = "mito.gff3"
associated_mito_noms= []
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        #print (file)
        first_part_filen = file.rsplit(extension_to_handle, 1)[0]
        associated_mito_noms.append(first_part_filen)
# Now delete any files that end in `mito.fa` that are not in the list of the annotation files
extension_to_handle = ".fa"
name_part_to_match = "_mito.fa"
removed = 0
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        first_part_filen = file.rsplit(extension_to_handle, 1)[0]
        if first_part_filen not in associated_mito_noms:
            os.remove(file)
            removed += 1
sys.stderr.write("\n{} files ending in `{}` removed"
    ".".format(removed, name_part_to_match))
# use fnmatch and glob in a notebook to iterate on all `.py` Python script files in a directory and run them, even subsequent ones made by the
# scripts 'dynamically' in the course of running (was to answer a StackOverflow question, see https://stackoverflow.com/a/75087369/8508004 )
import os
import fnmatch
import glob
executed_scripts = []
extension_to_match = ".py"
def execute_script(s):
    %run {s}
while set(executed_scripts) != set(glob.glob(f"*{extension_to_match}")):
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + extension_to_match):
            if file not in executed_scripts:
                execute_script(file)
                executed_scripts.append(file)
# glob use to just get list of file pathnames like part of what is done just above:
import glob
name_part_to_match = "mito.gff3"
associated_files = glob.glob(f"*.{name_part_to_match}")
# recursive search for CSV files in the current directory or subdirectories
csv_files = glob.glob("**/*.csv", recursive=True)
# I didn't find the explanation of `**` in the Python documentation (https://docs.python.org/3/library/glob.html) very clear until I read
# https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/ ,
# but for files with matches to an extension in current directory or sub directories, the example
# code in the documentation is more concise and results in easier to read and use paths.
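# The same recursive search sketched with pathlib, if Path objects are preferred over strings
# (`rglob` matches in the current directory and everything below it):
from pathlib import Path
csv_paths = list(Path(".").rglob("*.csv"))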
# fnmatch use Example with checking for a file it will produce and then running if not there. It will unpack
# a multi-entry FASTA file into a single file for each entry and rename them all to have `.mito.fa` at the end,
# leaving any original FASTA file already there at the start ending in ".fa" alone.
import os
import sys
import fnmatch
example_produced_file = "NCYC3594.mito.fa"
if not os.path.isfile(example_produced_file): # so it won't run again if it already ran
    name_part_to_match = ".fa"
    name_part_to_expand_to = ".mito.fa"
    old_files_with_ext = []
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + name_part_to_match):
            old_files_with_ext.append(file)
    files_to_not_touch_despite_match = old_files_with_ext
    seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa"
    !faidx --split-files {seq_file}
    new_fasta = []
    for file in os.listdir('.'):
        if fnmatch.fnmatch(file, '*' + name_part_to_match) and file not in files_to_not_touch_despite_match:
            new_fasta.append(file)
    # fix name if it needs fixing
    for file in new_fasta:
        if not fnmatch.fnmatch(file, '*' + name_part_to_expand_to):
            new_file_name = file.split(".fa")[0] + name_part_to_expand_to
            !mv {file} {new_file_name}
# fnmatch use Example with reading and modifying the current matched file
import os
import sys
import fnmatch
from Bio import SeqIO # used for reading/writing the FASTA records below
# (assumes `blast_to_df`, `generate_new_name`, and `start_loc` are defined in the parts elided at `#...` / elsewhere in the original notebook)
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*mt.fsa'):
        print(file)
        # !blastn -query {file} -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -out {file[:-9]}x.chrmt.comp.txt
        blast_result = !blastn -query {file} -db S288c.mt.genome.fa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq"
        blast_df = blast_to_df(blast_result.n, pickle_df=False)
        #...
        print(start_loc)
        with open(file) as handle:
            mito_seq = SeqIO.read(handle, "fasta")
        # fix, based on where it says "i.e. shift the starting point on this plasmid," @
        # http://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html
        left = mito_seq[:start_loc] # use one less than what matches '1' in
        # those cases because of zero indexing in Python; gets handled by that
        # subtraction above where `start_loc` was defined
        right = mito_seq[start_loc:]
        adj_mito_seq = right + left
        # write result after fix
        sys.stderr.write("\n\nFile with adjusted 'start' saved as "
            "'{}'.".format(generate_new_name(file),))
        SeqIO.write(
            adj_mito_seq, generate_new_name(file), "fasta");
# Use fnmatch to skip if extension has more beyond what is searched and exclude a specific file
fn_to_check = "pep.fa"
sequences = ""
import os
import fnmatch
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + fn_to_check):
        if not file.endswith(".fai") and file != "DBVPG6044.mt.pep.fa":
            sequences += get_seq_from_multiFASTA_with_match_in_description(
                file, gene_to_match, return_record_as_string=True)
# Use fnmatch to change the names of files with specific extensions (in a subdirectory) to different extensions
#Change name of genome files from ending in `.fas` to ending in `.genome.fa` ; rename files, renaming files
genomes_dir = "genomes"
old_extension = ".fas"
new_extension = ".genome.fa"
import os
import fnmatch
for file in os.listdir(genomes_dir):
    if fnmatch.fnmatch(file, '*' + old_extension):
        !mv {genomes_dir}/{file} {genomes_dir}/{file.split(old_extension)[0]+new_extension}
# Package up a lot of various data sources and output streams (if trying to pack up
# just files, keep in mind using nbzip module or if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
python_data_to_grab = {
'FILE_NAME_TO_GENERATE_A.md':(name_of_listA,"DESCRIPTION_TAG_to_put_on_top_line_of_file:"),
'FILE_NAME_TO_GENERATE_B.md': (name_of_listB,"TAG_to_put_on_top_line_of_file:"),
'FILE_NAME_TO_GENERATE_C.md':(out_stream.stderr,"TAG_to_put_on_top_line_of_file:"),
}
# PYTHON 2.7 VERSION
import sys
import contextlib
data_tag = "some_descriptive_string_here_about_info"
# fix for python 2 based on https://stackoverflow.com/a/44226422/8508004
@contextlib.contextmanager
def redirect_stdout(target):
    original = sys.stdout
    sys.stdout = target
    yield
    sys.stdout = original
for file_name_to_use, py_obj_info in python_data_to_grab.items():
    py_obj, data_name = py_obj_info
    with open(file_name_to_use, 'w') as f:
        with redirect_stdout(f):
            print(data_name + " =")
            print(str(py_obj))
# package up the files
!mkdir pertinent_data
for each_file in python_data_to_grab.keys():
    !mv {each_file} pertinent_data/.
!tar czf pertinent_data_for{set_designation}.tar.gz pertinent_data/
sys.stderr.write("Useful information for the set saved as "
    "`pertinent_data_for{}.tar.gz`".format(set_designation))
# IT'D BE BETTER TO incorporate `%store` in above, I think. see https://stackoverflow.com/a/32731418/8508004 & above here
# identify several files via file names and fnmatch and package up without placing in a directory first
# (see under 'Collecting all the results' in `Annotating mito sequences extracted from XXXX collection with MFannot and converting annotation file to gff3.ipynb` if need example with putting into directory first just using bash shell commands or
# search `!mv {each_file} pertinent_data/.` here for something similar)
archive_file_name = "annotations_for_four_putative_mitos_from332.tar.gz"
import os
import sys
import fnmatch
dl_files = []
name_part_to_match = ".new"
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*' + name_part_to_match):
        #print (file)
        #first_part_filen = file.rsplit(extension_to_handle,1)[0]
        dl_files.append(file)
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("***************************DONE***********************************\n"
    "'{}' generated. Download it.\n"
    "***************************DONE***********************************".format(archive_file_name))
#Note that when I added the `--transform` flag into making a tar, it disrupted the passing of Python into shell commands, and so even after consulting Claude and trying some things,
#I found it easier to just hardcode things in, even if blatantly redundant and breaking DRY, like so:
archive_file_name = "results_for_merged_set.tar.gz"
!tar czf {archive_file_name} --transform 's/^\./results_for_merged_set/' {" ".join(list_of_files)}
#package up several files for download (if trying to pack up just files, keep in mind using nbzip module as alternative if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
# make one file for downloading
archive_file_name = "collected_files.tar.gz"
dl_files = [x + "_tag.fa" for f in file_list]
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("*****************DONE***********************************\n"
"'{}' generated. Download it.\n"
"*****************DONE***********************************".format(archive_file_name))
#package up several files and files made from captured output stream for download, where (keep in mind using nbzip module as alternative if want whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress the example with `tar xf archive.tar`)
# make one file for downloading
archive_file_name = "collected_files.tar.gz"
dl_files = [x + "_tag.fa" for f in file_list]
# save & add the additional information files to collect
output_txt_filename_a = 'seqs_filtered_info.txt'
output_txt_filename_b = 'seqs_filtered.txt'
%store captured_stream_a.stderr >{output_txt_filename_a} #based on https://stackoverflow.com/a/32731418/8508004
%store captured_stream_b.stdout >{output_txt_filename_b} #based on https://stackoverflow.com/a/32731418/8508004
dl_files += [output_txt_filename_a, output_txt_filename_b] # or if really only one, `dl_files.append(filtered_out)`
!tar czf {archive_file_name} {" ".join(dl_files)}
sys.stderr.write("*****************DONE***********************************\n"
"{} generated. Download it.\n"
"*****************DONE***********************************".format(archive_file_name))
# note: based on https://stackoverflow.com/a/32731418/8508004, `%store` was used above to replace the following:
with open(output_txt_filename_a, 'w') as output_handler:
    output_handler.write(captured_stream_a.stderr)
with open(output_txt_filename_b, 'w') as output_handler:
    output_handler.write(captured_stream_b.stdout)
# Package up several dataframes and sequences
#Archive the CTD sequences (FASTA format) collected and any dataframes made
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"CTD_seq_of_protein_orthologs": CTD_seq_df,
"first_heptad_of_protein_orthologs": first_7_df,
"heptads_ofCTD_seq_of_protein_orthologs": repeat_df,
"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df,
"fraction_matching_consensus_per_CTD": fraction_consensus_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json"
import json
with open(CTD_seqs_fn_list_storedfn, 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
# see my useful python snippets for reading json back in
#for ease in aligning or other uses later save the all the CTDs as a concatenated file
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
# !cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} # faster but not as good as awk if files don't already have newlines at end;
# just results in the lines of the files as one long run on that won't work for aligning
!awk 1 {" ".join(ortholog_prot_seqs)} > {cat_fasta_fn} #based on https://stackoverflow.com/a/25030513/8508004
archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn]
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected CTD sequences"
" and tables of details gathered and saved as "
"`{}`.".format(archive_file_name))
# Package up several dataframes and a list of genomes (see just below for only with dataframes)
# Pickle each dataframe and also save as `tsv` for possible use elsewhere
strd_dataframes_fn_list = []
# store `genomes` as json since lighter-weight and more portable than pickling
# for easy json dumping for many list use when archiving:
file_names_for_lists_dict = {
"genomes_list.json":genomes,
}
import json
for fn, lizt in file_names_for_lists_dict.items():
    with open(fn, 'w') as f:
        json.dump(lizt, f)
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"mito_promoter_matches_df": df,
"mito_promoter_hit_num_tallies_by_id_df": largest_hit_num_by_id_df,
"disruptor_matches_df": disrupt_df,
"disruptor_hit_num_tallies_by_id_df": largest_disr_num_by_id_df,
"grich_matches_df": grich_df,
"grich_hit_num_tallies_by_id_df": largest_grich_num_by_id_df,
"endgrich_matches_df": end_grich_df,
"endgrich_hit_num_tallies_by_id_df": largest_endgrich_num_by_id_df,
"twenty_nineATrich_seq_matches_df": twenty_nine_df,
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": largest_ATrich_num_by_id_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
archiving_fn_list = strd_dataframes_fn_list + list(file_names_for_lists_dict.keys())
archive_file_name = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz"
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected dataframes"
" and tables of details gathered and saved as "
"`{}`.".format(archive_file_name))
# for archiving just several dataframes with automated handling
archive_file_name = "dataframes_archived.tar.gz"
strd_dataframes_fn_list = []
def pickle_df_and_store_as_table(dataframe, prefix):
    '''
    Take a dataframe and a filename prefix and save a pickled form of that
    dataframe and a text tabular data version (tab-separated values).
    Returns the names of the pickled and text files.
    '''
    dataframe.to_pickle(prefix + ".pkl")
    dataframe.to_csv(prefix + ".tsv", sep='\t', index=False)
    return prefix + ".pkl", prefix + ".tsv"
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix
# associated as the value
df_n_fn_dict = {
"df": df,
"another_df": another_df,
"yet_another_df": yet_another_df,
}
import pandas as pd
for prefix, dataframe in df_n_fn_dict.items():
    #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix)
    strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix))
archiving_fn_list = strd_dataframes_fn_list
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command
sys.stderr.write("\nCollected dataframes"
" gathered and saved as "
"`{}`.".format(archive_file_name))
# for easy json dumping for many list use when archiving:
file_names_for_lists_dict = {
"annotation_fns.json":annot_fns,
"genome_fnss.json":genomes_for_anot_fns,
"fn_pairings.json":file_pairs,
}
import json
for fn, lizt in file_names_for_lists_dict.items():
    with open(fn, 'w') as f:
        json.dump(lizt, f)
# Use curl to get a FASTA file from OCA and remove the HTML tags (may need `!pip install BS4` first)
# Get FASTA file for the non yeast one
import os
#!curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A
os.system("curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A")
# remove HTML to leave actual FASTA
# based on https://stackoverflow.com/a/21577649/8508004 and https://unix.stackexchange.com/a/64747
import sys
from bs4 import BeautifulSoup
oca_file_to_fix = "1x0t_A.fa"
soup = BeautifulSoup(open(oca_file_to_fix), "html.parser")
for pre in soup.findAll("pre"):
    fasta = pre.contents
%store fasta[0] >{oca_file_to_fix}
# NOTE ABOUT THE READING PART OF THIS NEXT BLOCK: it seems the more modern Pythonic way
# is to leave out the `,'r'` part. See https://stackabuse.com/read-a-file-line-by-line-in-python/ under
# 'Read a File Line-by-Line with a for Loop - Most Pythonic Approach'. Note also that it is
# best to use `.strip()` (or possibly slice `[:-1]`) to remove the line ending if you are going to
# rearrange lines, because you can get a weird merge if you alter the order, since the last line
# usually will not have a newline character.
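# A minimal sketch of the pattern that note describes, before the full example below
# ("example_input.txt" is just a placeholder name):
lines = []
with open("example_input.txt") as input_handler:  # no `,'r'` needed
    for line in input_handler:
        lines.append(line.strip())                # drop the line ending before any reordering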
# add identifiers to each `chr` so results for each strain clear later
chromosome_id_prefix = "chr"
def add_strain_id_to_description_line(file, strain_id):
    '''
    Takes a file and edits every description line to add
    strain_id after the caret.
    Saves the fixed file.
    '''
    import sys
    output_file_name = "temp.txt"
    # prepare output file for saving so it will be open and ready
    with open(output_file_name, 'w') as output_file:
        # read in the input file
        with open(file, 'r') as input_handler: # OR SEE NOTE ABOVE THIS CODE BLOCK ON WHY `, 'r'` ISN'T NEEDED ANYMORE.
            # prepare to give feedback later or allow skipping to a certain start
            lines_processed = 0
            for line in input_handler:
                lines_processed += 1
                if line.startswith(">"):
                    rest_o_line = line.split(">")
                    new_line = ">" + strain_id + rest_o_line[1]
                else:
                    new_line = line
                # Send text to output
                output_file.write(new_line)
    # replace the original file with the edited version
    !mv temp.txt {file}
    # Feedback
    sys.stderr.write("\n{} chromosome identifiers tagged.".format(file))
for s in yue_et_al_strains:
add_strain_id_to_description_line(s+".genome.fa",s)
# A find / replace similar to last example but pure Python (no-IPython magics or shell use)
# (See `testing_repeat_number_by_looping_bendit_analysis.ipynb` for practical use of this to change a script on a loop to monitor effect on outcome)
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s):
    '''
    Change the plot title to the provided text.
    '''
    with open(script_name, 'r') as thefile:
        script = thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE")
# Note for making substitutions: Python now allows you to use f-strings (formatted string literals) to substitute
# variables into strings by name, but Python strings also have 'Template strings' built in (Ex. `import string; t = string.Template('Hello, $name!'); print(t.substitute(name='World'))`)
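# Quick illustration of both of those substitution options:
import string
name = "World"
print(f"Hello, {name}!")                       # f-string (Python 3.6+)
t = string.Template('Hello, $name!')
print(t.substitute(name=name))                 # Template string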
# Collect list of image files in a directory
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs_in_directory = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs_in_directory.append(file)
len(list_imgs_in_directory)
#Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list final file extension , see 'Path.suffix' at
#https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
#https://docs.python.org/3/library/pathlib.html
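# e.g., for a name like `archive.tar.gz` this gives ['.tar', '.gz'] rather than just '.gz'
[item.suffixes for item in Path('.').glob('*')]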
# Collect list of image files in a directory and display them in a Jupyter
# notebook cell
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
from IPython.display import Image
from IPython.display import display
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs.append(Path(img_folder, file))
imgl = [Image(filename=str(x)) for x in list_imgs] #had to cast the
# path object to a string or else `display.py` was giving error
# `'PosixPath' object has no attribute 'split'`;seems `display.py` not able to
# handle path objects yet.
display(*imgl)
# Collect list of image files in a directory and display them in a Jupyter
# notebook cell WITH FILE NAMES SHOWN BELOW EACH
# Run this in notebook that is in directory along with the folder containing
# images, i.e., is in the level above the actual images
import os
import sys
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
from IPython.display import Image
from IPython.display import display
img_folder = "Untitled Folder"
img_file_extensions = [".png",".jpg",".jpeg"]
list_imgs = []
for file in os.listdir(img_folder):
    #print (file)
    if Path(file).suffix in img_file_extensions:
        list_imgs.append(Path(img_folder, file))
for i in list_imgs:
    display(Image(filename=str(i)))
    print("ABOVE: {}".format(i.name))
#slide carousel-like example to show a subset of images that changes every five seconds (from `demo_palette.ipynb` in pymol-binder) with HTML labels for each image to make the text stand out:
import IPython.display as ipd
import time
import os
import sys
import random
def display_subset():
    img = {}
    for x in random.sample(range(shuffles_to_do), 3):
        img[x] = ipd.Image(filename="img_{}.png".format(x))
        ipd.display(img[x])
        ipd.display(ipd.HTML('ABOVE:&nbsp;<font size=5><b>img_{}.png</b></font>'.format(x)))
    time.sleep(5)
    ipd.clear_output(wait=True)
while True:
    display_subset()
# Subset / restrict to a random sampling of items in a list , based on https://pynative.com/python-random-sample/
# Good for doing right before EVERYTHING GETS PROCESSED to pick a subset for testing, instead
# of defining specifically
import random
genomes = random.sample(population=genomes, k=15)
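# If the test subset should be reproducible across runs, seed the generator first
# (a small sketch; the seed value is arbitrary):
import random
random.seed(42)
genomes = random.sample(population=genomes, k=15)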
# Run a function every 8 minutes
%load https://gist.githubusercontent.com/fomightez/b012e51ebef6ec58c1515df3ee0c850a/raw/300da6c67ceeaf5384a3e500648b993345c361cb/run_every_eight_mins.py
# RELOAD for when you are using `from python_file_containing_function import a_function` (Python 3)
# Reload a function into a notebook after editing the script file in editor of running session;
# this allows calling the function in the notebook whereas if just reload the script won't
import importlib
import python_file_containing_function; importlib.reload(python_file_containing_function); from python_file_containing_function import a_function
# above line from https://stackoverflow.com/a/11724154/8508004
# RELOAD for when you are using `import python_file` (Python 3)
# Reload a script into a notebook after editing the script file in editor of running session;
# note it is much more easily done then the case where using `from foo import foo`, but
# `from foo import foo` makes it easier to work in a notebook in many ways.
import importlib; import python_file; importlib.reload(python_file)
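# A related option is IPython's autoreload extension, which re-imports edited modules
# automatically before each cell runs (run once near the top of the notebook):
%load_ext autoreload
%autoreload 2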
# Create a download link in Jupyter notebook; from
# https://medium.com/ibm-data-science-experience/how-to-upload-download-files-to-from-notebook-in-my-local-machine-6a4e65a15767
# <-- Haven't tried it yet but it might be handy
# for an idea I am working on for making animations from pymol files using jmol, or anywhere I suggest
# downloading an archive of results
import base64
from IPython.display import HTML
def create_download_link(df, title="Download CSV file", filename="data.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)
create_download_link(df)
# For handling archive files to make a clickable download link, I found the section 'Create and download CSV/zip file' at https://blog.softhints.com/jupyter-ipython-download-files/ ; however, the code seems incomplete as I don't see how they make the zip file in conjunction with sending it through as the payload. (I assume `create_download_files()` was triggered elsewhere already.) And a minor thing: why not return `HTML(html)` in that code block?
# Maybe Some of the answers here might help me reverse that Zipfile approach so it works to download to local?
# https://stackoverflow.com/questions/5710867/downloading-and-unzipping-a-zip-file-without-writing-to-disk
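# One way to sketch that idea without writing the archive to disk first: build the zip in memory
# and embed it in a data-URI link (untested sketch; "data.csv" / "data.zip" are placeholder names):
import base64
import io
import zipfile
from IPython.display import HTML
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write("data.csv")   # add whatever files belong in the archive
payload = base64.b64encode(buffer.getvalue()).decode()
HTML('<a download="data.zip" href="data:application/zip;base64,{}" target="_blank">Download zip</a>'.format(payload))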
# Related to the topic of making downloadable links from Jupyter pages, I found https://stackoverflow.com/questions/26497912/trigger-file-download-within-ipython-notebook
# and
# https://stackoverflow.com/questions/24437661/retrieving-files-from-remote-ipython-notebook-server/24439480#24439480 about
# FileLink / FileLinks; however, in JupyterLab if it is a gif or png that JupyterLab renders, it opens it in the application
# instead of allowing download. And if it is a tarball that it doesn't render and you click on it, instead of offering to download
# it, it says it isn't UTF-8 encoded.
# Fortunately when in Voila apps, you can list the files with the following:
from IPython.display import FileLink, FileLinks
FileLinks(".")
# And in VOILA those can be right clicked on and downloaded to local drive from those links using `Save link as..`.
# However, a better, related solution for in Voila, because it makes a pop-up automatically without needing the user to use `Save link as..`, is:
%%html
<a href="SVM_Confusion_Matrix.jpg" download="SVM_Confusion_Matrix.jpg">Click HERE to Download SVM image</a>
# Using Panel (installable via pip) in a notebook (NOT VOILA) you can make a download file, too:
import panel as pn
pn.extension()
# Create option to download SVM Confusion Matrix Graphic
pn.widgets.FileDownload(
    file="SVM_Confusion_Matrix.jpg",
    embed=False,
    name="Save SVM Confusion Matrix image"
)