Useful Python snippets
# These are meant to work in both Python 2 and 3, except where noted.
# See my useful_pandas_snippets.py for those related to dataframes (such as pickling/`df.to_pickle(save_as)`)
# https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4
# also see https://gist.github.com/fomightez/324b7446dc08e56c83fa2d7af2b89a33 for examples of my
# frequently used Python functions and slight variations for more expanded, modular structures.
#argparse
# good snippet collection at https://mkaz.tech/code/python-argparse-cookbook/
# positional
parser.add_argument("input", help="Name of the file that was \
generated by other program \
when run with your transcriptome of interest.", metavar="INPUT_FILE")
# with optional positional
parser.add_argument("input", nargs='?', help="**OPTIONAL**Name of the file \
generated by other program \
when run with your transcriptome of interest. Usually, this is \
'"+input_file_name_default+"' &\
if no input file name is provided then this will be used by \
default.", default=input_file_name_default, metavar="INPUT_FILE")
# Note see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004
# for why not using `argparse.FileType` approach here.
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because placed under positional I added clarifying text to help
# description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND YOU DON'T NEED/WANT THE OUTPUT
# FILE TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP THE APPROACH OF NOT
# USING `argparse.FileType` AND OF USING `with open`, AS THAT IS CONSIDERED MORE PYTHONIC.
# With a list where you won't know the exact size of the list, i.e., it could be just one item (Example from `plot_expression_across_chromosomes.py`; see `donut_plot_with_subgroups_from_dataframe.py` for another)
parser.add_argument('-chr', '--chrs', action='store', type=str,
help="use this flag to limit plotting of the data to particular \
chromosomes or scaffolds you specify immediately following this flag. \
Separate the chromosome or scaffold identifiers by commas, without spaces. \
Example use in a command is `--chrs I,IV,XVI`. \
Default when this optional flag is not called is to plot the data for all \
chromosomes or scaffolds. ") # based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
# ; specifically, https://stackoverflow.com/a/24866869/8508004
#...
if args.chrs:
    if "," in args.chrs:
        limit_to_chrs = args.chrs.split(',')
    else:
        # means only one item
        limit_to_chrs = [args.chrs] # has to be a list for passing to Pandas `isin()`
# with flag parameters
parser.add_argument('-sa', '--save_as', action='store', type=str,
default= generate_output_file_name(previous_pickled_df), help="Use \
this option to supply a name of \
the file to save for storing produced dataframe. If none is provided, \
the name \'"+generate_output_file_name(previous_pickled_df)+"' will be \
used. To force nothing to be saved, enter \
`-sa no_output` without quotes as output file (ATYPICAL).")
# with choices
# see https://stackoverflow.com/a/35970231/8508004
# or https://stackoverflow.com/a/15301183/8508004
# or https://stackoverflow.com/questions/40324356/python-argparse-choices-with-a-default-choice
parser.add_argument("-og", "--output_grouping", type=str,
default= "single", choices=["single", "separate", "both"],
help="OPTIONAL: Specify grouping of output with this option. Choose \
`-og single` for one table or dataframe for all categories. Or choose \
`-og separate` for a separate table or dataframe for each category. \
Or specify `-og both` to output both types. \
If this option is not specified, {} will be used.".format("single"))
# Now DISFAVORED approach for reading in files. (Disfavored because use of `with` is favored and `argparse.FileType` is not compatible with that. So only use this when really in a hurry to scrape something together; see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004)
parser.add_argument("MTA", help="Name of file containing data. REQUIRED.", type=argparse.FileType('r'), metavar="FILE")
# I would also like to trigger the help display if no arguments are provided, because at least one input file is needed
if len(sys.argv)==1: #from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
the_file = args.MTA
# open input file and start reading
sys.stderr.write("\nReading input file...")
#input_file_stream = open(the_file, "r") # Don't need separate open when use `type=argparse.FileType`. It sets everything up automatically and you will actually cause errors if try to open when already open.
for line in the_file:
    pass
# assert
assert len(numbers_given_for_start_n_end) == len(aln_ids_in_out_order), (
    "The user-supplied list must be equal in length to the list of data "
    "previously supplied as 'aln_ids_in_out_order'.")
# Verify that values in dictionary are equal in length using `count` method of Python lists
lengths_of_sequences = [len(v) for v in sequence_dict.values()]
assert lengths_of_sequences.count(
    lengths_of_sequences[0]) == len(lengths_of_sequences), (
    "The length of all parsed sequences should be the same.")
# that assertion test uses Ivo van der Wijk's solution from
# https://stackoverflow.com/a/3844948/8508004
# Getting hex color codes and RGB values from out of Seaborn color palettes / show colors
# based on https://stackoverflow.com/questions/38249454/extract-rgb-or-6-digit-code-from-seaborn-palette
# WORKS IN A JUPYTER NOTEBOOK CELL
import seaborn as sns
#num_shades = 8
#sns.palplot(sns.cubehelix_palette(num_shades))
pal = sns.color_palette("RdBu_r")
print(pal.as_hex())
print(pal)
sns.palplot(sns.color_palette("RdBu_r"))
sns.palplot(sns.diverging_palette(5, 250))
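# if you want 0-255 integer RGB values instead of the 0-1 floats that `print(pal)`
# shows, a minimal conversion sketch (plain Python, not a seaborn API call):
print([tuple(int(round(255 * channel)) for channel in rgb) for rgb in pal])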
# Check if string can be cast to number and cast (example from `donut_plot_with_subgroups_from_dataframe.py`)
def is_number(s):
    '''
    check if a string can be cast to a float or numeric (integer).
    Takes a string.
    Returns True or False
    fixed from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    later noted similar code is at https://code-maven.com/slides/python-programming/is-number
    '''
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def cast_to_number(s):
    '''
    Cast a string to a float or integer.
    Tries casting to float first, and if that works then it tries casting the
    string to an integer. (I thought I saw a suggestion of that order somewhere
    when searching for what I used as the `is_number()` check but cannot find
    the source right now.)
    Returns a float, an int, or, if it fails, False. (Where I am using this, it
    shouldn't ever trigger returning `False` because everything was checked as
    convertible first.)
    based on fixed code from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    '''
    try:
        number = float(s)
        try:
            number = int(s)
            return number
        except ValueError:
            pass
        return number
    except ValueError:
        pass
    try:
        import unicodedata
        num = unicodedata.numeric(s)
        return num
    except (TypeError, ValueError):
        pass
    return False
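# quick usage sketch for the two functions above:
print(is_number("3.14"), cast_to_number("3.14")) # True 3.14
print(is_number("42"), cast_to_number("42"))     # True 42
print(is_number("spam"))                         # False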
# count frequency
def count_frequency_in_list(l):
    '''
    takes a list and returns a dictionary of the counts of the items in that
    list
    based on https://stackoverflow.com/a/2162045/8508004
    '''
    import collections
    return collections.Counter(l)
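# quick usage sketch:
print(count_frequency_in_list(['a', 'b', 'a'])) # Counter({'a': 2, 'b': 1})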
clusters_nofeat = clusters_nofeat_df["Acluster(+)"].tolist() + clusters_nofeat_df["Acluster(-)"].tolist()
# split up those that contain more than one, i.e., separated by comma, and strip whitespace;
# expanded list comprehension from https://stackoverflow.com/a/27886807/8508004
clusters_nofeat = [x.strip() for xs in clusters_nofeat for x in xs.split(',')]
# remove the blank ones/empty ones (this is how you remove empty strings from a list); it can also be used to remove blank lines from splits on `"\n"`
clusters_nofeat = [x for x in clusters_nofeat if x]
# remove blank lines & lines that are just tabs and empty spaces (this happens to be from a different example than most of these 'remove' examples in this section)
input_data = [x for x in input_data.split("\n") if x.strip()]
# remove duplicates
clusters_nofeat = set(clusters_nofeat)
# remove the dash and everything after to count frequency
clusters_nofeat = [x.split("-")[0] for x in clusters_nofeat]
# count frequency
counts=count_frequency_in_list(clusters_nofeat)
clusters_nofeat
# next line based on https://stackoverflow.com/questions/2161752/how-to-count-the-frequency-of-the-elements-in-a-list#comment46593992_2162045
# ; otherwise get `Counter()` type
dict(counts)
from collections import Counter
most_common,num_most_common = Counter(some_list).most_common(1)[0] # based on
# https://stackoverflow.com/a/6987358/8508004
#unique items in list that occur more than two times (or some other number you specify)
the_count = Counter(some_list)
print ([k for k, v in the_count.items() if v > 2]) # based
# on https://stackoverflow.com/a/26773120/8508004 and
# https://stackoverflow.com/a/30418498/8508004 , to work in 2.7 and 3
# incrementing count by instance while iterating on a list (could be applied to a dataframe column too)
# based on https://stackoverflow.com/a/1692428/8508004
from collections import defaultdict
l = ['apple','cherry','apple','cherry','cherry','pear']
d = defaultdict(int)
for x in l:
    d[x] += 1
    print ("That is {} #{}".format(x,d[x]))
print("final counts: {}".format(dict(d)))
# count the number of non-overlapping occurrences of a substring
num_sequences = file_listing_text.count('.fa')
# combining counting and regular expressions (regex)
# count frequency of blocks of Ns in a string (presumably sequence)
import re
from collections import defaultdict, Counter
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adapted from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
    len_match_dict[len(m.group())] += 1
    matches.append(m.group())
print(len_match_dict)
print(Counter(matches))
#copy a list
# Important if you'll be iterating on it and modifying it at the same time. Modify the copy or iterate on the copy, but not both.
# Also useful if you want to start a new list with the contents of an old one. If you just do `new_list = old_list`, that just copies
# the reference to the list, see https://stackoverflow.com/a/2612815/8508004 . That's probably not what you want.
new_list = old_list.copy()
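# quick demonstration of the reference-vs-copy distinction described above:
old_list = [1, 2, 3]
alias = old_list            # same underlying list object
independent = old_list.copy()
old_list.append(4)
print(alias)                # [1, 2, 3, 4] <- changed along with old_list
print(independent)          # [1, 2, 3]   <- unaffected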
# Split a list into chunks with also collecting any remainder group
# from https://stackoverflow.com/a/312464/8508004 ===> UPDATE For 2023 onward: in version 3.12, Python added `itertools.batched()` that chunks, see https://twitter.com/patloeber/status/1613190998389030913
def chunks(a_list, n):
    """Yield successive n-sized chunks from list."""
    for i in range(0, len(a_list), n):
        yield a_list[i:i+n]
print(list(chunks(list(range(10, 75)), 10))) # the inner `list()` keeps the output the same in Python 2 and 3 (slicing a bare `range` in Python 3 yields range objects)
# also see [`more_itertools.chunked()`](https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.chunked) that I learned of from https://twitter.com/PeakSquirrel/status/1574893067693215744
# there is another approach with list comprehension for a string in this gist that is based
# on https://stackoverflow.com/a/13673133/8508004 and looks similar to list comprehension one for
# lists at https://stackoverflow.com/a/32467096/8508004
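# minimal sketch of the Python 3.12+ `itertools.batched()` mentioned above; note
# it yields tuples rather than lists:
from itertools import batched  # requires Python 3.12+
print(list(batched(range(10, 75), 10)))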
# RELATED: chunking with step/stride of certain amount to produce overlap. For example to reproduce process
# described in this figure legend: " The data were calculated from 150-base pair windows with 50 base pairs of overlap between adjacent windows and plotted ..."
a = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
chunk_size = 10
step_size = 5
def chunk_string_with_different_step(string, chunk_size, step_size):
    """Return a list of n-sized chunks from string of letters."""
    return [string[i:i+chunk_size] for i in range(0, len(string),step_size)]
chunk_string_with_different_step(a,chunk_size,step_size)
'''
#RESULT:
['adhaskjhdj',
'kjhdjkashd',
'kashdajksh',
'ajkshdkjas',
'dkjashdasl',
'hdaslallls',
'alllslslsl',
'lslslsaash',
'saashdahs',
'dahs']
'''
# AS GENERATOR:
b = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
def gen_chunk_string_with_different_step(a_list, chunk_size, step_amount):
    """Yield successive n-sized chunks from list, stepping/striding by step_amount."""
    for i in range(0, len(a_list), step_amount):
        yield a_list[i:i+chunk_size]
print (list(gen_chunk_string_with_different_step(b,chunk_size,step_size)))
'''
#RESULT:
['adhaskjhdj', 'kjhdjkashd', 'kashdajksh', 'ajkshdkjas', 'dkjashdasl', 'hdaslallls', 'alllslslsl', 'lslslsaash', 'saashdahs', 'dahs']
'''
# Debugging
# Insert the following into a script at the point you'd like to debug to bring up an interactive IPython console
# where you can query the status of defined variables:
from IPython import embed; embed()
#dictionary
for key in d:
    pass
for key, value in d.items():
    pass
if key in d:
    pass
# some example dictionary comprehensions (general idea: d = {k:v for k,v in a.items()}) are at
# https://stackoverflow.com/questions/1031851/python-best-way-to-exchange-keys-with-values-in-a-dictionary
# in answer to "Python: Best Way to Exchange Keys with Values in a Dictionary?"
# merge two dictionaries --> see https://www.geeksforgeeks.org/python-merging-two-dictionaries/
d = {**dict1, **dict2} # note: `dict2.update(dict1)` updates dict2 in place and returns None, so don't assign its result
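# minimal illustration (hypothetical dictionaries) that with `{**dict1, **dict2}`
# the right-most dictionary wins on key collisions:
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
print({**dict1, **dict2}) # {'a': 1, 'b': 3, 'c': 4}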
# do-while loop (https://stackoverflow.com/a/1662176/8508004)
while True:
    do_something()
    if condition():
        break
# get file extension from file name
# Now I'd use Pathlib, see https://docs.python.org/3/library/pathlib.html `Path(filename_n_path).suffix`
#; for main part using `Path(filename_n_path).stem` see more about Pathlib use in this document under 'Pathlib' below
import os
def generate_output_file_name(file_name,suffix_for_saving):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. The generated name is based on the original file
    name.

    Specific example
    =================
    Calling function with
        ("sequence.fa", "_col")
    returns
        "sequence_col.fa"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) # from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    if '.' in file_name:  # I don't know if this is needed with the os.path.splitext method but I had it before so left it
        return main_part_of_name + suffix_for_saving + file_extension
    else:
        return file_name + suffix_for_saving + ".fa"
# Floor
import math
x = int(math.floor(10.4)) #outer `int()` typecast in Python 3 is redundant
print (x)
# outer typecast insures same result (an integer) is returned in both
# Python 2 and 3 since Python 2 floor returns a float
# Note: could get the same result by typecast to integer alone, but use of 'floor' makes it more explicit as to what was sought.
# Get HTML / URL in Python 2 or 3
# Getting html originally for just Python 3, adapted from
# https://stackoverflow.com/a/17510727/8508004 and then updated from to
# handle Python 2 and 3 according to same link.
# (snippet with bonus Python 2 and 3 compatible variable unpacking and unicode decoding)
url = "http://www.example.org"
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
html = urlopen(url)
chrom_and_length = {} # initialize the dictionary the loop below fills in
for line in html.read().splitlines():
    #name, chrom_len, *_ = line.strip().split()
    # that elegant unpack above is based on
    # https://stackoverflow.com/questions/11371204/unpack-the-first-two-elements-in-list-tuple
    # , but it won't work in Python 2. From same place, one that works in 2:
    name, chrom_len = line.strip().split()[:2]
    chrom_and_length[name.decode(
        encoding='UTF-8')] = chrom_len.decode(encoding='UTF-8')
# generate names for sub-groups automatically, à la `subset_a`, `subset_b`, etc.
first_group_suffix = "a"
groupings = ["subset"+ chr(ord(first_group_suffix) + x) for x in range(groups_to_make)]
# `chr(ord(first_group_suffix) + x)` part of line above based on https://stackoverflow.com/a/2156898/8508004
# greater than or equal and less than or equal
assert seq_step_size <= shortest_feature_len, ("problem")
# >= for greater than or equal
# <= for less than or equal
# related use in interval comparison:
if 10000 <= number <= 30000:
    pass
# check if any items in two lists are shared (any overlap / overlapping?)
bool(set(a) & set(b))
#list comprehension if and if-else conditionals
[x+1 if x >= 45 else x+5 for x in l]
# FOR JUST IF:
#"The if should be after the for (unless it is in an if-else ternary operator)
[y for y in a if y not in b]
#This would work however:
[y if y not in b else other_value for y in a]
#from https://stackoverflow.com/a/15474969/8508004
#My edited version of "just if" variation:
[x for x in list_for_order if x in b]
# List comprehension to add two items for each initial item in a list
# This example adds the number in the list plus the next value in the series
# for each item in a list.
#based on Games Brainiac comment at https://stackoverflow.com/a/19466238/8508004 and DSM's
# comment at https://stackoverflow.com/a/11869360/8508004
l = [1,2,4]
[item for x in l for item in [x, x+1]] # results in `[1, 2, 2, 3, 4, 5]`
# see related information about `s.shift()` for getting a row and the next one in pandas snippets
# plus see some nice list comprehensions in the 'count frequency' section above
# for the `count_frequency_in_list()` function
# plus there are these:
#remove all the blanks, i.e. `''` entries, in the sub-lists, and then
element_record[0] = [x for x in element_record[0] if x]
element_record[1] = [x for x in element_record[1] if x]
#join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join(element_record[0])
element_record[0] = ", ".join(element_record[1])
# -or- in one step, DO BOTH
#remove all the blanks, i.e. `''` entries, in the sub-lists, and then
#join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join([x for x in element_record[0] if x])
element_record[1] = ", ".join([x for x in element_record[1] if x])
# make a copy of a list with a specific item removed
first_characters_wo_candidate = [x for x in first_characters if x != basic_tag] #based
# on https://stackoverflow.com/a/25004389/8508004 (this will remove ALL occurrences);
# I couldn't find a way to both make a copy of the list and remove the first instance of that
# item using `.remove(item)` using Python 2.7. Always had to copy list and then use remove. Although
# potentially this looks like one line if have numpy already imported --> https://stackoverflow.com/a/50313691/8508004
# " one-liner providing both the value of the minimum as well as the first index where it is realized"
# from https://coderwall.com/p/a9hvrg/index-of-minimum-element-of-a-list where describes
# works because the default order of tuples in Python is lexicographical.
# Approach works for `max` (maximum) too.
# Needed slight adjusting to not use xrange for Python 3/2 compatibility.
l = [33, 788, 1, -14, 78, 11, 32, 11, 78, -1, -14]
mn,idx = min( (l[i],i) for i in range(len(l)) )
mn,idx
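# the `max` (maximum) variant mentioned above works the same way:
mx,idx_mx = max( (l[i],i) for i in range(len(l)) )
print(mx, idx_mx) # 788 1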
# next() with itertools
from itertools import cycle
#...
colors = (['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'xkcd:magenta', 'xkcd:orange',
'tab:gray','tab:pink'])
#...
colors = cycle(colors)
#...
color = next(colors)
# works in 2 and 3, see https://stackoverflow.com/questions/5237611/itertools-cycle-next
# Dealing with detecting if something was in region/ interval
total_feature_levels = max([v["level"] for v in features_dict.values()])
#...
loc_intvls_of_features = {(
    v["start"],v["end"]): k for k,v in features_dict.items()} #dict of the intervals
# (keys) of sequence where features(values) are located
#...
# Use `loc_intvls_of_features` and get list of features where the
# intervals contain what corresponds to the current column. Three
# categories: at_start, in_mid_interval, at_end. Could sort later
# for coordinating stylistic settings but might as well do now
# since all information at hand.
at_start=[]
in_mid_interval = []
at_end = []
for interval,feature in loc_intvls_of_features.items():
    if interval[0] == curr_residue_number:
        at_start.append(feature)
    elif interval[0] < curr_residue_number < interval[1]:
        in_mid_interval.append(feature)
    elif interval[1] == curr_residue_number:
        at_end.append(feature)
# Next if there are contents in the feature lists, cycle through them
# stylizing table cells of current column appropriately.
for feature in at_start:
    stylize_start_features(feature, column, features_part)
for feature in in_mid_interval:
    stylize_mid_features(feature, column, features_part)
for feature in at_end:
    stylize_end_features(feature, column, features_part)
def overlap_exists(a, b):
    '''
    takes two intervals and returns whether they overlap. The intervals would
    be defined by tuples of (start, stop).
    Examples:
    overlap_exists([20, 38],[1, 125])
    > True
    overlap_exists([10, 15], [20, 38])
    > False
    overlap_exists([10, 25], [20, 38])
    > True
    overlap_exists([10, 25], [25, 38])
    > True
    based on https://stackoverflow.com/a/2953979/8508004 and the fact 0 is the
    same as False and anything else is True.
    Modified to add the +1 because I want to be inclusive, so even one
    shared basepair counts as an overlap, as the last example shows.
    '''
    return bool(max(0, (min(a[1], b[1]) - max(a[0], b[0])+1)))
# left justify by adding spaces. Really useful when creating multiple sequence alignments to control which columns things show up in.
# based on https://stackoverflow.com/a/5676676/8508004
mviewlines_dict[i][0] = mviewlines_dict[i][0].ljust(len_longest_id)
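# self-contained illustration (hypothetical ids) of padding so a following
# column lines up:
len_longest_id = len("seq_identifier_B")
for seq_id in ["seqA", "seq_identifier_B"]:
    print(seq_id.ljust(len_longest_id) + " ATGCATGC")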
# Minimum size of string in a list
print(len(min(list_o_strings, key=len))) # based on https://stackoverflow.com/a/7228951/8508004 ; parentheses added so it works in Python 3 too
# operating system/shell dealings
# os.remove() to delete a file
# make a directory
import os, errno
try:
    os.makedirs(directory)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
# move a file
import shutil
shutil.move("path/to/current/file.foo", "path/to/new/destination/for/file.foo")
#for more see [Replacing Bash Scripting with Python](https://github.com/ninjaaron/replacing-bash-scripting-with-python)
#see `fnmatch` examples in the useful notebook snippets gist.
# see `glob.glob` examples of matching file extensions and filenames, etc., in the useful notebook snippets gist.
# Pathlib
# in addition to this see `pathlib` examples in useful notebook snippets gist.
#Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list final file extension, see 'Path(filename_n_path).suffix' at
# https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
# https://docs.python.org/3/library/pathlib.html
# main part without the extension is 'Path(filename_n_path).stem'
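# minimal sketch of the `suffixes` and `stem` attributes noted above:
print(Path('archive.tar.gz').suffixes) # ['.tar', '.gz']
print(Path('archive.tar.gz').stem)     # 'archive.tar'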
# Perform a function or carry out a calculation a certain percent of the time (i.e., with a random frequency)
import random
if random.random() < 0.8:
    do_something()
# example using it:
def get_unique_tag(string, length_of_tags):
    '''
    Takes a string and generates a unique tag of provided length
    related to the provided string.
    Alphanumerics in tag will be limited to those in the provided
    string. Although the same order is favored, there
    is a chance of returning characters out of order using
    shortuuid.
    shortuuid from https://github.com/skorokithakis/shortuuid
    '''
    import random
    if random.random() < 0.95:
        chunk_size = length_of_tags
        chunks = [string[i:i+chunk_size] for i in range(0, len(string),chunk_size)] #based on
        # https://stackoverflow.com/a/13673133/8508004 or https://stackoverflow.com/a/9475354/8508004
        '''
        # WRITTEN AS A FUNCTION
        def chunk_string(string, chunk_size):
            """Return a list of n-sized chunks from string of letters."""
            return [string[i:i+chunk_size] for i in range(0, len(string),chunk_size)]
        '''
        # discard chunks too short
        chunks = [x for x in chunks if len(x)== length_of_tags]
        import random
        return random.choice(chunks).lower()
    else:
        from shortuuid import set_alphabet, uuid # shortuuid provides these at module level
        set_alphabet(string)
        return uuid()[:length_of_tags].lower()
# Pickle Python objects for storage
# for simple objects json suggested by Martijn Pieters at https://stackoverflow.com/q/25464295/8508004
# as lighter-weight and more portable than Python's pickling. (Same page has a similar pickling
# example actually too under Mike McKerns' answer.) Good for a list of strings, for example:
# Save as json
import json
with open('filename_list.json', 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
#Read as json
import json
with open('filename_list.json', 'r') as f:
    filename_list = json.load(f)
# Subset to a random sampling of items in a list , based on https://pynative.com/python-random-sample/
import random
genomes = random.sample(population=genomes, k=15)
# Read and write to a file
# prepare output file for saving so it will be open and ready
# NOTE ABOUT THE READING PART OF THIS: the more modern Pythonic way seems
# to be to leave out the `,'r'` part. See https://stackabuse.com/read-a-file-line-by-line-in-python/ under
# 'Read a File Line-by-Line with a for Loop - Most Pythonic Approach'. Note also that it is
# best to use `.strip()` or possibly slice `[:-1]` to remove the line ending if going to
# rearrange lines, because you can get a weird merge if you alter the order, since usually the last
# line will not have a newline character. Also see https://stackoverflow.com/a/3277516/8508004 for code that will read the entire file
# into memory, or line by line, and remove all whitespace characters (newlines and spaces) from the end of each line.
with open(output_file_name, 'w') as output:
    # read in the input file; REMEMBER IT IS BEST NOT TO USE `with open(input_file_name, 'r') as input:` BECAUSE `input()` is something in Python you may want to use in your script later
    with open(input_file_name, 'r') as input_handle: # OR SEE NOTE ABOVE ON HOW YOU DON'T NEED `, 'r'` anymore.
        # prepare to give feedback later or allow skipping to a certain start
        lines_processed = 0
        for line in input_handle:
            lines_processed += 1
            # This gff3 doesn't have pertinent lines until line 10
            if line.startswith("#") or lines_processed < 9:
                # Send text to output
                output.write(line)
            else:
                info = line.split("\t")
                #print(info)
                info[3] = str(adjust_pos(int(info[3]),ATP6_start_pos, chromosome_length))
                info[4] = str(adjust_pos(int(info[4]),ATP6_start_pos, chromosome_length))
                #print (info) # ONLY FOR DEBUGGING
                # Send text to output
                output.write(("\t").join(info))
# Feedback
sys.stderr.write("Positions were changed to match ATP6 as start "
    "and saved in '{}'.".format(output_file_name))
# Read entire file into memory at once (bonus: example removes linebreaks to make one long string) / Read all of file at once
with open('data.txt', 'r') as myfile:
    data=myfile.read().replace('\n', '')
# Replace text in a file, combining much of the above approaches
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s, script_name):
    '''
    Change the plot title to the provided text in the file designated with `script_name`
    '''
    with open(script_name, 'r') as thefile:
        script=thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE", script_name)
# Sort / Sorting
# FOR PYTHON FOR GENOMIC DATA SCIENCE I WAS LOOKING TO SORT A REPRESENTATION OF A DICTIONARY BASED ON VALUES
# see http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
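# with the example dictionary above, sorted_x is [(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]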
#Later I found the same works without need for operator (and is known to work in Python 3):
sorted_keys = sorted(my_dict, key=my_dict.get) #based on https://stackoverflow.com/a/37270275/8508004
# to sort keys from dictionary based on value
# see my file `sorting out sorting on attributes within lists using keys.md` for more along these lines
# StringIO
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import pandas as pd
df = pd.read_table(StringIO(Input), header=0, delim_whitespace= True)
# verified for both Python 2 and 3
# stderr.write
import sys
sys.stderr.write("\n\nThe dataframe was not stored for use elsewhere "
"because `no_output` was specified in place of the output file name.")
sys.stderr.write( "\n\nThe dataframe has been saved as a file in a "
"manner where other Python programs can access\nthe created "
"dataframe (pickled).\n"
"The dataframe is stored as '{}'".format(out_name))
# String formatting
# my fav resources are https://pyformat.info/ and https://mkaz.blog/code/python-string-format-cookbook/
## Example with named placeholders
next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
"\t{length}\tblack".format(
species_code=species_code,chrom=chrom, length=length))
# see https://mkaz.blog/code/python-string-format-cookbook/ for percentage and Exponent notation,
# although I tend to prefer capital E & that can easily be done by replacing the lowercase
# one that is shown with upper case like:
print("{:.2E}".format(3.1415926)) #results in `3.14E+00`
print("{:.3E}".format(602213969921133261473164)) #results in `6.022E+23`
print("Half is {:.2%}".format(0.5000000)) #`.2%` limits to two decimals
# ternary operator if-or conditional (a if condition else b) for setting a variable
direction_string = "positive" if direction > 0 else "negative"
# Try except
try:
    return urlopen(url).read()
except HTTPError as e: # needs `from urllib.error import HTTPError` (Python 3) or `from urllib2 import HTTPError` (Python 2)
    #print(e.code)
    #print(e.msg)
    return "HTTPError"
# write to file
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
    for indx,(chrom,length) in enumerate(chromosomes_and_length.items()):
        next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
            "\t{length}\tblack".format(
            species_code=species_code,chrom=chrom, length=length))
        if indx < (len(chromosomes_and_length)-1): # don't add a newline character to the last line
            next_line += "\n"
        # Send the built line to output
        output_file.write(next_line)
# provide feedback
sys.stderr.write( "\n\nThe karyotype file for {} chromosomes has been saved "
"as a file named"
" '{}'.".format(len(chromosomes_and_length),output_file_name))
# https://twitter.com/bbelderbos/status/1534784930005278720. June 2022
#>"Once you have a pathlib.Path object you don't have to use a context manager to write to a file, it already comes with write_bytes() / write_text() methods that can do this."
filepath.write_text(resp_txt) # `write_text()` takes a single string; `resp_txt` here stands for the text to write
# also see 'Read and write to a file' above
# encoding and decoding a string as base64, based on https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/ (note: could also use zlib (see https://stackoverflow.com/a/29243206/8508004) but it seemed to make things take up more space/look messier in the script than the original table, whereas base64 seemed a little less expansive and cleaner.)
# The idea of doing this was to make a table take up fewer lines of a script when hardcoding what would normally be data in a separate tsv file into a script. So hardcode it in, but in fewer lines.
import base64
message = wt_bendit_result
message_bytes = message.encode('ascii')
base64_bytes = base64.b64encode(message_bytes)
base64_message = base64_bytes.decode('ascii')
#base64_message is the string as base64
#UNDO CONVERSION
undecodedbase64 = base64.b64decode(base64_message)
# use `undecodedbase64 = undecodedbase64.decode()` after that if you need it
# as a string when using `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` to
# hardcode into a script, because `b64decode()` alone will produce bytes from
# `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''`.
print(base64_message)
# Suppressing stdout / stderr when using an imported function or subprocess, etc.
# Vaguely similar to using `%%capture` to hush noisy output / code in notebooks.
# This is very useful when you are importing a function or running internal code as a subprocess and what it says in its stderr feedback may be wrong or moot because of the way you are processing what the function returns.
# based on https://stackoverflow.com/a/52442331/8508004
# also see `bendIt_analysis.py` where I do it slightly differently (with `with io.capture_output() as captured:`)
# within a Python script because running in a jupyter environment.
from contextlib import contextmanager,redirect_stderr,redirect_stdout
from os import devnull
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager that redirects stdout and stderr to devnull.
    From https://stackoverflow.com/a/52442331/8508004
    """
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)
#...
with suppress_stdout_stderr():
    pm_df = patmatch_results_to_df(result)
# Python version of curl is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c
# Python version of checking software to run in shell is installed in environment is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c