Useful Python snippets
# These are meant to work in both Python 2 and 3, except where noted.
# See my useful_pandas_snippets.py for those related to dataframes (such as pickling/`df.to_pickle(save_as)`)
# https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4
# also see https://gist.github.com/fomightez/324b7446dc08e56c83fa2d7af2b89a33 for examples of my
# frequently used Python functions and slight variations for more expanded, modular structures.
#argparse
# good snippet collection at https://mkaz.tech/code/python-argparse-cookbook/
# positional
parser.add_argument("input", help="Name of the file that was \
generated by other program \
when run with your transcriptome of interest.", metavar="INPUT_FILE")
# with optional positional
parser.add_argument("input", nargs='?', help="**OPTIONAL**Name of the file \
generated by other program \
when run with your transcriptome of interest. Usually, this is \
'"+input_file_name_default+"' &\
if no input file name is provided then this will be used by \
default.", default=input_file_name_default, metavar="INPUT_FILE")
# Note see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004
# for why not using `argparse.FileType` approach here.
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because placed under positional I added clarifying text to help
# description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND YOU DON'T NEED/WANT THE OUTPUT
# FILE TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP THE APPROACH OF NOT
# USING `argparse.FileType` AND OF USING `with open`, AS THAT IS CONSIDERED MORE PYTHONIC.
# With a list where you won't know the exact size of the list, i.e., it could be just one item (Example from `plot_expression_across_chromosomes.py`; see `donut_plot_with_subgroups_from_dataframe.py` for another)
parser.add_argument('-chr', '--chrs', action='store', type=str,
help="use this flag to limit plotting of the data to particular \
chromosomes or scaffolds you specify immediately following this flag. \
Separate the chromosome or scaffold identifiers by commas, without spaces. \
Example use in a command is `--chrs I,IV,XVI`. \
Default when this optional flag is not called is to plot the data for all \
chromosomes or scaffolds. ") # based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
# ; specifically, https://stackoverflow.com/a/24866869/8508004
#...
if args.chrs:
    if "," in args.chrs:
        limit_to_chrs = args.chrs.split(',')
    else:
        # means only one item
        limit_to_chrs = [args.chrs] # has to be a list for passing to Pandas `isin()`
# with flag parameters
parser.add_argument('-sa', '--save_as', action='store', type=str,
default= generate_output_file_name(previous_pickled_df), help="Use \
this option to supply a name of \
the file to save for storing produced dataframe. If none is provided, \
the name \'"+generate_output_file_name(previous_pickled_df)+"' will be \
used. To force nothing to be saved, enter \
`-sa no_output` without quotes as output file (ATYPICAL).")
# with choices
# see https://stackoverflow.com/a/35970231/8508004
# or https://stackoverflow.com/a/15301183/8508004
# or https://stackoverflow.com/questions/40324356/python-argparse-choices-with-a-default-choice
parser.add_argument("-og", "--output_grouping", type=str,
default= "single", choices=["single", "separate", "both"],
help="OPTIONAL: Specify grouping of output with this option. Choose \
`-og single` for one table or dataframe for all categories. Or choose \
`-og separate` for a separate table or dataframe for each category. \
Or specify `-og both` to output both types. \
If this option is not specified, {} will be used.".format("single"))
# Now DISFAVORED approach for reading in files. (Disfavored because use of `with` is favored and `argparse.FileType` is not compatible with that. So only use this when really in a hurry to scrape something together; see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004)
parser.add_argument("MTA", help="Name of file containing data. REQUIRED.", type=argparse.FileType('r'), metavar="FILE")
# I would also like to trigger the help display if no arguments are provided, because at least one input file is needed
if len(sys.argv)==1: #from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
the_file = args.MTA
# open input file and start reading
sys.stderr.write("\nReading input file...")
#input_file_stream = open(the_file, "r") # Don't need separate open when use `type=argparse.FileType`. It sets everything up automatically and you will actually cause errors if try to open when already open.
for line in the_file:
    pass
# assert
assert len(numbers_given_for_start_n_end) == len(aln_ids_in_out_order), (
    "The user-supplied list must be equal in length to the list of data "
    "previously supplied as 'aln_ids_in_out_order'.")
# Verify that values in dictionary are equal in length using `count` method of Python lists
lengths_of_sequences = [len(v) for v in sequence_dict.values()]
assert lengths_of_sequences.count(
    lengths_of_sequences[0]) == len(lengths_of_sequences), (
    "The length of all parsed sequences should be the same.")
# that assertion test uses Ivo van der Wijk's solution from
# https://stackoverflow.com/a/3844948/8508004
# Getting hex color codes and RGB values from out of Seaborn color palettes / show colors
# based on https://stackoverflow.com/questions/38249454/extract-rgb-or-6-digit-code-from-seaborn-palette
# WORKS IN A JUPYTER NOTEBOOK CELL
import seaborn as sns
#num_shades = 8
#sns.palplot(sns.cubehelix_palette(num_shades))
pal = sns.color_palette("RdBu_r")
print(pal.as_hex())
print(pal)
sns.palplot(sns.color_palette("RdBu_r"))
sns.palplot(sns.diverging_palette(5, 250))
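# if you want 0-255 integer RGB values instead of the 0-1 floats that `print(pal)`
# shows, a minimal conversion sketch (plain Python, not a seaborn API call):
print([tuple(int(round(255 * channel)) for channel in rgb) for rgb in pal])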
# Check if string can be cast to number and cast (example from `donut_plot_with_subgroups_from_dataframe.py`)
def is_number(s):
    '''
    check if a string can be cast to a float or numeric (integer).
    Takes a string.
    Returns True or False
    fixed from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    later noted similar code is at https://code-maven.com/slides/python-programming/is-number
    '''
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def cast_to_number(s):
    '''
    Cast a string to a float or integer.
    Tries casting to float first, and if that works then it tries casting the
    string to an integer. (I thought I saw a suggestion of that order somewhere
    when searching for what I used as the `is_number()` check but cannot find
    the source right now.)
    Returns a float, an int, or, if it fails, False. (Where I am using this, it
    shouldn't ever trigger returning `False` because everything was checked as
    convertible first.)
    based on fixed code from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    '''
    try:
        number = float(s)
        try:
            number = int(s)
            return number
        except ValueError:
            pass
        return number
    except ValueError:
        pass
    try:
        import unicodedata
        num = unicodedata.numeric(s)
        return num
    except (TypeError, ValueError):
        pass
    return False
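# quick usage sketch for the two functions above:
print(is_number("3.14"), cast_to_number("3.14")) # True 3.14
print(is_number("42"), cast_to_number("42"))     # True 42
print(is_number("spam"))                         # False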
# count frequency
def count_frequency_in_list(l):
    '''
    takes a list and returns a dictionary of the counts of the items in that
    list
    based on https://stackoverflow.com/a/2162045/8508004
    '''
    import collections
    return collections.Counter(l)
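# quick usage sketch:
print(count_frequency_in_list(['a', 'b', 'a'])) # Counter({'a': 2, 'b': 1})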
clusters_nofeat = clusters_nofeat_df["Acluster(+)"].tolist() + clusters_nofeat_df["Acluster(-)"].tolist()
# split up those that contain more than one, i.e., separated by comma, and strip whitespace;
# expanded list comprehension from https://stackoverflow.com/a/27886807/8508004
clusters_nofeat = [x.strip() for xs in clusters_nofeat for x in xs.split(',')]
# remove the blank ones/empty ones (this is how you remove empty strings from a list); it can also be used to remove blank lines from splits on `"\n"`
clusters_nofeat = [x for x in clusters_nofeat if x]
# remove blank lines & lines that are just tabs and empty spaces (this happens to be from a different example than most of these 'remove' examples in this section)
input_data = [x for x in input_data.split("\n") if x.strip()]
# remove duplicates
clusters_nofeat = set(clusters_nofeat)
# remove the dash and everything after to count frequency
clusters_nofeat = [x.split("-")[0] for x in clusters_nofeat]
# count frequency
counts=count_frequency_in_list(clusters_nofeat)
clusters_nofeat
# next line based on https://stackoverflow.com/questions/2161752/how-to-count-the-frequency-of-the-elements-in-a-list#comment46593992_2162045
# ; otherwise get `Counter()` type
dict(counts)
from collections import Counter
most_common,num_most_common = Counter(some_list).most_common(1)[0] # based on
# https://stackoverflow.com/a/6987358/8508004
#unique items in list that occur more than two times (or some other number you specify)
the_count = Counter(some_list)
print ([k for k, v in the_count.items() if v > 2]) # based
# on https://stackoverflow.com/a/26773120/8508004 and
# https://stackoverflow.com/a/30418498/8508004 , to work in 2.7 and 3
# incrementing count by instance while iterating on a list (could be applied to a dataframe column too)
# based on https://stackoverflow.com/a/1692428/8508004
from collections import defaultdict
l = ['apple','cherry','apple','cherry','cherry','pear']
d = defaultdict(int)
for x in l:
    d[x] += 1
    print ("That is {} #{}".format(x,d[x]))
print("final counts: {}".format(dict(d)))
# count the number of non-overlapping occurrences of a substring
num_sequences = file_listing_text.count('.fa')
# combining counting and regular expressions (regex)
# count frequency of blocks of Ns in a string (presumably sequence)
import re
from collections import defaultdict, Counter
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adapted from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
    len_match_dict[len(m.group())] += 1
    matches.append(m.group())
print(len_match_dict)
print(Counter(matches))
#copy a list
# Important if you'll be iterating on it and modifying it at the same time. Modify the copy or iterate on the copy, but not both.
# Also useful if you want to start a new list with the contents of an old one. If you just do `new_list = old_list`, that just copies
# the reference to the list, see https://stackoverflow.com/a/2612815/8508004 . That's probably not what you want.
new_list = old_list.copy()
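# quick demonstration of the reference-vs-copy distinction described above:
old_list = [1, 2, 3]
alias = old_list            # same underlying list object
independent = old_list.copy()
old_list.append(4)
print(alias)                # [1, 2, 3, 4] <- changed along with old_list
print(independent)          # [1, 2, 3]   <- unaffected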
# Split a list into chunks with also collecting any remainder group
# from https://stackoverflow.com/a/312464/8508004 ===> UPDATE For 2023 onward: in version 3.12, Python added `itertools.batched()` that chunks, see https://twitter.com/patloeber/status/1613190998389030913
def chunks(a_list, n):
    """Yield successive n-sized chunks from list."""
    for i in range(0, len(a_list), n):
        yield a_list[i:i+n]
print(list(chunks(list(range(10, 75)), 10))) # the inner `list()` keeps the output the same in Python 2 and 3 (slicing a bare `range` in Python 3 yields range objects)
# also see [`more_itertools.chunked()`](https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.chunked) that I learned of from https://twitter.com/PeakSquirrel/status/1574893067693215744
# there is another approach with list comprehension for a string in this gist that is based
# on https://stackoverflow.com/a/13673133/8508004 and looks similar to list comprehension one for
# lists at https://stackoverflow.com/a/32467096/8508004
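# minimal sketch of the Python 3.12+ `itertools.batched()` mentioned above; note
# it yields tuples rather than lists:
from itertools import batched  # requires Python 3.12+
print(list(batched(range(10, 75), 10)))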
# RELATED: chunking with step/stride of certain amount to produce overlap. For example to reproduce process
# described in this figure legend: " The data were calculated from 150-base pair windows with 50 base pairs of overlap between adjacent windows and plotted ..."
a = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
chunk_size = 10
step_size = 5
def chunk_string_with_different_step(string, chunk_size, step_size):
    """Return a list of n-sized chunks from string of letters."""
    return [string[i:i+chunk_size] for i in range(0, len(string),step_size)]
chunk_string_with_different_step(a,chunk_size,step_size)
'''
#RESULT:
['adhaskjhdj',
'kjhdjkashd',
'kashdajksh',
'ajkshdkjas',
'dkjashdasl',
'hdaslallls',
'alllslslsl',
'lslslsaash',
'saashdahs',
'dahs']
'''
# AS GENERATOR:
b = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
def gen_chunk_string_with_different_step(a_list, chunk_size, step_amount):
    """Yield successive n-sized chunks from list, stepping/striding by step_amount."""
    for i in range(0, len(a_list), step_amount):
        yield a_list[i:i+chunk_size]
print (list(gen_chunk_string_with_different_step(b,chunk_size,step_size)))
'''
#RESULT:
['adhaskjhdj', 'kjhdjkashd', 'kashdajksh', 'ajkshdkjas', 'dkjashdasl', 'hdaslallls', 'alllslslsl', 'lslslsaash', 'saashdahs', 'dahs']
'''
# Debugging
# Insert the following into a script at the point you'd like to debug to bring up an interactive IPython console
# where you can query the status of defined variables:
from IPython import embed; embed()
#dictionary
for key in d:
    pass
for key, value in d.items():
    pass
if key in d:
    pass
# some example dictionary comprehensions (general idea: d = {k:v for k,v in a.items()}) are at
# https://stackoverflow.com/questions/1031851/python-best-way-to-exchange-keys-with-values-in-a-dictionary
# in answer to "Python: Best Way to Exchange Keys with Values in a Dictionary?"
# merge two dictionaries --> see https://www.geeksforgeeks.org/python-merging-two-dictionaries/
d = {**dict1, **dict2} # note: `dict2.update(dict1)` updates dict2 in place and returns None, so don't assign its result
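# minimal illustration (hypothetical dictionaries) that with `{**dict1, **dict2}`
# the right-most dictionary wins on key collisions:
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
print({**dict1, **dict2}) # {'a': 1, 'b': 3, 'c': 4}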
# do-while loop (https://stackoverflow.com/a/1662176/8508004)
while True:
    do_something()
    if condition():
        break
# get file extension from file name
# Now I'd use Pathlib, see https://docs.python.org/3/library/pathlib.html `Path(filename_n_path).suffix`
#; for main part using `Path(filename_n_path).stem` see more about Pathlib use in this document under 'Pathlib' below
import os
def generate_output_file_name(file_name,suffix_for_saving):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. The generated name is based on the original file
    name.

    Specific example
    =================
    Calling function with
        ("sequence.fa", "_col")
    returns
        "sequence_col.fa"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) # from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    if '.' in file_name:  # I don't know if this is needed with the os.path.splitext method but I had it before so left it
        return main_part_of_name + suffix_for_saving + file_extension
    else:
        return file_name + suffix_for_saving + ".fa"
# Floor
import math
x = int(math.floor(10.4)) #outer `int()` typecast in Python 3 is redundant
print (x)
# outer typecast insures same result (an integer) is returned in both
# Python 2 and 3 since Python 2 floor returns a float
# Note: could get the same result by typecast to integer alone, but use of 'floor' makes it more explicit as to what was sought.
# Get HTML / URL in Python 2 or 3
# Getting html originally for just Python 3, adapted from
# https://stackoverflow.com/a/17510727/8508004 and then updated from to
# handle Python 2 and 3 according to same link.
# (snippet with bonus Python 2 and 3 compatible variable unpacking and unicode decoding)
url = "http://www.example.org"
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
html = urlopen(url)
chrom_and_length = {} # initialize the dictionary the loop below fills in
for line in html.read().splitlines():
    #name, chrom_len, *_ = line.strip().split()
    # that elegant unpack above is based on
    # https://stackoverflow.com/questions/11371204/unpack-the-first-two-elements-in-list-tuple
    # , but it won't work in Python 2. From same place, one that works in 2:
    name, chrom_len = line.strip().split()[:2]
    chrom_and_length[name.decode(
        encoding='UTF-8')] = chrom_len.decode(encoding='UTF-8')
# generate names for sub-groups automatically, à la `subset_a`, `subset_b`, etc.
first_group_suffix = "a"
groupings = ["subset"+ chr(ord(first_group_suffix) + x) for x in range(groups_to_make)]
# `chr(ord(first_group_suffix) + x)` part of line above based on https://stackoverflow.com/a/2156898/8508004
# greater than or equal and less than or equal
assert seq_step_size <= shortest_feature_len, ("problem")
# >= for greater than or equal
# <= for less than or equal
# related use in interval comparison:
if 10000 <= number <= 30000:
    pass
# check if any items in two lists are shared (any overlap / overlapping?)
bool(set(a) & set(b))
#list comprehension if and if-else conditionals
[x+1 if x >= 45 else x+5 for x in l]
# FOR JUST IF:
#"The if should be after the for (unless it is in an if-else ternary operator)
[y for y in a if y not in b]
#This would work however:
[y if y not in b else other_value for y in a]
#from https://stackoverflow.com/a/15474969/8508004
#My edited version of "just if" variation:
[x for x in list_for_order if x in b]
# List comprehension to add two items for each initial item in a list
# This example adds the number in the list plus the next value in the series
# for each item in a list.
#based on Games Brainiac comment at https://stackoverflow.com/a/19466238/8508004 and DSM's
# comment at https://stackoverflow.com/a/11869360/8508004
l = [1,2,4]
[item for x in l for item in [x, x+1]] # results in `[1, 2, 2, 3, 4, 5]`
# see related information about `s.shift()` for getting a row and the next one in pandas snippets
# plus see some nice list comprehensions in the 'count frequency' section above
# for the `count_frequency_in_list()` function
# plus there are these:
#remove all the blanks, i.e. `''` entries, in the sub-lists, and then
element_record[0] = [x for x in element_record[0] if x]
element_record[1] = [x for x in element_record[1] if x]
#join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join(element_record[0])
element_record[0] = ", ".join(element_record[1])
# -or- in one step, DO BOTH
#remove all the blanks, i.e. `''` entries, in the sub-lists, and then
#join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join([x for x in element_record[0] if x])
element_record[1] = ", ".join([x for x in element_record[1] if x])
# make a copy of a list with a specific item removed
first_characters_wo_candidate = [x for x in first_characters if x != basic_tag] #based
# on https://stackoverflow.com/a/25004389/8508004 (this will remove ALL occurrences);
# I couldn't find a way to both make a copy of the list and remove the first instance of that
# item using `.remove(item)` using Python 2.7. Always had to copy list and then use remove. Although
# potentially this looks like one line if have numpy already imported --> https://stackoverflow.com/a/50313691/8508004
# " one-liner providing both the value of the minimum as well as the first index where it is realized"
# from https://coderwall.com/p/a9hvrg/index-of-minimum-element-of-a-list where describes
# works because the default order of tuples in Python is lexicographical.
# Approach works for `max` (maximum) too.
# Needed slight adjusting to not use xrange for Python 3/2 compatibility.
l = [33, 788, 1, -14, 78, 11, 32, 11, 78, -1, -14]
mn,idx = min( (l[i],i) for i in range(len(l)) )
mn,idx
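# the `max` (maximum) variant mentioned above works the same way:
mx,idx_mx = max( (l[i],i) for i in range(len(l)) )
print(mx, idx_mx) # 788 1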
# next() with itertools
from itertools import cycle
#...
colors = (['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'xkcd:magenta', 'xkcd:orange',
'tab:gray','tab:pink'])
#...
colors = cycle(colors)
#...
color = next(colors)
# works in 2 and 3, see https://stackoverflow.com/questions/5237611/itertools-cycle-next
# Dealing with detecting if something was in region/ interval
total_feature_levels = max([v["level"] for v in features_dict.values()])
#...
loc_intvls_of_features = {(
    v["start"],v["end"]): k for k,v in features_dict.items()} #dict of the intervals
# (keys) of sequence where features(values) are located
#...
# Use `loc_intvls_of_features` and get list of features where the
# intervals contain what corresponds to the current column. Three
# categories: at_start, in_mid_interval, at_end. Could sort later
# for coordinating stylistic settings but might as well do now
# since all information at hand.
at_start=[]
in_mid_interval = []
at_end = []
for interval,feature in loc_intvls_of_features.items():
    if interval[0] == curr_residue_number:
        at_start.append(feature)
    elif interval[0] < curr_residue_number < interval[1]:
        in_mid_interval.append(feature)
    elif interval[1] == curr_residue_number:
        at_end.append(feature)
# Next if there are contents in the feature lists, cycle through them
# stylizing table cells of current column appropriately.
for feature in at_start:
    stylize_start_features(feature, column, features_part)
for feature in in_mid_interval:
    stylize_mid_features(feature, column, features_part)
for feature in at_end:
    stylize_end_features(feature, column, features_part)
def overlap_exists(a, b):
    '''
    takes two intervals and returns whether they overlap. The intervals would
    be defined by tuples of (start, stop).
    Examples:
    overlap_exists([20, 38],[1, 125])
    > True
    overlap_exists([10, 15], [20, 38])
    > False
    overlap_exists([10, 25], [20, 38])
    > True
    overlap_exists([10, 25], [25, 38])
    > True
    based on https://stackoverflow.com/a/2953979/8508004 and the fact 0 is the
    same as False and anything else is True.
    Modified to add the +1 because I want to be inclusive, so even one
    shared basepair counts as an overlap, as the last example shows.
    '''
    return bool(max(0, (min(a[1], b[1]) - max(a[0], b[0])+1)))
# left justify by adding spaces. Really useful when creating multiple sequence alignments to control which columns things show up in.
# based on https://stackoverflow.com/a/5676676/8508004
mviewlines_dict[i][0] = mviewlines_dict[i][0].ljust(len_longest_id)
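# self-contained illustration (hypothetical ids) of padding so a following
# column lines up:
len_longest_id = len("seq_identifier_B")
for seq_id in ["seqA", "seq_identifier_B"]:
    print(seq_id.ljust(len_longest_id) + " ATGCATGC")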
# Minimum size of string in a list
print(len(min(list_o_strings, key=len))) # based on https://stackoverflow.com/a/7228951/8508004 ; parentheses added so it works in Python 3 too
# operating system/shell dealings
# os.remove() to delete a file
# make a directory
import os, errno
try:
    os.makedirs(directory)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
# move a file
import shutil
shutil.move("path/to/current/file.foo", "path/to/new/destination/for/file.foo")
#for more see [Replacing Bash Scripting with Python](https://github.com/ninjaaron/replacing-bash-scripting-with-python)
#see `fnmatch` examples in the useful notebook snippets gist.
# see `glob.glob` examples of matching file extensions and filenames, etc., in the useful notebook snippets gist.
# Pathlib
# in addition to this see `pathlib` examples in useful notebook snippets gist.
#Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list final file extension, see 'Path(filename_n_path).suffix' at
# https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
# https://docs.python.org/3/library/pathlib.html
# main part without the extension is 'Path(filename_n_path).stem'
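# minimal sketch of the `suffixes` and `stem` attributes noted above:
print(Path('archive.tar.gz').suffixes) # ['.tar', '.gz']
print(Path('archive.tar.gz').stem)     # 'archive.tar'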
# Perform a function or carry out a calculation a certain percent of the time (i.e., with a random frequency)
import random
if random.random() < 0.8:
    do_something()
# example using it:
def get_unique_tag(string, length_of_tags):
    '''
    Takes a string and generates a unique tag of provided length
    related to the provided string.
    Alphanumerics in tag will be limited to those in the provided
    string. Although the same order is favored, there
    is a chance of returning characters out of order using
    shortuuid.
    shortuuid from https://github.com/skorokithakis/shortuuid
    '''
    import random
    if random.random() < 0.95:
        chunk_size = length_of_tags
        chunks = [string[i:i+chunk_size] for i in range(0, len(string),chunk_size)] #based on
        # https://stackoverflow.com/a/13673133/8508004 or https://stackoverflow.com/a/9475354/8508004
        '''
        # WRITTEN AS A FUNCTION
        def chunk_string(string, chunk_size):
            """Return a list of n-sized chunks from string of letters."""
            return [string[i:i+chunk_size] for i in range(0, len(string),chunk_size)]
        '''
        # discard chunks too short
        chunks = [x for x in chunks if len(x)== length_of_tags]
        import random
        return random.choice(chunks).lower()
    else:
        from shortuuid import set_alphabet, uuid # shortuuid provides these at module level
        set_alphabet(string)
        return uuid()[:length_of_tags].lower()
# Pickle Python objects for storage
# for simple objects json suggested by Martijn Pieters at https://stackoverflow.com/q/25464295/8508004
# as lighter-weight and more portable than Python's pickling. (Same page has a similar pickling
# example actually too under Mike McKerns' answer.) Good for a list of strings, for example:
# Save as json
import json
with open('filename_list.json', 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
#Read as json
import json
with open('filename_list.json', 'r') as f:
    filename_list = json.load(f)
# Subset to a random sampling of items in a list , based on https://pynative.com/python-random-sample/
import random
genomes = random.sample(population=genomes, k=15)
# Read and write to a file
# prepare output file for saving so it will be open and ready
# NOTE ABOUT THE READING PART OF THIS: the more modern Pythonic way seems
# to be to leave out the `,'r'` part. See https://stackabuse.com/read-a-file-line-by-line-in-python/ under
# 'Read a File Line-by-Line with a for Loop - Most Pythonic Approach'. Note also that it is
# best to use `.strip()` or possibly slice `[:-1]` to remove the line ending if going to
# rearrange lines, because you can get a weird merge if you alter the order, since usually the last
# line will not have a newline character. Also see https://stackoverflow.com/a/3277516/8508004 for code that will read the entire file
# into memory, or line by line, and remove all whitespace characters (newlines and spaces) from the end of each line.
with open(output_file_name, 'w') as output:
    # read in the input file; REMEMBER IT IS BEST NOT TO USE `with open(input_file_name, 'r') as input:` BECAUSE `input()` is something in Python you may want to use in your script later
    with open(input_file_name, 'r') as input_handle: # OR SEE NOTE ABOVE ON HOW YOU DON'T NEED `, 'r'` anymore.
        # prepare to give feedback later or allow skipping to a certain start
        lines_processed = 0
        for line in input_handle:
            lines_processed += 1
            # This gff3 doesn't have pertinent lines until line 10
            if line.startswith("#") or lines_processed < 9:
                # Send text to output
                output.write(line)
            else:
                info = line.split("\t")
                #print(info)
                info[3] = str(adjust_pos(int(info[3]),ATP6_start_pos, chromosome_length))
                info[4] = str(adjust_pos(int(info[4]),ATP6_start_pos, chromosome_length))
                #print (info) # ONLY FOR DEBUGGING
                # Send text to output
                output.write(("\t").join(info))
# Feedback
sys.stderr.write("Positions were changed to match ATP6 as start "
    "and saved in '{}'.".format(output_file_name))
# Read entire file into memory at once (bonus: example removes linebreaks to make one long string) / Read all of file at once
with open('data.txt', 'r') as myfile:
    data=myfile.read().replace('\n', '')
# Replace text in a file, combining much of the above approaches
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s, script_name):
    '''
    Change the plot title to the provided text in the file designated with `script_name`
    '''
    with open(script_name, 'r') as thefile:
        script=thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE", script_name)
# Sort / Sorting
# FOR PYTHON FOR GENOMIC DATA SCIENCE I WAS LOOKING TO SORT A REPRESENTATION OF A DICTIONARY BASED ON VALUES
# see http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
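# with the example dictionary above, sorted_x is [(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]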
#Later I found the same works without need for operator (and is known to work in Python 3):
sorted_keys = sorted(my_dict, key=my_dict.get) #based on https://stackoverflow.com/a/37270275/8508004
# to sort keys from dictionary based on value
# see my file `sorting out sorting on attributes within lists using keys.md` for more along these lines
# StringIO
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import pandas as pd
df = pd.read_table(StringIO(Input), header=0, delim_whitespace= True)
# verified for both Python 2 and 3
# stderr.write
import sys
sys.stderr.write("\n\nThe dataframe was not stored for use elsewhere "
"because `no_output` was specified in place of the output file name.")
sys.stderr.write( "\n\nThe dataframe has been saved as a file in a "
"manner where other Python programs can access\nthe created "
"dataframe (pickled).\n"
"The dataframe is stored as '{}'".format(out_name))
# String formatting
# my fav resources are https://pyformat.info/ and https://mkaz.blog/code/python-string-format-cookbook/
## Example with named placeholders
next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
"\t{length}\tblack".format(
species_code=species_code,chrom=chrom, length=length))
# see https://mkaz.blog/code/python-string-format-cookbook/ for percentage and Exponent notation,
# although I tend to prefer capital E & that can easily be done by replacing the lowercase
# one that is shown with upper case like:
print("{:.2E}".format(3.1415926)) #results in `3.14E+00`
print("{:.3E}".format(602213969921133261473164)) #results in `6.022E+23`
print("Half is {:.2%}".format(0.5000000)) #`.2%` limits to two decimals
# ternary operator if-or conditional (a if condition else b) for setting a variable
direction_string = "positive" if direction > 0 else "negative"
# Try except
try:
    return urlopen(url).read()
except HTTPError as e: # needs `from urllib.error import HTTPError` (Python 3) or `from urllib2 import HTTPError` (Python 2)
    #print(e.code)
    #print(e.msg)
    return "HTTPError"
# write to file
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
    for indx,(chrom,length) in enumerate(chromosomes_and_length.items()):
        next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
            "\t{length}\tblack".format(
            species_code=species_code,chrom=chrom, length=length))
        if indx < (len(chromosomes_and_length)-1): # don't add a newline character to the last line
            next_line += "\n"
        # Send the built line to output
        output_file.write(next_line)
# provide feedback
sys.stderr.write( "\n\nThe karyotype file for {} chromosomes has been saved "
"as a file named"
" '{}'.".format(len(chromosomes_and_length),output_file_name))
# https://twitter.com/bbelderbos/status/1534784930005278720. June 2022
#>"Once you have a pathlib.Path object you don't have to use a context manager to write to a file, it already comes with write_bytes() / write_text() methods that can do this."
filepath.write_text(resp_txt) # `write_text()` takes a single string; `resp_txt` here stands for the text to write
# also see 'Read and write to a file' above
# encoding and decoding a string as base64, based on https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/ (note: could also use zlib (see https://stackoverflow.com/a/29243206/8508004) but it seemed to make things take up more space/look messier in the script than the original table, whereas base64 seemed a little less expansive and cleaner.)
# The idea of doing this was to make a table take up fewer lines of a script when hardcoding what would normally be data in a separate tsv file into a script. So hardcode it in, but in fewer lines.
import base64
message = wt_bendit_result
message_bytes = message.encode('ascii')
base64_bytes = base64.b64encode(message_bytes)
base64_message = base64_bytes.decode('ascii')
#base64_message is the string as base64
#UNDO CONVERSION
undecodedbase64 = base64.b64decode(base64_message)
# use `undecodedbase64 = undecodedbase64.decode()` after that if you need it
# as a string when using `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` to
# hardcode into a script, because `b64decode()` alone will produce bytes from
# `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''`.
print(base64_message)
# Suppressing stdout / stderr when using an imported function or subprocess, etc.
# Vaguely similar to using `%%capture` to hush noisy output / code in notebooks.
# This is very useful when you are importing a function or running internal code as a subprocess and what it says in its stderr feedback may be wrong or moot because of the way you are processing what the function returns.
# based on https://stackoverflow.com/a/52442331/8508004
# also see `bendIt_analysis.py` where I do it slightly differently (with `with io.capture_output() as captured:`)
# within a Python script because running in a jupyter environment.
from contextlib import contextmanager,redirect_stderr,redirect_stdout
from os import devnull
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager that redirects stdout and stderr to devnull.
    From https://stackoverflow.com/a/52442331/8508004
    """
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)
#...
with suppress_stdout_stderr():
    pm_df = patmatch_results_to_df(result)
# Python version of curl is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c
# Python version of checking software to run in shell is installed in environment is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c