Last active
August 17, 2023 22:13
-
-
Save elowy01/f062f9a51893377ac93ef8eb4f35629f to your computer and use it in GitHub Desktop.
Python cheat sheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#PyPI | |
Is the module repository in Python (the equivalent to CPAN in Perl) | |
// | |
*Initializing 2 variables at the same time: | |
a=b=0 | |
or | |
a=b=0.0 | |
// | |
#Python shebang: | |
#!/usr/bin/env python (for python 2.7 latest) | |
#!/usr/bin/env python3 (for python 3.latest) | |
# Python style guide: | |
https://www.python.org/dev/peps/pep-0008/ | |
*Class Names: | |
Class names should normally use the CapWords convention | |
*Function Names: | |
Function names should be lowercase, with words separated by | |
underscores as necessary to improve readability. | |
*Function params: | |
The most followed convention is the one used in NumPy (described at | |
https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) | |
Parameters | |
---------- | |
x : type | |
Description of parameter `x`. | |
y | |
Description of parameter `y` (with type not specified) | |
#to print | |
print "hello"; | |
/ | |
#python naming conventions | |
-Class names start with an uppercase letter. All other identifiers start with a lowercase letter. | |
-Starting an identifier with a single leading underscore indicates that the identifier is private. | |
- Starting an identifier with two leading underscores indicates a strongly private identifier. | |
- If the identifier also ends with two trailing underscores, the identifier is a language-defined special name. | |
#assign values to variables | |
counter = 100 # An integer assignment | |
miles = 1000.0 # A floating point | |
name = "John" # A string | |
// | |
#Object introspection: | |
my_list = [1, 2, 3] | |
dir(my_list) | |
#gave us the names of all the methods of a list. This can be handy when you are not able to recall a method name. | |
/ | |
#using type: | |
print(type('')) | |
# Output: <type 'str'> | |
/ | |
#Using Inspect: | |
import inspect | |
print(inspect.getmembers(str)) | |
#multiple assignment | |
a = b = c = 1 | |
a, b, c = 1, 2, "john" | |
#increment operator | |
a+=2 | |
#print variables | |
print counter | |
print miles | |
print name | |
#Standard Data Types | |
Numbers | |
String | |
List | |
Tuple | |
Dictionary | |
/ | |
#knowing the data type of some object: | |
type(whatever) | |
#Python strings: | |
str = 'Hello World!' | |
print str[0]; #print H | |
print str[1:3]; #print el | |
print str[2:]; #print llo World! (everything from the 3rd character onwards) | |
print str*2; #prints Hello World!Hello World! | |
print str1+str2 #concatenating 2 strings | |
#concatenating a string and a number, where b is the number: | |
str=a+str(b) | |
#python lists: | |
list = [ 'abcd', 786 , 2.23, 'john', 70.2 ] | |
print list # Prints complete list | |
print len(list) #print length of list | |
print list[0] # Prints first element of the list | |
print list[1:3] # Prints the 2nd and 3rd elements (the end index 3 is excluded) | |
print list[2:] # Prints elements starting from 3rd element | |
print tinylist * 2 # Prints list two times | |
print list + tinylist # Prints concatenated lists | |
#adding a prefix to each element on a list: | |
alist = ['foo','spam', 'bar'] | |
prefix='pref' | |
newlist=[prefix+elt for elt in alist] | |
#print join a list | |
mylist = ['spam', 'ham', 'eggs'] | |
print ','.join(mylist) | |
*Remove an element's first occurrence in a list | |
>>> a = ['a', 'b', 'c', 'd'] | |
>>> a.remove('b') | |
>>> print a | |
['a', 'c', 'd'] | |
#remove duplicated elements from list | |
mylist = ['spam', 'ham', 'eggs'] | |
set(mylist) | |
// | |
#sets in python. | |
#read list of words in a file into a set: | |
my_set = set(open('all_runs.ega.txt')) | |
/ | |
#this will contain a trailing \n, so we need to do the following: | |
set(line.strip() for line in open('filename.txt')) | |
// | |
#A good way of checking which elements are in one list and not in the | |
other, and vice versa: | |
To find the intersection (items that are in both sets): | |
>>> a = set([1, 2, 3, 4, 5, 6]) | |
>>> b = set([4, 5, 6, 7, 8, 9]) | |
>>> a & b | |
set([4, 5, 6]) | |
To find the difference (items that are only in one set): | |
>>> a = set([1, 2, 3, 4, 5, 6]) | |
>>> b = set([4, 5, 6, 7, 8, 9]) | |
>>> a - b | |
set([1, 2, 3]) | |
>>> b - a | |
set([7, 8, 9]) | |
To find the symmetric difference (items that are in one or the other, but not both): | |
>>> a = set([1, 2, 3, 4, 5, 6]) | |
>>> b = set([4, 5, 6, 7, 8, 9]) | |
>>> a ^ b | |
set([1, 2, 3, 7, 8, 9]) | |
/ | |
*Adding elements to a set | |
>>> a.add(7) | |
// | |
#python tuples: | |
A tuple is another sequence data type that is similar to the list. A tuple consists of a number of values separated by commas. Unlike lists, however, tuples are enclosed within parentheses. | |
Tuples can be thought of as read-only lists. | |
tuple = ( 'abcd', 786 , 2.23, 'john', 70.2 ) | |
tinytuple = (123, 'john') | |
print tuple # Prints complete tuple | |
print tuple[0] # Prints first element of the tuple | |
print tuple[1:3] # Prints the 2nd and 3rd elements (the end index 3 is excluded) | |
print tuple[2:] # Prints elements starting from 3rd element | |
print tinytuple * 2 # Prints tuple two times | |
print tuple + tinytuple # Prints concatenated tuples | |
// | |
#list of tuples | |
my_list = [ ('a', 1), ('b', 2), ('c', 3), ('d', 4)] | |
// | |
#iterating over list of tuples: | |
for j,k in my_list: | |
... print j | |
... print k | |
/ | |
*named tuples: | |
from collections import namedtuple | |
Point = namedtuple('Point', 'x y') | |
pt1 = Point(1.0, 5.0) | |
pt2 = Point(2.5, 1.5) | |
from math import sqrt | |
line_length = sqrt((pt1.x-pt2.x)**2 + (pt1.y-pt2.y)**2) | |
// | |
#Python Dictionary: | |
Python's dictionaries are a kind of hash table. They work like associative arrays or hashes found in Perl and consist of key-value pairs. | |
tinydict = {'name': 'john','code':6734, 'dept': 'sales'} | |
print dict['one'] # Prints value for 'one' key | |
print dict[2] # Prints value for 2 key | |
print tinydict # Prints complete dictionary | |
print tinydict.keys() # Prints all the keys | |
print tinydict.values() # Prints all the values | |
// | |
#compare 2 dictionaries | |
Make Two Dictionaries | |
importers = {'El Salvador' : 1234, | |
'Nicaragua' : 152, | |
'Spain' : 252 | |
} | |
exporters = {'Spain' : 252, | |
'Germany' : 251, | |
'Italy' : 1563 | |
} | |
Find Duplicate Keys | |
# Find the intersection of importers and exporters | |
importers.keys() & exporters.keys() | |
{'Spain'} | |
Find Difference In Keys | |
# Find the difference between importers and exporters | |
importers.keys() - exporters.keys() | |
{'El Salvador', 'Nicaragua'} | |
Find Key, Values Pairs In Common | |
# Find countries where the amount of exports matches the amount of imports | |
importers.items() & exporters.items() | |
{('Spain', 252)} | |
// | |
# Merge 2 dictionaries (dicts) | |
>>> x = {'a': 1, 'b': 2} | |
>>> y = {'b': 3, 'c': 4} | |
>>> z = {**x, **y} | |
>>> z | |
{'c': 4, 'a': 1, 'b': 3} | |
#When there are 2 overlapping keys then the right-hand key has precedence | |
// | |
#if | |
if expression: | |
statement(s) | |
#else | |
if expression: | |
statement(s) | |
else: | |
statement(s) | |
#elif | |
if expression1: | |
statement(s) | |
elif expression2: | |
statement(s) | |
elif expression3: | |
statement(s) | |
else: | |
statement(s) | |
#and operator | |
if (expression1 and expression2): | |
statement(s) | |
#while loop | |
while expression: | |
statement(s) | |
#for statement | |
for iterating_var in sequence: | |
statements(s) | |
ex: | |
l=[1,2,3,4,5,6] | |
for i in l: | |
print i | |
ex: | |
for i in 'caca': | |
print i | |
#break statement | |
for letter in 'Python': # First Example | |
if letter == 'h': | |
break | |
print 'Current Letter :', letter | |
var = 10 # Second Example | |
while var > 0: | |
print 'Current variable value :', var | |
var = var -1 | |
if var == 5: | |
break | |
print "Good bye!" | |
#continue statement | |
for letter in 'Python': # First Example | |
if letter == 'h': | |
continue | |
print 'Current Letter :', letter | |
var = 10 # Second Example | |
while var > 0: | |
print 'Current variable value :', var | |
var = var -1 | |
if var == 5: | |
continue | |
print "Good bye!" | |
// | |
''' | |
Iterate over the list using while loop | |
''' | |
i = 0 | |
sizeofList = len(wordList) | |
while i < sizeofList : | |
print(wordList[i]) | |
i += 1 | |
// | |
#defining a function and calling function | |
#!/usr/bin/python | |
# Function definition is here | |
def printme(str):
    """Print the string passed in.

    The parameter keeps its original name ``str`` so keyword callers keep
    working; it shadows the builtin inside this function only.
    """
    # The function form of print works for a single argument on both
    # Python 2.7 and Python 3; the original used the Python 2-only statement.
    print(str)
# Now you can call printme function | |
printme("I'm first call to user defined function!"); | |
printme("Again second call to the same function"); | |
#opening a file for writing | |
>>>f = open('workfile', 'w') | |
>>>print f | |
<open file 'workfile', mode 'w' at 80a0960> | |
#opening a file for reading and printing lines | |
f=open('/Users/ernesto/supercont1.1v4.gff','r'); | |
for line in f: | |
print line, | |
// | |
# open a file if provided, if not then write to STDOUT | |
import sys | |
import contextlib | |
@contextlib.contextmanager
def smart_open(filename=None):
    """Yield a writable handle: the named file, or stdout as a fallback.

    Parameters
    ----------
    filename : str or None
        Path to open for writing. ``None``, an empty string or ``'-'``
        selects ``sys.stdout`` instead.

    Yields
    ------
    file object
        The handle to write to. A file opened here is closed on exit;
        ``sys.stdout`` is never closed.
    """
    if filename and filename != '-':
        fh = open(filename, 'w')
    else:
        fh = sys.stdout
    try:
        yield fh
    finally:
        # Only close what this function opened itself.
        if fh is not sys.stdout:
            fh.close()
Use it like this: | |
# writes to some_file | |
with smart_open('some_file') as fh: | |
print >>fh, 'some output' | |
# writes to stdout | |
with smart_open() as fh: | |
print >>fh, 'some output' | |
# writes to stdout | |
with smart_open('-') as fh: | |
print >>fh, 'some output' | |
// | |
#opening a file and write something | |
# The with-block closes (and therefore flushes) the file on exit; the
# original "f.close;" only named the method without calling it.
with open('text.txt', 'w') as f:
    f.write("hello")
/ | |
#adding a newline: | |
f.write(your_string+"\n"); | |
/ | |
#append a text | |
# Mode 'a' appends to the end of the file instead of truncating it. The
# with-block closes the file on exit; the original "f.close;" only named the
# method without calling it.
with open('text.txt', 'a') as f:
    f.write("hello")
#create a dir | |
import os; | |
os.mkdir("newdir"); | |
#change into a dir | |
os.chdir("newdir"); | |
#split a string | |
>>> str="hello,bye" | |
>>> elms=str.split(',') | |
>>> elms | |
['hello', 'bye'] | |
#split string into elms using tab separators | |
elms=line.split("\t") | |
#replace text in a string | |
>>> s = '100 NORTH MAIN ROAD' | |
>>> s.replace('O','U') | |
'100 NURTH MAIN RUAD' | |
#regex | |
#1st ex: | |
import re; | |
m = re.search("(\d+)","hello 12 bye caca") | |
if m: | |
print m.groups()[0] | |
>>>12 | |
#combining a pattern and a variable | |
m1=re.search(r'>+%s' %variablename,line) | |
#finding all occurrences of a pattern | |
>>> import re; | |
>>> p = re.compile('\d+'); | |
>>> p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping') | |
['12', '11', '10'] | |
#finding all declaring the pattern in the same line | |
>>> import re; | |
>>> re.findall('\d+','12 drummers drumming, 11 pipers piping, 10 lords a-leaping') | |
#checking if a string is empty | |
if not myString: | |
do something.... | |
#checking if a string starts with '>' | |
import re; | |
p = re.compile( '^>' ) | |
m = p.match( '>hello' ) | |
if m: | |
print 'Match found: ', m.group() | |
else: | |
print 'No match' | |
>>>Match found: > | |
Another simpler way: | |
m=re.search("^>",line) | |
if m: | |
print 'Match found: ', m.group() | |
#matching more than one pattern on the same string | |
m=re.search("^>|^#",line) | |
if m: | |
print 'Match found: ', m.group() | |
#matching all non ACGT bases in a DNA sequence: | |
import re

dna = "ATCGCGAZZZTTCAA"
# [^ATGC] matches any single character that is not one of the four bases, so
# a hit means the sequence contains a non-ACGT symbol. The message now says
# so; the original text talked about a restriction site.
if re.search(r"[^ATGC]", dna):
    print("ambiguous base found!")
// | |
#Counting longest occurrence of repeated sequence in Python | |
import re

my_str = "abcdefgfaabbbffbbbbbbfgbb"
# Collect every run of consecutive "b" characters and keep the longest one.
# Comparing lengths (rather than the strings themselves) states the intent
# directly, "b+" also counts a run made of a single "b", and default=0
# covers a string that contains no "b" at all.
length = max((len(run) for run in re.findall(r"b+", my_str)), default=0)
print(length)
/ | |
#negating a specific pattern | |
# Match "_GENOTYPE_CONCORDANCE" only when it is NOT immediately preceded by
# "_NON_REF". This needs a negative lookbehind (?<!...): a negative lookahead
# written in front of the literal is evaluated at the position where the
# literal itself starts, so it can never reject anything.
patt = re.compile(r'(?<!_NON_REF)_GENOTYPE_CONCORDANCE')
// | |
#exit from a program | |
sys.exit(0) | |
#accessing the command line args | |
import sys | |
print 'Number of arguments:', len(sys.argv), 'arguments.' | |
print sys.argv[0] #print script name | |
print sys.argv[1] #printing first arg | |
#exiting if number of args is incorrect | |
if (len(sys.argv)<3): | |
sys.exit("[USAGE] python test_1.py <gff> <gene_id>") | |
// | |
#most efficient way of parsing command line args: | |
Good tutorial at: | |
https://mkaz.tech/python-argparse-cookbook.html | |
/ | |
#First example using a single verbose | |
import sys | |
import argparse | |
parser = argparse.ArgumentParser(description='Demo') | |
parser.add_argument('--verbose', | |
action='store_true', | |
help='verbose flag' ) | |
args = parser.parse_args() | |
if args.verbose: | |
print("~ Verbose!") | |
else: | |
print("~ Not so verbose") | |
#This is run by: | |
$ python test.py | |
~ Not so verbose | |
$ python test.py --verbose | |
~ Verbose! | |
#if you run it with | |
python test.py --help | |
#you will get: | |
usage: generate_expPer_donor.py [-h] [--verbose] | |
Demo | |
optional arguments: | |
-h, --help show this help message and exit | |
--verbose verbose flag | |
/ | |
#required arg: | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--limit', required=True, type=int) | |
args = parser.parse_args() | |
/ | |
#Parsing a file argument: | |
parser = argparse.ArgumentParser() | |
parser.add_argument('f', type=argparse.FileType('r')) | |
args = parser.parse_args() | |
for line in args.f: | |
print( line.strip() ) | |
// | |
#validating arguments | |
def check_positive(value):
    """argparse ``type=`` callable accepting only integers greater than zero.

    Parameters
    ----------
    value : str
        Raw command-line token.

    Returns
    -------
    int
        ``value`` converted to an integer.

    Raises
    ------
    ValueError
        If ``value`` cannot be parsed by ``int()``.
    argparse.ArgumentTypeError
        If the parsed integer is zero or negative.
    """
    ivalue = int(value)
    if ivalue <= 0:
        raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value)
    return ivalue
parser = argparse.ArgumentParser(...) | |
parser.add_argument('foo', type=check_positive) | |
// | |
#adding a default value for an option: | |
parser.add_argument('--limit', default=5, type=int) | |
// | |
#removing \n in python: | |
'test string \n'.rstrip('\n') | |
or | |
line=line.rstrip('\n') | |
// | |
#using a list as an option for argparse: | |
parser.add_argument('-l','--list', nargs='+', help='<Required> Set flag', required=True) | |
# Use like: | |
# python arg.py -l 1234 2345 3456 4567 | |
# | |
TL;DR | |
Use the nargs option or the 'append' setting of the action option (depending on how you want the user interface to behave). | |
nargs | |
parser.add_argument('-l','--list', nargs='+', help='<Required> Set flag', required=True) | |
# Use like: | |
# python arg.py -l 1234 2345 3456 4567 | |
nargs='+' takes 1 or more arguments, nargs='*' takes zero or more. | |
// | |
#OOP in Python | |
// | |
A Python file is called a "module" and it's one way to organize your software so that it makes "sense". Another is a directory, called a "package". | |
A module is a distinct thing that may have one or two dozen closely-related classes. The trick is that a module is something you'll import, and you need that import to be perfectly sensible to people who will read, maintain and extend your software. | |
The rule is this: a module is the unit of reuse. | |
// | |
#!/usr/bin/python | |
class Employee:
    'Common base class for all employees'

    # Class-level counter shared by every instance.
    empCount = 0

    def __init__(self, name, salary):
        # Store the per-employee data and record that one more exists.
        self.name = name
        self.salary = salary
        Employee.empCount += 1

    def displayCount(self):
        # Report how many Employee objects have been created so far.
        print("Total Employee %d" % Employee.empCount)

    def displayEmployee(self):
        # print() separates its arguments with single spaces, which is what
        # the original comma-separated print statement produced as well.
        print("Name : ", self.name, ", Salary: ", self.salary)


# This would create first object of Employee class
emp1 = Employee("Zara", 2000)
# This would create second object of Employee class
emp2 = Employee("Manni", 5000)
emp1.displayEmployee()
emp2.displayEmployee()
print("Total Employee %d" % Employee.empCount)
/ | |
#Inheritance: | |
class MinimumBalanceAccount(BankAccount):
    """Account that refuses withdrawals taking the balance below a floor.

    ``BankAccount`` is the parent class (defined elsewhere); this class reads
    ``self.balance`` and delegates to ``BankAccount.withdraw``.
    """

    def __init__(self, minimum_balance):
        # Run the parent initialiser first, then add the extra attribute.
        BankAccount.__init__(self)
        self.minimum_balance = minimum_balance

    def withdraw(self, amount):
        # Overrides the parent method: only delegate when the floor holds.
        if self.balance - amount < self.minimum_balance:
            print('Sorry, minimum balance must be maintained.')
        else:
            BankAccount.withdraw(self, amount)
#Where BankAccount is the parent class and withdraw overrides the | |
withdraw method in the parent class | |
/ | |
*Another example of inheritance: | |
class Person:
    """A person identified by first and last name."""

    def __init__(self, first, last):
        self.firstname = first
        self.lastname = last

    def __str__(self):
        return self.firstname + " " + self.lastname


class Employee(Person):
    """A Person that additionally carries a staff number."""

    def __init__(self, first, last, staffnum):
        # super() lets the parent store the names; only the new field is
        # handled here.
        super().__init__(first, last)
        self.staffnumber = staffnum


x = Person("Marge", "Simpson")
y = Employee("Homer", "Simpson", "1007")
print(x)
# Employee inherits __str__ from Person, so only the name is printed.
print(y)
/ | |
#class built-in methods | |
__doc__ #class documentation, for example: | |
print Employee.__doc__ | |
/ | |
#inheritance | |
#!/usr/bin/python | |
class Parent:  # define parent class
    """Parent class of the inheritance example."""

    # Class attribute, read and written through the class itself.
    parentAttr = 100

    def __init__(self):
        print("Calling parent constructor")

    def parentMethod(self):
        print('Calling parent method')

    def setAttr(self, attr):
        Parent.parentAttr = attr

    def getAttr(self):
        print("Parent attribute :", Parent.parentAttr)


class Child(Parent):  # define child class
    """Child class; inherits every Parent method."""

    def __init__(self):
        # Replaces Parent.__init__, which is not invoked here.
        print("Calling child constructor")

    def childMethod(self):
        print('Calling child method')
/ | |
*Set attribute: | |
setattr(x, attr, 'magic') | |
/ | |
*Initializing an object from a dict: | |
for k,v in c.items(): | |
setattr(self,k,v) | |
print "h" | |
/ | |
# Private methods and attributes in Python: | |
# The information below is extracted from https://www.bogotobogo.com/python/python_private_attributes_methods.php | |
# | |
# Example of a class: | |
# p.py | |
class P:
    """Example class with one public and one name-mangled attribute."""

    def __init__(self, name, alias):
        self.name = name  # public
        # Two leading underscores trigger name mangling: the attribute is
        # stored on the instance as _P__alias.
        self.__alias = alias  # private

    def who(self):
        print('name : ', self.name)
        print('alias : ', self.__alias)
# When we create an instance of P and we try to access its attributes: | |
>>> from p import P | |
>>> x = P(name='Alex', alias='amen') | |
>>> x.name | |
'Alex' | |
>>> x.alias | |
Traceback (most recent call last): | |
File "", line 1, in | |
AttributeError: P instance has no attribute 'alias' | |
# We can't access alias using double underscore: | |
>>> x.__alias | |
Traceback (most recent call last): | |
File "", line 1, in | |
AttributeError: P instance has no attribute '__alias' | |
# But we can access by using a single underscore: | |
>>> x._P__alias | |
'amen' | |
# We can also have Private functions. An example of a class: | |
# p2.py | |
class P:
    """Example class with a name-mangled attribute and method."""

    def __init__(self, name, alias):
        self.name = name  # public
        self.__alias = alias  # private (stored as _P__alias)

    def who(self):
        print('name : ', self.name)
        print('alias : ', self.__alias)

    def __foo(self):  # private method (stored as _P__foo)
        print('This is private method')

    def foo(self):  # public method
        print('This is public method')
        # Inside the class body the mangled name is resolved automatically.
        self.__foo()
# We can instantiate the class, but when we try to access the | |
# private function: | |
>>> from p2 import P | |
>>> x = P('Alex', 'amem') | |
>>> x.__foo() | |
Traceback (most recent call last): | |
File "", line 1, in | |
AttributeError: P instance has no attribute '__foo' | |
# But we can access this function by doing: | |
>>> x._P__foo() | |
This is private method | |
/ | |
#debugging in python | |
# epdb1.py -- experiment with the Python debugger, pdb | |
import pdb | |
a = "aaa" | |
pdb.set_trace() | |
b = "bbb" | |
c = "ccc" | |
final = a + b + c | |
print final | |
Then, run the script and it will stop when it reaches the | |
pdb.set_trace() line. | |
After that press 'n+enter' to advance, | |
Also use 'p variable' to print the variable | |
Use 'l' to list where you are | |
Use b 48 (to set a breakpoint at line 48) | |
Use c (go to the next breakpoint) | |
cl or clear #to clear all breakpoints | |
// | |
pretty printing a data structure in python (similar to Data::Dumper) | |
>>> import pprint | |
>>> stuff = ['spam', 'eggs', 'lumberjack', 'knights', 'ni'] | |
>>> stuff.insert(0, stuff[:]) | |
>>> pp = pprint.PrettyPrinter(indent=4) | |
>>> pp.pprint(stuff) | |
[ ['spam', 'eggs', 'lumberjack', 'knights', 'ni'], | |
'spam', | |
'eggs', | |
'lumberjack', | |
'knights', | |
'ni'] | |
// | |
#Object introspection, using a customized way of printing the object: | |
>>> class Test: | |
... def __repr__(self): | |
... return "Test()" | |
... def __str__(self): | |
... return "member of Test" | |
... | |
>>> t = Test() | |
>>> t | |
Test() | |
>>> print t | |
member of Test | |
/ One example of its real use: | |
class Test:
    """Small value holder showing the difference between repr() and str()."""

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __repr__(self):
        # Used by repr() and by the interactive prompt.
        return "<Test a:%s b:%s>" % (self.a, self.b)

    def __str__(self):
        # Used by str() and by print.
        return "From str method of Test: a is %s, b is %s" % (self.a, self.b)
// | |
*print all the attributes of an object | |
def __str__(self): | |
sb = [] | |
for key in self.__dict__: | |
sb.append("{key}='{value}'".format(key=key, value=self.__dict__[key])) | |
return ', '.join(sb) | |
def __repr__(self): | |
return self.__str__() | |
// | |
#printing contents of a Python dict within a debugger: | |
from pprint import pprint | |
pprint (vars(your_object)) | |
// | |
#escaping in python: | |
print(r"\t\n") | |
#r stands for raw, this statement would literally print: | |
\t\n | |
#count characters within a string | |
dna="AAAAGGGGG" | |
dna.count("A") | |
dna.count("G") | |
#representation of an object | |
repr(objectname) | |
// | |
Initializing 2 variables at the same time | |
v1,v2=1,2 | |
// | |
# # | |
#debugger# | |
# # | |
python -m pdb script.py #run python debugger | |
p variable #print variable contents | |
n #next statement | |
b 2 #create a breakpoint at line 2 | |
/ | |
if after entering a command we press enter, we repeat this last command | |
/ | |
l #lists the area of my program that is currently being executed | |
/ | |
s #step into a subroutine | |
/ | |
q #exit from the debugger | |
/ | |
c #continue until the next breakpoint is hit | |
// | |
print 'hello' | |
// | |
>>>print 1,2 #print adding a space between | |
1 2 | |
// | |
print 1,2, #print without newline at end of text | |
// | |
>>>a="ccc" | |
>>>print a.upper() | |
CCC | |
// | |
L1=[2,3,4] #create list (array) | |
print L1[0] #printing element at index=0 | |
// | |
L1[-2] #accessing element but counting from the right | |
// | |
L1[2:] #slicing a list (from index at 2 until the last index) | |
// | |
values = [100, 200, 300, 400, 500] | |
# Slice from third index to index one from last. | |
slice = values[2:-1] | |
print(slice) | |
Output | |
[300, 400] | |
// | |
# different examples of slicing: | |
# Let us first create a list to demonstrate slicing | |
# lst contains all number from 1 to 10 | |
lst = range(1, 11) | |
print lst | |
# we get: | |
range(1, 11) | |
That can be unpacked to: | |
lst = [*range(1, 11)] | |
# below list has numbers from 2 to 5 | |
lst1_5 = lst[1 : 5] | |
print lst1_5 | |
# below list has numbers from 6 to 8 | |
lst5_8 = lst[5 : 8] | |
print lst5_8 | |
# below list has numbers from 2 to 10 | |
lst1_ = lst[1 : ] | |
print lst1_ | |
# below list has numbers from 1 to 5 | |
lst_5 = lst[: 5] | |
print lst_5 | |
# below list has numbers from 2 to 8 in step 2 | |
lst1_8_2 = lst[1 : 8 : 2] | |
print lst1_8_2 | |
# below list has numbers from 10 to 1 | |
lst_rev = lst[ : : -1] | |
print lst_rev | |
# below list has numbers from 10 to 6 in step 2 | |
lst_rev_9_5_2 = lst[9 : 4 : -2] | |
print lst_rev_9_5_2 | |
// | |
# get the last elements of a list in python | |
>>> a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] | |
>>> a | |
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] | |
>>> a[-9:] | |
[4, 5, 6, 7, 8, 9, 10, 11, 12] | |
// | |
L1=["c","a","c","a"] #create list with strings | |
// | |
enumerate function, how it works: | |
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
# enumerate() yields (index, element) pairs. Iterate the list defined above;
# the original looped over an undefined name "L".
for i, item in enumerate(seasons):
    print(i)
    print(item)
0 | |
Spring | |
1 | |
Summer | |
2 | |
Fall | |
3 | |
Winter | |
// | |
#split a string into characters | |
>>>print list("hello") | |
['h', 'e', 'l', 'l', 'o'] | |
// | |
#indexing strings | |
>>>h="hola" | |
>>>print h[0] | |
h | |
or | |
>>>print h[0:2] | |
ho | |
// | |
a="hola" | |
print len(a) #print out length of a | |
// | |
#concatenate | |
a="cac" | |
b="otas" | |
print a+b | |
// | |
>>>a="1" | |
>>>print a.isdigit() | |
True | |
\ | |
>>> b="ahola" | |
>>> print b.isdigit() | |
False | |
isdigit() returns True if all characters in a string are digits | |
// | |
#print a list as a string | |
>>>L1=["c","a","c","a"] | |
>>>print','.join(L1) | |
c,a,c,a | |
// | |
#join in python 3 | |
>>> ".".join(("a","b","c")) | |
'a.b.c' | |
// | |
#join in for loop: | |
url = 'http://ergast.com/api/f1/{0}/qualifying?limit=10000' | |
print('\n'.join(url.format(year) for year in range(2000, 2016))) | |
# http://ergast.com/api/f1/2000/qualifying?limit=10000 | |
# http://ergast.com/api/f1/2001/qualifying?limit=10000 | |
# ... | |
# http://ergast.com/api/f1/2015/qualifying?limit=10000 | |
// | |
#print argv passed to the script (sys.argv is a list) | |
import sys | |
print sys.argv | |
print len(sys.argv) #length of the list | |
passedFile=sys.argv[1] | |
// | |
print 3 in [1,2,3] #check membership | |
True | |
// | |
for x in [1,2,3]: print x #iterate list | |
// | |
#to iterate over a certain range(from,to,interval) | |
>>>for i in range(0,3,1): | |
>>> print i | |
// | |
Generate list of range tuples with given boundaries in python | |
ranges = [(n, min(n+step, stop)) for n in xrange(start, stop, step)] | |
Will produce something like: | |
[(100, 110), (110, 120), (120, 130), (130, 140)] | |
// | |
#iterate over a list | |
>>>list=[1,2,3,4,5] | |
>>>for i in range(len(list)): | |
... print list[i] | |
1 2 3 4 5 | |
// | |
#append an element to a list | |
list=[1,2,3] | |
list.append(4) | |
// | |
#append several elements onto list | |
list=[1,2,3] | |
list.extend([4,5,6]) | |
// | |
#delete and print last element(item) | |
list=[1,2,3] | |
print list.pop() | |
// | |
#getting the last element of a list: | |
some_list[-1] | |
// | |
#reverse a list | |
# See methods at https://www.geeksforgeeks.org/python-reversing-list/ | |
list.reverse() | |
// | |
#initializing an empty dict | |
newdict=dict() | |
// | |
#create dictionary (hash) | |
d1={'spam':2,'ham':1,'eggs':3} | |
// | |
#create dict from 2 lists: | |
keys = ['a','b','c','d'] | |
values = [1,2,3,4] | |
d = dict(zip(keys, values)) | |
// | |
#fetch value by key | |
d1['eggs'] | |
// | |
#key membership test | |
>>> dict={'a':1,'b':2,'c':3} | |
>>> 'a' in dict | |
True | |
>>> 'd' in dict | |
False | |
>>> if not 'f' in D: | |
print('missing') | |
// | |
#get the value for a specific key if it exists; if not, return a default | |
value instead (the dict itself is not modified): | |
>>> dict={'a':1,'b':2,'c':3} | |
>>> value=dict.get('c',0) #value will be 3 | |
>>> value=dict.get('d',0) #value will be 0 because 'd' does not exist | |
// | |
#check if key exists in a 2 nested dictionary | |
d.get('key1',{}).get('key2') | |
// | |
#returns list with keys | |
>>> d1.keys() | |
['eggs', 'ham', 'spam'] | |
// | |
#iterating through key/values simultaneously | |
>>>for k,v in d1.items(): | |
>>> print k,v | |
// | |
#another example of the use of a dict to organize the cities by state | |
>>> from collections import defaultdict | |
>>> city_list = [('TX','Austin'), ('TX','Houston'), ('NY','Albany'), ('NY', 'Syracuse'), ('NY', 'Buffalo'), ('NY', 'Rochester'), ('TX', 'Dallas'), ('CA','Sacramento'), ('CA', 'Palo Alto'), ('GA', 'Atlanta')] | |
>>> | |
>>> cities_by_state = defaultdict(list) | |
>>> for state, city in city_list: | |
... cities_by_state[state].append(city) | |
... | |
for state, cities in cities_by_state.iteritems(): | |
... print state, ', '.join(cities) | |
... | |
NY Albany, Syracuse, Buffalo, Rochester | |
CA Sacramento, Palo Alto | |
GA Atlanta | |
TX Austin, Houston, Dallas | |
// | |
# better way of performing autovivification | |
class AutoVivification(dict):
    """Implementation of perl's autovivification feature."""

    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            # Missing key: create a nested instance of the same class, store
            # it under that key and return it so chained indexing works.
            value = self[item] = type(self)()
            return value


a = AutoVivification()
a[1][2][3] = 4
a[1][3][3] = 5
a[1][2]['test'] = 6
print(a)
// | |
#creating a nested dictionary defining a priori the number of levels | |
and the final value: | |
from collections import defaultdict | |
def autovivify(levels=1, final=dict):
    """Build a defaultdict nested ``levels`` deep.

    Parameters
    ----------
    levels : int
        Number of nested dictionary levels; values below 2 give a single
        ``defaultdict``.
    final : callable
        Zero-argument factory for the values stored at the innermost level
        (``dict`` by default; e.g. ``list`` or ``int``).

    Returns
    -------
    collections.defaultdict
        The outermost level; inner levels are created on first access.
    """
    if levels < 2:
        return defaultdict(final)
    # Each missing key creates the next, one-level-shallower layer on demand.
    return defaultdict(lambda: autovivify(levels - 1, final))
words = autovivify(3, list) | |
words["sam"][2012][5].append('a') | |
words["sam"][2012][5].append('b') | |
#or: | |
words = autovivify(5, int) | |
words["sam"][2012][5][25]["hello"] += 1 | |
words["sue"][2012][5][24]["today"] += 1 | |
// | |
#create nested dictionary(dictionary of dictionary) | |
>>>d1={} | |
>>>d1["key1"]={ | |
>>> "keyA":0, | |
>>> "keyB":1, | |
>>> "keyC":2, | |
>>> } | |
// | |
#checking for the existence of a key in a dictionary: | |
if key in d: | |
d[key] += 1 | |
else: | |
d[key] = 1 | |
// | |
#print out results | |
>>>for key in d1.keys(): | |
>>> for key1 in d1[key]: | |
>>> print key1 | |
keyC | |
keyB | |
keyA | |
// | |
#accessing a nested dictionary | |
>>>print d1["key1"]["keyA"] | |
0 | |
// | |
#autovivification in python | |
def rec_dd():
    """Return a defaultdict whose missing keys create further such dicts."""
    # Passing the function itself as the factory makes the nesting unlimited.
    return defaultdict(rec_dd)
>>> x = rec_dd() | |
>>> x['a']['b']['c']['d'] | |
defaultdict(<function rec_dd at 0x7f0dcef81500>, {}) | |
// | |
dict.items() #This method returns a list of tuple pairs. | |
dict = {'Name': 'Zara', 'Age': 7} | |
print "Value : %s" % dict.items() | |
Value : [('Age', 7), ('Name', 'Zara')] | |
// | |
#deleting a key from dict: | |
>>> dict={'a':1, | |
... 'b':2, | |
... 'c':3} | |
>>> del dict['a'] | |
>>> dict | |
{'c': 3, 'b': 2} | |
// | |
#creating a dictionary from list with indices as keys | |
a = [51,27,13,56] | |
b = dict(enumerate(a)) | |
print(b) | |
will produce: | |
{0: 51, 1: 27, 2: 13, 3: 56} | |
// | |
#defaultdict. A defaultdict works exactly like a normal dict, but it is initialized with a function (“default factory”) that takes no arguments and provides the default value for a nonexistent key. | |
>>> from collections import defaultdict | |
>>> ice_cream = defaultdict(lambda: 'Vanilla') | |
>>> | |
>>> ice_cream = defaultdict(lambda: 'Vanilla') | |
>>> ice_cream['Sarah'] = 'Chunky Monkey' | |
>>> ice_cream['Abdul'] = 'Butter Pecan' | |
>>> print ice_cream['Sarah'] | |
Chunky Monkey | |
>>> print ice_cream['Joe'] | |
Vanilla | |
>>> | |
/ | |
#Implementing a counter with defaultdict | |
from collections import defaultdict | |
string = "a a b b b a a a b b b"
letters = string.split()
# int() returns 0, so a letter seen for the first time starts counting at 0.
letter_count = defaultdict(int)
for letter in letters:
    letter_count[letter] += 1  # increment element's value by 1
// | |
# is used for comments | |
// | |
v1="hola" | |
v2="caracola" | |
// | |
#initializing 2 variables at the same time | |
v1,v2=1,2 | |
// | |
# # | |
#debugger# | |
# # | |
pdb script.py #run python debugger | |
p variable #print variable contents | |
n #next statement | |
b 2 #create a breakpoint at line 2 | |
run #restart the debugger | |
// | |
#creating a breakpoint within a function that is in a class: | |
b VcfQC.run_CollectVariantCallingMetrics #it has to be set after being imported | |
// | |
print 'hello' | |
// | |
>>>print 1,2 #print adding a space between | |
1 2 | |
// | |
print 1,2, #print without newline at end of text | |
// | |
>>>a="ccc" | |
>>>print a.upper() | |
CCC | |
// | |
L1=[2,3,4] #create list (array) | |
print L1[0] #printing element at index=0 | |
// | |
L1[-2] #accessing element but counting from the right | |
// | |
L1[2:] #slicing a list (from index at 2 until the last index) | |
// | |
L1=["c","a","c","a"] #create list with strings | |
// | |
#indexing strings | |
>>>h="hola" | |
>>>print h[0] | |
h | |
or | |
>>>print h[0:2] | |
ho | |
// | |
a="hola" | |
print len(a) #print out length of a | |
// | |
#concatenate | |
a="cac" | |
b="otas" | |
print a+b | |
// | |
#string+int concatenation | |
>>>a=1 | |
>>>b="hola" | |
>>>c=str(a)+b | |
1hola | |
#a second way of doing this concatenation would be to use backticks | |
>>>a=1 | |
>>>b="hola" | |
>>>c=`a`+b | |
// | |
#print a string and an int | |
>>>a=1 | |
>>>print "caramelos=",a | |
// | |
#convert string into integer | |
start="1000" | |
end=int(start)+1 | |
// | |
>>>L1=["c","a","c","a"] | |
>>>print','.join(L1) | |
c,a,c,a | |
// | |
#to replace characters in a string | |
newstringObject=stringObject.replace(old,new) | |
// | |
#removing whitespaces from a string | |
>>> s=" I am learning\tpython" | |
>>> s.replace(' ','') | |
'Iamlearning\tpython' | |
// | |
#replacing each run of whitespace with a single tab (\t); needs 'import re' | |
>>> s="hola adios" | |
>>> re.sub('\s+','\t',s) | |
'hola\tadios' | |
// | |
#using regex when replacing | |
>>>s='100 NORTH 100' | |
>>>re.sub('^100','200',s) | |
'200 NORTH 100' | |
// | |
#delete a fragment of a string | |
import re | |
url = 'abcdc.com' | |
url = re.sub('\.com$', '', url) | |
// | |
#Referencing the stdout: | |
import sys | |
sys.stdout | |
// | |
#print argv passed to the script | |
import sys | |
print sys.argv | |
// | |
print 3 in [1,2,3] #check membership | |
True | |
// | |
for x in [1,2,3]: print x #iterate list | |
// | |
#to iterate over a certain range(from,to,interval) | |
>>>for i in range(0,3,1): | |
>>> print i | |
// | |
#iterate over a list (by index) | |
>>>list=[1,2,3,4,5] | |
>>>for i in range(len(list)): | |
... print list[i] | |
1 2 3 4 5 | |
// | |
# append integer to the beginning of a list | |
>>> a = 5 | |
>>> li = [1, 2, 3] | |
>>> [a] + li # Don't use 'list' as variable name. | |
[5, 1, 2, 3] | |
// | |
#append an elements into a list | |
list=[1,2,3] | |
list.append(4) | |
// | |
#append several elements onto list | |
list=[1,2,3] | |
list.extend([4,5,6]) | |
// | |
#delete and print last element(item) | |
>>>list=[1,2,3] | |
>>>print list.pop() | |
3 | |
// | |
#delete an element in the list | |
>>>list=[1,2,3] | |
>>>del list[0] | |
>>>print list | |
[2, 3] | |
// | |
#reverse a list | |
list.reverse() | |
// | |
#using lists as stacks | |
#a stack is a data structure where the last element added is the first element retrieved("last-in,first-out"): | |
>>>stack=[3,4,5] | |
>>>stack.append(6) | |
>>>stack.append(7) | |
>>> stack | |
[3, 4, 5, 6, 7] | |
>>> stack.pop() | |
7 | |
>>> stack | |
[3, 4, 5, 6] | |
>>> stack.pop() | |
6 | |
>>> stack.pop() | |
5 | |
>>> stack | |
[3, 4] | |
// | |
#Using Lists as Queues | |
You can also use a list conveniently as a queue, where the first element added is the first element retrieved (“first-in, first-out”). To add an item to the back of the queue, use append(). To retrieve an item from the front of the queue, use pop() with 0 as the index. For example: | |
>>> queue = ["Eric", "John", "Michael"] | |
>>> queue.append("Terry") # Terry arrives | |
>>> queue.append("Graham") # Graham arrives | |
>>> queue.pop(0) | |
'Eric' | |
>>> queue.pop(0) | |
'John' | |
>>> queue | |
['Michael', 'Terry', 'Graham'] | |
// | |
#create dictionary (hash) | |
d1={'spam':2,'ham':1,'eggs':3} | |
// | |
#fetch value by key | |
d1['eggs'] | |
// | |
#key membership test | |
>>> d1.has_key('ham') #Python 2 only; in Python 3 (and also valid in Python 2) use: 'ham' in d1 | |
True | |
// | |
#returns list with keys | |
>>> d1.keys() | |
['eggs', 'ham', 'spam'] | |
// | |
#iterating through key/values simultaneously | |
>>>for k,v in d1.items(): | |
>>> print k,v | |
// | |
#create nested dictionary(dictionary of dictionary) | |
>>>d1={} | |
>>>d1["key1"]={ | |
>>> "keyA":0, | |
>>> "keyB":1, | |
>>> "keyC":2, | |
>>> } | |
#print out results | |
>>>for key in d1.keys(): | |
>>> for key1 in d1[key]: | |
>>> print key1 | |
keyC | |
keyB | |
keyA | |
// | |
#iterate over this is sorted order by the key | |
>>> steps = {1:"val1", 5:"val2", 2:"val3"} | |
>>> for key in sorted(steps): | |
... print steps[key] | |
... | |
val1 | |
val3 | |
val2 | |
// | |
#accessing a nested dictionary | |
>>>print d1["key1"]["keyA"] | |
0 | |
// | |
#sorting a dictionary by its keys | |
def sortedDictValues(adict):
    """Return the values of ``adict`` ordered by their keys.

    Parameters
    ----------
    adict : dict
        Dictionary whose keys can be compared with each other.

    Returns
    -------
    list
        The values of ``adict``, in ascending key order.
    """
    # sorted() works on Python 2 and 3; in Python 3 dict.keys() returns a
    # view object that has no .sort() method.
    return [adict[key] for key in sorted(adict)]
// | |
#In Python2.7, an OrderedDict can be used to sort a dictionary by its | |
keys: | |
sd = OrderedDict(sorted(d.items())) #where d is the dictionary to sort | |
// | |
#looping through a sorted dict: | |
python_words = {'list': 'A collection of values that are not connected, but have an order.', | |
'dictionary': 'A collection of key-value pairs.', | |
'function': 'A named set of instructions that defines a set of actions in Python.', | |
} | |
for word in sorted(python_words.keys()): | |
print(word) | |
// | |
#sorting a dictionary by its values | |
>>>d1={ | |
>>> 'a':2, | |
>>> 'b':4, | |
>>> 'c':3, | |
>>> 'd':1 | |
>>> } | |
>>> | |
>>>sortedKeys=sorted(d1.items(), key=lambda kv:(kv[1],kv[0])) #lambda(k,v): only works in Python 2 | |
>>> | |
>>>for thisKey in sortedKeys: | |
>>> print thisKey[0],d1[thisKey[0]] | |
d 1 | |
a 2 | |
c 3 | |
b 4 | |
// | |
#counting characters in a string | |
>>>string="aaabbb" | |
>>>charCount={} | |
>>>for char in string: | |
>>> charCount[char]=charCount.get(char,0)+1 | |
>>>print charCount | |
{'a': 3, 'b': 3} | |
// | |
myfile=open('myfile','w') #open for output (creates) | |
myfile.write('hello text file\n') #write a line of text | |
myfile.close() | |
// | |
>>> myfile=open('myfile','r') #open for input | |
>>> myfile.readline() #read the line back | |
'hello text file\n' | |
// | |
#open a file and throws an error if file does not exist | |
file="filename" | |
#check if file exists | |
if os.path.isfile(file) == False: | |
raise Exception("File does not exist") | |
#getting the dir for a certain file: | |
dir=os.path.dirname(os.path.abspath(file)) | |
#check if dir exists | |
import os | |
os.path.isdir('./dir') | |
// | |
#better way of opening a file (it closes the file when there is an error) | |
#using with | |
with open("x.txt") as f: | |
data = f.read() | |
do something with data | |
/ | |
#or reading line per line | |
with open("x.txt") as f: | |
for line in f: | |
print line, | |
// | |
#os module | |
// | |
*get the file size of a certain file | |
import os | |
os.path.getsize('C:\\Python27\\Lib\\genericpath.py') | |
// | |
#getting the current working directory | |
>>>import os | |
>>> print os.getcwd() | |
/data/scratch/ernesto/454/SCD1/SNPs | |
// | |
#return all files in a directory as a list | |
>>>os.listdir(os.getcwd()) | |
or | |
>>>os.listdir('.') | |
// | |
#getting extension of a file | |
>>>import os | |
>>> os.path.splitext('caca.txt') | |
('caca', '.txt') | |
// | |
>>> filename='/Users/ernesto/projects/IGSR/files/testABC.pdf' | |
>>> os.path.basename(filename).split('.')[0] | |
'testABC' | |
// | |
#splitting an absolute path in /path/ and file name | |
>>>import os | |
>>> os.path.split('/data/genomes/human36/chr1.fa') | |
('/data/genomes/human36', 'chr1.fa') | |
// | |
#globbing or reading files from a directory | |
import glob | |
glob.glob("/data/scratch/ernesto/454/SCD1/againstGenes/snps/*txt") | |
// | |
#globbing | |
import glob | |
for file in glob.glob("*txt"): | |
print file | |
/ | |
# globbing and filtering the resulting files based on a pattern: | |
import glob | |
res = [f for f in glob.glob("*.txt") if "abc" in f or "123" in f or "a1b" in f] | |
for f in res: | |
print(f) | |
// | |
#delete a file | |
os.remove('afile') | |
// | |
#renaming a file | |
os.rename(filename,newfilename) | |
// | |
#if statement | |
>>>x=2; | |
>>>if x==1: | |
>>> print "hello" | |
>>>elif x==2: | |
>>> print "caca" | |
>>>else: | |
>>> print "cacotas" | |
// | |
#negating ifs | |
if not x==1: | |
print "hello" | |
// | |
#while loop | |
>>>a=0; b=10 | |
>>> | |
>>>while a<b: | |
>>> print a, | |
>>> a+=1 | |
0 1 2 3 4 5 6 7 8 9 | |
// | |
*args and **kwargs in function definitions are used for accepting a variable number of positional arguments (collected in a tuple) and of keyword arguments (collected in a dictionary), respectively. So if I had functions like these: | |
def printlist(*args):
    """Print every positional argument on its own line."""
    for x in args:
        print(x)


printlist(1, 2, 3, 4, 5)  # or as many more arguments as I'd like


def printdict(**kwargs):
    """Print the keyword arguments, collected as a dictionary."""
    print(repr(kwargs))


printdict(john=10, jill=12, david=15)
// | |
#using a default value for an argument that can be passed through the | |
**kwargs dictionary | |
def printdict(a,b=5,**kwargs): | |
print("b="+str(b)) | |
printdict(a=1,john=10, d=12, david=15) | |
#will print b=5, but if b is passed, then the new value of b will be printed | |
// | |
#passing a dictionary to a function having **kwargs | |
def printdict(a, b=3, **kwargs):
    """Print ``a``, ``b`` and whatever keyword arguments are left over.

    Parameters
    ----------
    a
        Required argument.
    b
        Optional argument; defaults to 3.
    **kwargs
        Any other keyword arguments; they are printed as a dictionary.
    """
    print("a=" + str(a))
    print("b=" + str(b))
    print(repr(kwargs))


d = {'john': 1, 'b': 2, 'david': 3}
# The 'b' key of d is bound to the parameter b (overriding its default);
# the remaining keys end up in kwargs.
printdict(a=1, **d)
// | |
* setting class attributes dynamically with variable number of arguments and kwargs: | |
class Bar(object): | |
def __init__(self, **kwargs): | |
self.__dict__.update(kwargs) | |
bar = Bar(a=1, b=2) | |
print(bar.b) | |
/ | |
* And if you want to allow only certain attributes: | |
class Bar(object): | |
def __init__(self, **kwargs): | |
allowed_keys = ['a', 'b', 'c'] | |
self.__dict__.update((k, v) for k, v in kwargs.items() if k in allowed_keys) | |
// | |
#passing a dictionary to a function | |
def my_function(city, standard, name, school=None):
    """Print the city; the other arguments are accepted but not used.

    Parameters
    ----------
    city
        Value that gets printed.
    standard, name
        Required, so that the keys of the unpacked dictionary match.
    school
        Optional; may be absent from the dictionary.
    """
    print(city)


data = {'standard': '7', 'name': 'abc', 'city': 'delhi'}
# ** unpacks the dictionary into keyword arguments: every key must be the
# name of a parameter, and every parameter without a default must be present.
my_function(**data)
// | |
#create times function | |
>>>def times(x,y): | |
>>> return x*y | |
>>>product=times(2,4) | |
>>>print product | |
8 | |
// | |
#in python lists,dictionaries and tuples are basic types. So they can be passed as arguments for functions | |
// | |
#checking the type of an object | |
>>>a=1 | |
>>>type(a) | |
<type 'int'> | |
// | |
REGEX | |
\d matches any decimal digit, [0-9] | |
\D matches any non-digit character [^0-9] | |
\s matches any whitespace character [ \t\n\r\f\v] | |
\S matches any non-whitespace character [^ \t\n\r\f\v] | |
\w matches any alphanumeric character [a-zA-Z0-9_] | |
\W matches any non-alphanumeric character [^a-zA-Z0-9_] | |
// | |
Performing Matches: | |
match() #determine if the RE matches at the beginning of the string | |
search() #scan through a string, looking for any location where this RE matches | |
findall() #find all substrings where the RE matches, and returns them as a list | |
finditer() #find all substrings where the RE matches, and return them as an iterator | |
// | |
REGEX | |
>>>import re #module for regex | |
>>>p=re.compile('^>') #compile pattern | |
>>>print p.match("") #if match then returns a match object. if not then returns none | |
None | |
// | |
#open a file and splitting by newlines: | |
with open("out.txt") as f: | |
data = f.read().splitlines() | |
print "h\n" | |
// | |
#open a file and print out all lines starting with a pattern: | |
with open("test.fasta") as f: | |
for line in f: | |
if line.startswith(">"): | |
print line, | |
// | |
#grouping in the REGEX | |
>>>dna="chr14:10000-20000" | |
>>>p=re.compile("(chr\d+):(\d+)-(\d+)") | |
>>>m=p.match(dna) | |
>>>print m.group(0) #returns the whole match | |
>>>chr=m.group(1) | |
>>>start=m.group(2) | |
>>>end=m.group(3) | |
>>>print chr | |
>>>print start | |
>>>print end | |
chr14:10000-20000 | |
chr14 | |
10000 | |
20000 | |
// | |
#matching 5 as | |
a{5} | |
// | |
#parsing a file and skipping lines starting with a certain pattern | |
import re

p = re.compile('^#')  # compile pattern
# 'with' closes the file automatically. Iterating over the file object stops
# at end of file, whereas 'while 1: readline()' loops forever because
# readline() keeps returning '' once the end of the file is reached.
with open('filename', 'r') as infile:
    for line in infile:
        if p.match(line):
            continue  # skip lines starting with '#'
        # process the remaining lines here
// | |
#Create a List that contain each Line of a File | |
List = open("filename.txt").readlines() | |
// | |
#Structure of an exception: | |
try: | |
You do your operations here; | |
...................... | |
except ExceptionI: | |
If there is ExceptionI, then execute this block. | |
except ExceptionII: | |
If there is ExceptionII, then execute this block. | |
...................... | |
else: | |
If there is no exception then execute this block. | |
#Ex: | |
#!/usr/bin/python | |
try: | |
fh = open("testfile", "w") | |
fh.write("This is my test file for exception handling!!") | |
except IOError: | |
print "Error: can\'t find file or read data" | |
else: | |
print "Written content in the file successfully" | |
fh.close() | |
// | |
#List comprehensions: | |
Any 'for' loop that builds a list by appending to it can be expressed with a list comprehension. For example (with pseudocode): | |
new_things = [] | |
for ITEM in old_things: | |
if condition_based_on(ITEM): | |
new_things.append("something with " + ITEM) | |
Can be expressed: | |
new_things = ["something with " + ITEM for ITEM in old_things if condition_based_on(ITEM)] | |
With a real example: | |
numbers = [1, 2, 3, 4, 5] | |
doubled_odds = [] | |
for n in numbers: | |
if n % 2 == 1: | |
doubled_odds.append(n * 2) | |
Can be transformed to: | |
numbers = [1, 2, 3, 4, 5] | |
doubled_odds = [n * 2 for n in numbers if n % 2 == 1] | |
#if the for loop does not have a condition. Then it is even simpler: | |
doubled_numbers = [] | |
for n in numbers: | |
doubled_numbers.append(n * 2) | |
Gets to: | |
doubled_numbers = [n * 2 for n in numbers] | |
#For nested loops (for example, if we want to flatten a matrix): | |
flattened = [] | |
for row in matrix: | |
for n in row: | |
flattened.append(n) | |
Then the comprehension would be: | |
flattened = [n for row in matrix for n in row] | |
// | |
*Using regex on the elements of a list: | |
*Good explanation at: | |
http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/ | |
>>> import re | |
>>> list=['a cat','a dog','a yacht','cats'] | |
>>> regex=re.compile(".*(cat).*") | |
>>> [m.group(0) for l in list for m in [regex.search(l)] if m] | |
['a cat', 'cats'] | |
>>> [m.group(1) for l in list for m in [regex.search(l)] if m] | |
['cat', 'cat'] | |
// | |
# catch all exceptions | |
try: | |
... | |
except: | |
... | |
# catch just one exception | |
try: | |
... | |
except IOError: | |
... | |
# catch one exception, but provide the exception object | |
try: | |
... | |
except IOError, e: | |
... | |
# catch more than one exception | |
try: | |
... | |
except (IOError, ValueError), e: | |
... | |
// | |
#raising exceptions, allows the programmer to force a specified exception to occur. | |
>>>try: | |
>>> raise NameError('HiThere') | |
>>>except NameError: | |
>>> print 'An exception flew by!' | |
>>> raise | |
// | |
*TypeError exceptions: | |
: | |
*We are trying to access a string as it were a dict | |
a="abc" | |
try: | |
value = a['a'] | |
except TypeError,e: | |
print "Not valid",e | |
#e will contain more info in the error cause | |
// | |
>>>a="hola," | |
>>>print a.strip(",") | |
hola | |
// | |
#remove newline character in the right side | |
line=line.rstrip("\n") | |
// | |
#remove all spaces in the right side | |
line=line.rstrip() | |
// | |
#remove all spaces in any of the sides | |
line=line.strip() | |
// | |
while <test1>: | |
<statements1> | |
if<test2>: break #last equivalent | |
if<test3>: continue | |
if<test4>: pass | |
// | |
+= #in-place concatenation for strings and lists (in-place addition for numbers) | |
// | |
#or logical operator | |
if a=="a" or a=="b": | |
// | |
#and logical operator | |
if a=="a" and b=="b": | |
// | |
#increment operator | |
variable+=1 | |
// | |
#map, executes passed in function over each item in a list | |
>>>def inc(x):return x+10 | |
>>>L=map(inc,[1,2,3]) | |
>>>print L | |
// | |
#list comprehension (shortcut to create lists from other lists by specifying a formula to be applied to each element) | |
>>>[x*x for x in [1,2,3]] | |
[1, 4, 9] | |
/ | |
*concat a string to all elements of a list using a comprehension | |
>>>['concat_'+x for x in ['a','b','c']] | |
['concat_a', 'concat_b', 'concat_c'] | |
/ | |
#another example: | |
pow2 = [2 ** x for x in range(10)] | |
// | |
#getting user input | |
name=(raw_input("Como te llamas?")) | |
// | |
#exceptions | |
// | |
#catching an IOError exception | |
try: | |
f=open('normoxia_70bp.aln.nativ','r') | |
except IOError, e: | |
print e | |
// | |
#casting to a int | |
a="1" | |
b=int(a) | |
// | |
#casting into a float | |
a="1.245" | |
b=float(a) | |
// | |
#string formatting | |
>>> exclamation="Ni" | |
>>> "The knights who say %s!" % exclamation | |
'The knights who say Ni!' | |
// | |
"%d %s %d you" % (1,'spam',4) | |
'1 spam 4 you' | |
// | |
*If one want to repeat the same string several times, instead of | |
doing: | |
s='arbit' | |
string='%s hello world %s hello world %s' %(s,s,s) | |
*It is better to use (available from Python 2.6 and Python 3.x:): | |
incoming = 'arbit' | |
result = '{0} hello world {0} hello world {0}'.format(incoming) | |
// | |
#formatting a float (setting the number of decimal places) | |
>>> a=1.23456789 | |
>>> '%.2f' % a | |
'1.23' | |
// | |
#Differences between %s and %r: | |
%s invokes str(), whereas %r invokes repr(). | |
Ex: | |
x = "example" | |
print "My %s"%x | |
My example | |
print "My %r"%x | |
My 'example' | |
// | |
#Another way, using string format method. | |
Printing out a table: | |
table={'hola': 1, 'adios':2, 'bye':3} | |
for name,phone in table.items(): | |
print '{0:10} ==> {1:40d}'.format(name,phone) | |
bye ==> 3 | |
hola ==> 1 | |
adios ==> 2 | |
10 and 40 controls the amount of spaces in each cell, | |
the d is because we are dealing with integers, if we would have | |
strings, we should use 40s | |
// | |
#Calling a function for each object within a list of objects (in this | |
case we invoke the mass function) | |
sumofmass = sum(i.mass for i in objList) | |
// | |
#exiting the script (needs 'import sys') | |
sys.exit() | |
// | |
#making a python script executable | |
#!/usr/bin/python | |
print "KAKOTAS!!!" | |
#chmod +x prueba.py | |
// | |
#issue warnings to STDERR | |
import warnings | |
warnings.warn("Hello") | |
// | |
#process management | |
>>>import subprocess | |
>>> | |
>>>subprocess.Popen(['/bin/echo','hola']) #open a process, args is a list in which list[0] is the command and list[>1] are the command arguments | |
#NOTE. If we open processes in a loop, this subprocess module allows | |
to open several processes concurrently. This functionality may crash | |
the server | |
/ | |
#getting the stdout and stderr | |
from subprocess import Popen, PIPE | |
cmd = 'blah' | |
p = Popen(cmd, stdout=PIPE, stderr=PIPE) | |
stdout, stderr = p.communicate() | |
/ | |
#run ls and store its output | |
import subprocess | |
ls_output = subprocess.check_output(['ls']) | |
#to pass args, #the first elem is the command name and the rest are | |
the args | |
subprocess.check_output(['ls', '-l']) | |
# | |
#this will not work: without shell=True the single list element 'ls -l' is | |
taken as the name of the program to execute (and there is no program called 'ls -l') | |
subprocess.call(['ls -l']) | |
# | |
#this would work: | |
subprocess.call('ls -l', shell=True) | |
// | |
returncode=subprocess.call("ls -l", shell=True) | |
#if returncode=0 then everything went well; any non-zero value means that the command failed | |
// | |
*running a process and accessing the error when something went wrong: | |
import subprocess | |
try: | |
subprocess.check_output("ls -l",shell=True) | |
except subprocess.CalledProcessError as e: | |
print(e.output) | |
// | |
#another way of opening subprocesses is os.system(). It is easier to use than subprocess, but the user does not have the same level of control over the subprocesses. | |
#If you have several arguments to be passed to the process, a list is not needed: the whole command is passed as a single string. The use of os.system() is equivalent to Perl's system() | |
#os.system() waits until the command finishes, so only one process runs at a time | |
>>>import os | |
>>>os.system("blat database file -minIdentity=100 -out=pslx outputname.pslx") | |
// | |
#declaring and initializing several variables at a time | |
(a,b)=('a','b') | |
or | |
(a,b)=('','') | |
// | |
#sort a list | |
>>>array=[2,1,4,3,5,6] | |
>>>array.sort() | |
>>>print array | |
[1, 2, 3, 4, 5, 6] | |
// | |
#getting the index of a certain element in an array | |
["foo", "bar", "baz"].index("bar") | |
1 | |
// | |
>>>4%2 #modulus | |
// | |
#calculating median | |
array = [2, 1, 4, 3, 5, 6]
array.sort()
count = len(array)
median = 0.0
# Floor division keeps the index an int on Python 2 and 3; in Python 3
# count/2 is a float and cannot be used as a list index.
middle = count // 2
if count % 2:
    # odd: the median is the middle element
    median = array[middle]
else:
    # even: the median is the mean of the two middle elements
    median = (float(array[middle - 1]) + float(array[middle])) / 2
print(median)
// | |
#multiline printing | |
>>>print """ | |
>>>First line. | |
>>>Second line. | |
>>>Third line. | |
>>>""" | |
First line. | |
Second line. | |
Third line. | |
// | |
#installing python eggs: | |
sudo easy_install JsonUtils | |
or | |
pip install python_module_name #this will install it in | |
/homes/ernesto/.local/lib/python2.7/site-packages/ | |
/ | |
#knowing where pip installs the modules: | |
>>> import site; site.getsitepackages() | |
/ | |
#Changing the default install location for pip: | |
pip install --install-option="--prefix=/homes/ernesto/" packagename | |
/ | |
#knowing list of installed packages and versions: | |
pip list | |
/ | |
#If you have a program working in a given computer, then get a freeze | |
of all pip modules installed: | |
pip freeze > requirements.txt | |
#then, install all modules required on a new computer using: | |
pip install -r requirements.txt | |
/ | |
#upgrade a given package: | |
pip install modulename --upgrade | |
/ | |
#uninstall a package: | |
pip uninstall packagename | |
/ | |
#installing easy_install locally: | |
wget https://bootstrap.pypa.io/ez_setup.py -O - | python - --user | |
#will install it at: | |
/homes/ernesto/.local/bin/ | |
/ | |
#using easy_install for installing in a given dir: | |
~/.local/bin/easy_install-2.7 --install-dir /homes/ernesto/.local/lib/python2.7/site-packages pip | |
/ | |
*If pip it is not installed, you will need to install it by doing: | |
sudo easy_install pip | |
// | |
#installing package from source | |
#use the option --record files.txt to know what files were created | |
python setup.py install --record files.txt --home=/nfs/research2/flicek/user/ernesto/ | |
// | |
#knowing the location of an installed package: | |
pip show packagename | |
// | |
#classes in Python | |
#!/usr/bin/python | |
class Employee:
    """Common base class for all employees."""

    # Class attribute shared by all instances: number of employees created.
    empCount = 0

    def __init__(self, name, salary):
        """Store the name and salary and increment the shared counter."""
        self.name = name
        self.salary = salary
        Employee.empCount += 1

    def displayCount(self):
        """Print how many Employee objects have been created so far."""
        print("Total Employee %d" % Employee.empCount)

    def displayEmployee(self):
        """Print the name and salary of this employee."""
        print("Name : ", self.name, ", Salary: ", self.salary)


# This would create first object of Employee class
emp1 = Employee("Zara", 2000)
# This would create second object of Employee class
emp2 = Employee("Manni", 5000)
emp1.displayEmployee()
emp2.displayEmployee()
print("Total Employee %d" % Employee.empCount)
// | |
#reading in a Json file | |
import json | |
from pprint import pprint | |
file=open('test.json','r') | |
parsed_json = json.load(file) | |
pprint(parsed_json) | |
// | |
*Creating your own module: | |
Read tutorial at: | |
http://www.tutorialspoint.com/python/python_modules.htm | |
// | |
*Creating your own package: | |
Read at tutorial at: | |
http://www.tutorialspoint.com/python/python_modules.htm | |
// | |
*The dir() function: | |
The dir() built-in function returns a sorted list of strings | |
containing the names defined by a module. | |
For example: | |
# Import built-in module math | |
import math | |
content = dir(math) | |
print content | |
// | |
*Parsing XML | |
One useful module is xmltodict. Read on it at: | |
http://docs.python-guide.org/en/latest/scenarios/xml/ | |
/ | |
#parsing and writing out a xml file: | |
with open('analysis.xml') as fd: | |
doc = xmltodict.parse(fd.read()) | |
doc['ANALYSIS_SET']['ANALYSIS']['ANALYSIS_TYPE']['REFERENCE_ALIGNMENT']['ASSEMBLY']['STANDARD']['@refname']="GRCh37" | |
print (xmltodict.unparse(doc,encoding='utf-8',pretty=True)) | |
#The problem is that it does not respect the element order when unparsing | |
// | |
*Counter in Python: | |
>>> from collections import Counter | |
>>> Counter(['apple','red','apple','red','red','pear']) | |
Counter({'red': 3, 'apple': 2, 'pear': 1}) | |
// | |
#creating datetime objects: | |
>>> import datetime | |
>>> | |
>>> x = datetime.datetime(2020, 5, 17) | |
>>> print(x) | |
2020-05-17 00:00:00 | |
/ | |
#or : | |
>>> x = datetime.datetime(2020, 5, 17,22,30) | |
>>> print(x) | |
2020-05-17 22:30:00 | |
// | |
*Working with dates: | |
from datetime import datetime | |
now = datetime.now() | |
print "Now: ", now | |
print "Today's date: ", now.strftime('%Y-%m-%d') | |
print "year:", now.year | |
print "month:", now.month | |
print "day:", now.day | |
print "hour:", now.hour | |
print "minute:", now.minute | |
print "second:", now.second | |
/ | |
*Converting strings to dates: | |
from datetime import datetime | |
datetime_object = datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p') | |
candle = datetime.strptime("2015-10-24 21:10:05", "%Y-%m-%d %H:%M:%S") | |
/ | |
*datetime in isoformat | |
print datetime(2015, 10, 19, 21, 0).isoformat() #isoformat() has to be called on a datetime instance | |
2015-10-19T21:00:00 | |
/ | |
*Convert string into datetime | |
from datetime import datetime | |
date_object = datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p') | |
/ | |
#dates arithmetics: | |
import datetime | |
d=datetime.datetime.strptime("2016-10-26", "%Y-%m-%d").date() | |
one_day = datetime.timedelta(days=1) | |
yesterday = d - one_day | |
print yesterday | |
/ | |
* Getting days, seconds and hours from a timedelta | |
>>> import datetime | |
>>> x = datetime.datetime(2020, 5, 17,22,30) | |
>>> y = datetime.datetime(2020, 5, 16,22,30) | |
>>> d= x-y | |
>>> d.days | |
Returns 1 (days in this case) | |
If the timedelta is not a whole number of days, then we will get | |
seconds that we will need to convert to hours | |
>>> x = datetime.datetime(2020, 5, 17,22,30) | |
>>> y = datetime.datetime(2020, 5, 16,23,00) | |
>>> d= x-y | |
>>> d.days | |
0 | |
>>> d.seconds | |
84600 | |
>>> d.seconds/3600 | |
23.5 #in hours (in Python 2 write d.seconds/3600.0, otherwise integer division gives 23) | |
/ | |
*another way of dealing with dates: | |
/ | |
*Adding single days: | |
import pandas as pd | |
ic="2016-12-11 22:00:00" | |
D=pd.DateOffset(1) | |
# pd.datetime is an alias for datetime.datetime | |
candle = pd.datetime.strptime(ic, "%Y-%m-%d %H:%M:%S") | |
start=candle-2*D | |
end=candle+2*D | |
// | |
*Knowing the weekday: | |
datetime.datetime.today().weekday() | |
// | |
*subtracting 2 times (difference between 2 times of the same day): | |
# Create datetime objects for each time (a and b) | |
dateTimeA = datetime.datetime.combine(datetime.date.today(), a) | |
dateTimeB = datetime.datetime.combine(datetime.date.today(), b) | |
# Get the difference between datetimes (as timedelta) | |
dateTimeDifference = dateTimeA - dateTimeB | |
# Divide difference in seconds by number of seconds in hour (3600) | |
dateTimeDifferenceInHours = dateTimeDifference.total_seconds() / 3600 | |
// | |
import pandas as pd | |
# BDay is business day, not birthday... | |
from pandas.tseries.offsets import BDay | |
# pd.datetime is an alias for datetime.datetime | |
today = pd.datetime.today() | |
print today - BDay(4) | |
/ | |
* creating times | |
from datetime import time | |
# time(hour = 0, minute = 0, second = 0) | |
a = time() | |
print("a =", a) | |
# time(hour, minute and second) | |
b = time(11, 34, 56) | |
print("b =", b) | |
# time(hour, minute and second) | |
c = time(hour = 11, minute = 34, second = 56) | |
print("c =", c) | |
# time(hour, minute, second, microsecond) | |
d = time(11, 34, 56, 234566) | |
print("d =", d) | |
/ | |
*Creating dates/times ranges: | |
*Hourly | |
import pandas as pd | |
ic="2016-12-11 22:00:00" | |
rng = pd.date_range('1/1/2011', periods=72, freq='H') | |
print rng[:5] | |
/ | |
*Iterating over time_ranges: | |
for d in pd.date_range(start='2016-12-09',end='2016-12-15'): | |
print d | |
// | |
*Initializing a time object with 0:00:00 | |
import datetime | |
t = datetime.time(0, 0, 0) # (hours,minutes,seconds) | |
// | |
*getting the current time and date: | |
now = datetime.datetime.now() | |
/ | |
#comparing 2 times: | |
import datetime | |
d.time()<datetime.time(22, 0, 0) #where d is a datetime object | |
// | |
#comparing 2 dates: | |
if object.date()==datetime.date(2016,12,26): | |
// | |
#comparing 2 dicts in python 3: | |
d_1 = {'peter': 1, 'adam': 2, 'david': 3} | |
d_3 = {'peter': 1, 'adam': 2, 'david': 5} | |
print(set(d_1.keys()) == set(d_3.keys())) | |
print(set(d_1.values()) == set(d_3.values())) | |
// | |
*Connecting to mysql from Python. | |
Good tutorial at: | |
https://www.tutorialspoint.com/python/python_database_access.htm | |
/ | |
#SELECT query: | |
import datetime | |
import mysql.connector | |
cnx = mysql.connector.connect(user='scott', database='employees') | |
cursor = cnx.cursor() | |
query = ("SELECT first_name, last_name, hire_date FROM employees " | |
"WHERE hire_date BETWEEN %s AND %s") | |
hire_start = datetime.date(1999, 1, 1) | |
hire_end = datetime.date(1999, 12, 31) | |
cursor.execute(query, (hire_start, hire_end)) | |
#to know how many results the query returned, use: | |
res_count=cursor.rowcount | |
for (first_name, last_name, hire_date) in cursor: | |
print("{}, {} was hired on {:%d %b %Y}".format( | |
last_name, first_name, hire_date)) | |
cursor.close() | |
cnx.close() | |
#getting the results from db: | |
data = cursor.fetchall() #returns an array of arrays | |
for row in data : | |
do stuff | |
/ | |
#another way (less typing): | |
other_ids = [row[0] for row in cursor.fetchall()] | |
/ | |
PyMySQL | |
/ | |
#getting results as a dictionary: | |
cursor = conn.cursor(MySQLdb.cursors.DictCursor) | |
cursor.execute("SELECT name, category FROM animal") | |
result_set = cursor.fetchall() | |
for row in result_set: | |
print "%s, %s" % (row["name"], row["category"]) | |
/ | |
#connect mysql: | |
db = | |
MySQLdb.connect(host="mysql-g1kdcc-public",user="g1krw",passwd=????,port=4197,db="g1k_archive_staging_track" | |
) | |
#SQL statement: | |
db = MySQLdb.connect(host=args.host, user=args.user, passwd=args.pwd, port=args.port, db=args.db)
cursor = db.cursor()
# Let the driver quote the values instead of building the SQL with Python
# string formatting: this avoids SQL injection and broken queries when a value
# contains a quote. With MySQLdb the placeholder is always %s, also for numbers.
sql_string = ("INSERT INTO file (file_id, name, md5,type,size,host_id,withdrawn,created) "
              "VALUES (NULL, %s, %s, %s, %s,1,0,NOW())")
try:
    # Execute the SQL command
    cursor.execute(sql_string, (path, md5sum, args.file_type, size))
    # Commit your changes in the database
    db.commit()
except MySQLdb.Error as e:
    # e.args holds (error code, error message)
    print(e.args[0], e.args[1])
    # Rollback in case there is any error
    db.rollback()
* If you want to get the id of the last inserted row, use (it is an attribute, not a method): | |
cursor.lastrowid | |
* If exception, then print the actual query used: | |
cursor._last_executed | |
// | |
#parsing a sam file: | |
import pysam | |
samfile = pysam.AlignmentFile("ex1.bam", "rb") | |
for read in samfile.fetch('chr1', 100, 120): | |
print read | |
samfile.close() | |
// | |
#getting the size of a file | |
import os | |
os.path.getsize(path) | |
// | |
#assert function | |
test a condition, and trigger an error if the condition is false. | |
For example: | |
i=5 | |
assert i>2 | |
#will not raise anything, | |
But if we do: | |
i=1 | |
assert i>2 | |
We get: | |
Traceback (most recent call last): | |
File "test.py", line 3, in <module> | |
assert i>2 | |
AssertionError | |
// | |
#assert on a single line for testing a condition | |
>>> a=2 | |
>>> assert a==1, "a variable is not 1" | |
// | |
# assert a datetime: | |
assert slist.start() == datetime.datetime(2019, 3, 10, 21, 0) | |
// | |
#knowing the environment within a function and outside the function: | |
a_global="hole" | |
def foo(): | |
print locals() | |
print foo() | |
print globals() | |
#And the output you get: | |
{} | |
None | |
{'a_global': 'hole', '__builtins__': <module '__builtin__' | |
(built-in)>, '__file__': 'test.py', '__package__': None, '__name__': | |
'__main__', 'foo': <function foo at 0x10e60a320>, '__doc__': None} | |
Which shows that the local environment within foo() is empty and the | |
global environment contains the variable declared that is named | |
'a_global' | |
// | |
#modifying the contents of PATH to a certain value | |
dict(os.environ,PATH="/homes/ernesto/lib") | |
// | |
#passing the desired PYTHONPATH at runtime: | |
PYTHONPATH=/path/to/ python script.py | |
// | |
*duck typing: | |
The idea is that it doesn't actually matter what type my data is - just | |
whether or not I can do what I want with it. | |
// | |
*In Python, everything is an object. | |
So strings can be used as arrays, because strings are objects that | |
contain the __getitem__ method | |
str="abc" | |
print str[0] *will print a | |
print str[1] * will print b | |
// | |
#Parameter checking in Python: | |
1 from types import * | |
2 class MyDB: | |
3 ... | |
4 def add(self, id, name): | |
5 assert type(id) is IntType, "id is not an integer: %r" % id | |
6 assert type(name) is StringType, "name is not a string: %r" | |
% name | |
#Check if something is a list or a string: | |
#In Python3: | |
lst=[1,2,3] | |
assert not isinstance(lst, str) | |
#In Python2 | |
assert not isinstance(lst, basestring) | |
// | |
# Check if something is either a float or int | |
a='s' | |
if not isinstance(a, (int, float)): | |
print("a is neither int nor float") | |
// | |
#checking if something is a list: | |
if not isinstance(objs,list): | |
print "h" | |
// | |
#checking if something is a number | |
import numbers | |
isinstance('a', numbers.Number) | |
>>> False | |
// | |
*merging (concatenate) 2 lists: | |
>>> a=[1,2,3,4] | |
>>> b=[2,3,4,5] | |
>>> a+b | |
[1, 2, 3, 4, 2, 3, 4, 5] | |
// | |
*zip It is a built-in function to merge together 2 lists in the | |
following way: | |
>>> names = ['Bob','Jessica','Mary','John','Mel'] | |
>>> births = [968, 155, 77, 578, 973] | |
>>> zip(names,births) | |
[('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', | |
973)] | |
// | |
*Pandas: It is a module used for data analysis. | |
/ | |
*First, creating a data.frame from a certain list: | |
from pandas import DataFrame, read_csv | |
/ | |
*Panda's data structures: | |
**Series: | |
One-dimensional labeled array capable of holding any data | |
type(integers,strings, objects, etc.) | |
* iterating over Series: | |
for i, value in df['column'].items(): | |
print(i, value) | |
*Initializing: | |
s = pd.Series(data, index=index) | |
data can be: | |
a Python dict | |
an ndarray | |
a scalar value (like 5) | |
*Initializing from an ndarray: | |
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) | |
*Initializing from a dict: | |
d = {'a' : 0., 'b' : 1., 'c' : 2.} | |
pd.Series(d) | |
// | |
#initialize dict with keys,values from two lists | |
keys = ['a','b','c','d'] | |
values = [1,2,3,4] | |
d = dict(zip(keys, values)) | |
// | |
*Accessing elements: | |
d['a'] | |
*Operations on series: | |
In [43]: s[s>1] | |
c 2.0 | |
dtype: float64 | |
In [43]: s.mean() | |
Out[43]: 1.0 | |
*They behave like dicts: | |
s.keys() | |
*Getting unique values from panda series: | |
s.unique() | |
// | |
*removing characters from strings in a list | |
lst = [("aaaa8"),("bb8"),("ccc8"),("dddddd8")] | |
print([s.strip('8') for s in lst]) # remove the 8 from the string borders | |
print([s.replace('8', '') for s in lst]) # remove all the 8s | |
// | |
* Useful snippets on dataframes: | |
https://jeffdelaney.me/blog/useful-snippets-in-pandas/ | |
// | |
# creating a random dataframe | |
df = pandas.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) | |
// | |
* exploring the dataframe | |
df.shape # number of rows/columns in a tuple | |
// | |
#creating a dataframe from numpy arrays: | |
data = np.array([['','Col1','Col2'], | |
['Row1',1,2], | |
['Row2',3,4]]) | |
print(pd.DataFrame(data=data[1:,1:], | |
index=data[1:,0], | |
columns=data[0,1:])) | |
And this will produce: | |
<script.py> output: | |
Col1 Col2 | |
Row1 1 2 | |
Row2 3 4 | |
// | |
* using apply to apply a function over the column of a dataframe: | |
df['A']=df['A'].apply(lambda x:x+1) | |
// | |
* Filter a dataframe based on the outcome of a function applied on a certain column | |
In [3]: df = pandas.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) | |
In [4]: df | |
Out[4]: | |
a b c | |
0 -0.001968 -1.877945 -1.515674 | |
1 -0.540628 0.793913 -0.983315 | |
2 -1.313574 1.946410 0.826350 | |
3 0.015763 -0.267860 -2.228350 | |
4 0.563111 1.195459 0.343168 | |
In [6]: df[df.apply(lambda x: x['b'] > x['c'], axis=1)] | |
Out[6]: | |
a b c | |
1 -0.540628 0.793913 -0.983315 | |
2 -1.313574 1.946410 0.826350 | |
3 0.015763 -0.267860 -2.228350 | |
4 0.563111 1.195459 0.343168 | |
// | |
* Difference between map, apply and applymap | |
Map: It iterates over each element of a series. | |
df['column1'].map(lambda x: 10+x), this will add 10 to each element of column1. | |
df['column2'].map(lambda x: 'AV'+x), this will concatenate "AV" at the beginning of each element of column2 (column format is string). | |
Apply: As the name suggests, applies a function along any axis of the DataFrame. | |
df[['column1','column2']].apply(sum), it will return the sum of all the values of column1 and of column2 (one sum per column). | |
ApplyMap: This helps to apply a function to each element of dataframe. | |
func = lambda x: x+2 | |
df.applymap(func), it will add 2 to each element of dataframe (all columns of dataframe must be numeric type) | |
// | |
# The initial set of baby names and birth rates | |
names = ['Bob','Jessica','Mary','John','Mel'] | |
births = [968, 155, 77, 578, 973] | |
BabyDataSet = list(zip(names,births)) | |
df = DataFrame(data = BabyDataSet, columns=['Names', 'Births']) | |
#selecting the first column of a dataframe: | |
df[[0]] | |
*adding a new column to a dataframe | |
df['newcol']=toadd | |
*peeking the dataframe: | |
df.head() | |
*Getting a basic summary on a quantitative variable: | |
df['births'].describe() | |
*Getting the sum of a column: | |
Total = df['MyColumn'].sum() | |
print (Total) | |
319 | |
*Getting the mean of a column: | |
df['births'].mean() | |
*Median | |
df['births'].median() | |
*10th percentile | |
df['births'].quantile(0.1) | |
*90th percentile | |
df['births'].quantile(0.9) | |
# Now, export the data.frame to a csv file | |
df.to_csv('births1880.csv',index=False,header=False) | |
# read-in the data frame from csv file: | |
df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv') | |
# read-in the data frame from csv file specifying that the first row is the header | |
df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',header=0) | |
# read-in the data.frame from csv file adding column names, and | |
therefore an index | |
df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',names=['col1','col2','col3']) | |
# read-in the data frame from tsv file: | |
df=pd.read_csv('/Users/ernesto/projects/IGSR/16_12_16/cov_DF.txt', sep='\t', index_col=0) # DataFrame.from_csv was removed in pandas 1.0 | |
#read-in the data frame from file and skipping comments: | |
df = pd.read_csv("DF.txt",comment='#') | |
#read-in the data frame and specifying that 2 columns are dates | |
df = pd.read_csv('pizza.csv', parse_dates=['dates']) | |
#read-in the data frame and use only some columns | |
df = pd.read_csv('pizza.csv', usecols=['foo', 'bar']) | |
# if first column is not picked, try with index_col=False | |
DF=pd.read_csv('/Users/ernesto/projects/IGSR/16_12_16/cov_DF.txt', sep='\t',index_col=False) # DataFrame.from_csv was removed in pandas 1.0 | |
#read in the data frame without header | |
df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',header=None) | |
#knowing the data types in a dataframe | |
df.dtypes | |
#checking if a data.frame is empty: | |
if df.empty: | |
do something | |
#checking if a list is empty: | |
if not a: | |
print("List is empty") | |
#renaming column names in pandas data.frame: | |
df.columns = ['a', 'b'] | |
#pretty print a data.frame | |
from tabulate import tabulate | |
import pandas as pd | |
df = pd.DataFrame({'col_two' : [0.0001, 1e-005 , 1e-006, 1e-007], | |
'column_3' : ['ABCD', 'ABCD', 'long string', | |
'ABCD']}) | |
print tabulate(df, headers='keys', tablefmt='psql') | |
+----+-----------+-------------+ | |
| | col_two | column_3 | | |
|----+-----------+-------------| | |
| 0 | 0.0001 | ABCD | | |
| 1 | 1e-05 | ABCD | | |
| 2 | 1e-06 | long string | | |
| 3 | 1e-07 | ABCD | | |
+----+-----------+-------------+ | |
#copy a data.frame | |
surveys_copy = surveys_df.copy() | |
#slice a data.frame via indexes: | |
surveys_df.iloc[0:3, 1:4] #select the first 3 rows and columns 1 to 3 (the end of a slice is excluded) | |
#slice columns 1 to 3 and all the rows: | |
surveys_df.iloc[:, 1:4] | |
#accessing a cell | |
surveys_df.iloc[1, 2] | |
#slice and select different non-consecutive columns | |
surveys_df.iloc[:, [0,1,4]] | |
#accessing the actual value for a certain cell i | |
p df['colname'].item() | |
#iterating over a data.frame | |
for index, row in data1.iterrows(): | |
print(row['col1']) | |
#changing the data types within a dataframe | |
df=df.astype(int) | |
>>> df | |
0 1 | |
0 Bob 968 | |
1 Jessica 155 | |
2 Mary 77 | |
3 John 578 | |
4 Mel 973 | |
#to select rows whose column value equals a scalar, some_value, use ==: | |
df.loc[df['column_name'] == some_value] | |
#to select applying more than one condition: | |
df1 = df.loc[(df.a != -1) & (df.b != -1)] | |
#selecting all rows that have null values for a certain column: | |
df.loc[df['colname'].isnull()] | |
#Getting the columns names from a data.frame: | |
DF.columns | |
And then we access the first column: | |
df[0] | |
# printing the 3 first rows of the data frame | |
df[:3] | |
# Selecting by more than 1 column: | |
df[['col1','col2']] | |
*counting the values of one column if the variable is categorical: | |
df['col1'].value_counts() | |
/ | |
#filling the NA values in a dataframe | |
DF.fillna(value=0) | |
/ | |
#plotting a data.frame, one column versus the other: | |
df.plot(x='col1',y='col2') | |
/ | |
#getting the rownames of a dataframe in list format: | |
list(df.index) | |
/ | |
#drawing a barplot | |
df.plot(kind='bar') | |
/ | |
# Applying a function over a dataframe, see tutorial: | |
https://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html | |
/ | |
*Counting the occurrences of one variable by the occurrence of other, | |
(similar to R's table function): | |
print pd.crosstab(df['admit'], df['prestige'], rownames=['admit']) | |
prestige 1 2 3 4 | |
admit | |
0 28 97 93 55 | |
1 33 54 28 12 | |
/ | |
*With only one column: | |
pd.crosstab(index=df['instrument'], columns="count") | |
col_0 count | |
instrument | |
AUD_USD 33 | |
EUR_USD 35 | |
USD_CAD 14 | |
/ | |
*How to make a pandas crosstab with percentages? | |
pd.crosstab(df.A, df.B).apply(lambda r: r/r.sum(), axis=1) | |
/ | |
#crosstab plus plot | |
carat_table = pd.crosstab(index=diamonds["clarity"], columns="count") | |
carat_table.plot(kind="bar", | |
figsize=(8,8)) | |
/ | |
*Return evenly spaced numbers over a specified interval | |
from numpy import linspace | |
x = linspace(-5,5,100) | |
/ | |
*Pandas conditional creation of a dataframe based on the value of one | |
column | |
import pandas as pd | |
import numpy as np | |
df=pd.DataFrame({'Type':list('ABBC'), 'Set':list('ZZXY')}) | |
df['color']=np.where(df['Set']=='Z', 'green', 'red') | |
print(df) | |
/ | |
import numpy as np | |
normally_distributed = np.random.normal(size=10000) # Generate normal data* | |
/ | |
*Pandas Dataframe columns are a Pandas Series when you pull them out, | |
which you can then call .tolist() on to turn them into a python list | |
dfList = df['one'].tolist() | |
/ | |
#preventing windows from popping up when plotting: | |
import matplotlib | |
matplotlib.use("Agg") | |
import matplotlib.pyplot as plt | |
/ | |
#creating a boxplot from quantitative variable using matplotlib: | |
import matplotlib.pyplot as plt | |
import numpy as np | |
# basic plot | |
plt.boxplot(data, labels=['set1','set2']) | |
#labels=adding labels to each set | |
/ | |
#saving boxplot to file | |
import matplotlib.pyplot as plt | |
import numpy as np | |
# basic plot | |
plt.boxplot(data) | |
plt.savefig('/Users/ernesto/projects/IGSR/18_01_17/asdf.pdf',format='pdf') | |
/ | |
#setting the axis labels and title | |
plt.xlabel("x axis", size=14) #and size also | |
plt.ylabel("y axis") | |
plt.title("caca", size=20) | |
/ | |
#setting the size of the tick labels: | |
plt.tick_params(labelsize=20) | |
/ | |
#creating a boxplot from a data.frame | |
df.boxplot() | |
/ | |
# calculating the IQR (interquartile range) and whiskers and median in a boxplot | |
import numpy as np | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
data = np.random.rand(100) | |
plt.boxplot(data) | |
median = np.median(data) | |
upper_quartile = np.percentile(data, 75) | |
lower_quartile = np.percentile(data, 25) | |
iqr = upper_quartile - lower_quartile | |
upper_whisker = data[data<=upper_quartile+1.5*iqr].max() | |
lower_whisker = data[data>=lower_quartile-1.5*iqr].min() | |
/ | |
* plotting a categorical variable (x variable) against a dependent variable (y variable): | |
See: | |
https://seaborn.pydata.org/tutorial/categorical.html | |
/ | |
#rotating the X-labels: | |
df.plot(rot=90) | |
/ | |
#changing the aspect of a boxplot | |
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray") | |
ax=df.plot.box(grid=True,return_type='axes',color=props, patch_artist=True) | |
/ | |
#setting the color | |
df.plot(kind='bar',color="red") | |
/ | |
#applying an y limit | |
df.plot(ylim=[0,50]) | |
// | |
#setting the figure size | |
df.plot(figsize=[10,10]) | |
// | |
#creating a plot from dataframe and saving | |
ax = df.plot() | |
fig = ax.get_figure() | |
fig.savefig('asdf.png') | |
fig.savefig('/Users/ernesto/projects/IGSR/files/asdf.pdf',format='pdf') | |
#saving in pdf format | |
/ | |
Setting the xticks labels: | |
ax=subDF.plot() | |
ax.set_xticklabels(['a','b','c']) | |
/ | |
#reducing the number of x axis ticks and labels to a certain frequency (setting only | |
every n tick) | |
n = 10 | |
ax = df.plot() | |
ticks = ax.xaxis.get_ticklocs() | |
ticklabels = [l.get_text() for l in ax.xaxis.get_ticklabels()] | |
ax.xaxis.set_ticks(ticks[::n]) | |
ax.xaxis.set_ticklabels(ticklabels[::n]) | |
ax.figure.show() | |
// | |
*How to display all label values | |
ax=DF.plot() | |
ax.set_xticks(np.arange(len(DF.index))) | |
ax.set_xticklabels(DF.index) | |
// | |
*Creating a composed plot | |
import matplotlib.pyplot as plt | |
plt.figure(1) # the first figure | |
plt.subplot(311) # the first subplot in the first figure | |
plt.plot([1, 2, 3]) | |
plt.subplot(312) # the second subplot in the first figure | |
in a new row | |
plt.plot([4, 5, 6]) | |
plt.subplot(313) # the third subplot in the first figure | |
in a new row | |
plt.plot([7, 8, 9]) | |
*Where subplot(3,1,3) is nrow,ncol,fignum. The maximum value in fignum | |
will depend on nrow*ncol | |
// | |
#useful reading on groupby and applying operations on groups: | |
https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_groups/ | |
// | |
*Group data in the DataFrame by a certain column: | |
bytreatment = data.groupby('Treatment') | |
*Then, print descriptive stats for each of the values in Treatment: | |
>>>bytreatment['RelativeFitness'].describe() | |
Treatment | |
Dish count 32.000000 | |
mean 1.456359 | |
std 0.184792 | |
min 0.955221 | |
25% 1.429005 | |
50% 1.510884 | |
75% 1.581340 | |
max 1.699276 | |
Tube count 32.000000 | |
mean 0.929589 | |
std 0.050153 | |
min 0.795107 | |
25% 0.915050 | |
50% 0.939089 | |
75% 0.953505 | |
max 1.000363 | |
dtype: float64 | |
/ | |
*Mean for each group: | |
>>>bytreatment['RelativeFitness'].mean() | |
Treatment | |
Dish 1.456359 | |
Tube 0.929589 | |
Name: RelativeFitness, dtype: float64 | |
/ | |
*Aggregating and applying different numpy functions: | |
bytreatment['RelativeFitness'].agg([np.mean,np.std,len,np.sum]) | |
/ | |
*Print a groupby dataframe: | |
import pandas as pd | |
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three', 'three', 'one'], 'B': range(6)}) | |
grouped_df = df.groupby('A') | |
for key, item in grouped_df: | |
print(grouped_df.get_group(key)) | |
// | |
* Plotting with matplotlib | |
import numpy as np | |
import matplotlib.pyplot as plt | |
*create an array | |
x=np.linspace(0,5,10) | |
y=x*x | |
fig,ax=plt.subplots() | |
ax.plot(x,y) | |
plt.show() | |
// | |
#NumPY: | |
#Create a NumPy array: | |
import numpy as np | |
import pandas as pd | |
data=np.array([['','Col1','Col2'],['Row1',1,2],['Row2',3,4]]) | |
>>>data[1:,] #print from row 1 till the end: | |
array([['Row1', '1', '2'], | |
['Row2', '3', '4']], | |
dtype='|S4') | |
>>>data[1,1] | |
'1' | |
// | |
#reading a np array from file: | |
good_set=np.fromfile("file.txt",dtype=float,sep="\n") | |
// | |
#calculating the mean on a numpy array: | |
np.mean(array) | |
// | |
#checking if elements within array are greater than some value | |
np.where(data>10) | |
// | |
# Transforming a dataset (logarithmic) | |
np.log(data) | |
// | |
# Transforming a dataset (square root) | |
np.sqrt(data) | |
// | |
#calculating max and min in an array: | |
np.amax(data) | |
np.amin(data) | |
// | |
#converting a numpy array to list: | |
data.tolist() | |
// | |
#How to append elements to a numpy array | |
A = np.array([]) | |
for row in matrix: | |
A = np.append(A, row) | |
// | |
#creating now a DataFrame from the Numpy array, | |
df=pd.DataFrame(data=data[1:,1:],index=data[1:,0],columns=data[0,1:]) | |
// | |
#Using Jupyter. | |
If you want to use inline matplotlib plots in this platform, use: | |
%matplotlib inline | |
// | |
#selecting several cells | |
shift+j | |
// | |
#create virtual environment for a project: | |
cd my_project_folder | |
$ virtualenv venv #where venv is the virtual env name | |
#then, start using it: | |
source venv/bin/activate | |
#when you are done: | |
deactivate | |
#once the env is active, to list the modules that are local (in the | |
env) only: | |
pip list --local | |
#Anaconda: | |
#To install a package | |
conda install packagename | |
#To install a package from a certain channel | |
conda install -c bioconda pybedtools | |
#We can upgrade packages: | |
conda update numpy | |
#Verify environment we are right now: | |
conda info --envs | |
#create a new environment with some packages installed | |
conda create --name bamqc numpy pandas | |
#activate the created environment | |
source activate bamqc | |
#deactivate the environment | |
source deactivate | |
#installing a new environment with a new python version: | |
conda create --name blahblah python=2.7 | |
#installing a new version with all python packages included in anaconda | |
conda create -n python2.7_env python=2.7 anaconda | |
#list all envs | |
conda env list | |
#remove a given env | |
conda remove --name bamqc --all | |
// | |
#correcting indentation errors, use python ~/bin/Python-2.7.12/Tools/scripts/reindent.py | |
// | |
# round all elements in a list | |
alist = [0.30000000000000004, 0.5, 0.20000000000000001] | |
my_rounded_list = [ round(elem, 2) for elem in alist ] | |
Will return: | |
[0.3, 0.5, 0.2] | |
// | |
#rounding to the nearest 10: | |
import math | |
def roundup(x, base=10):
    """Round ``x`` up to the next multiple of ``base``.

    The value is always rounded towards +infinity (ceiling), so
    ``roundup(11)`` is 20 and ``roundup(10)`` stays 10.

    Parameters
    ----------
    x : int or float
        Number to round.
    base : int
        Multiple to round up to (10 by default).

    Returns
    -------
    int
        Smallest multiple of ``base`` that is >= ``x``.
    """
    # float() forces true division in python 2 as well
    return int(math.ceil(x / float(base))) * base
# iterating through object attributes: | |
for attr, value in anobject.__dict__.iteritems(): | |
print attr, value | |
#ternary operator | |
value_when_true if condition else value_when_false | |
#For example: | |
'Yes' if fruit == 'Apple' else 'No' | |
*Enumerations in Python: | |
>>> from enum import Enum | |
>>> class Color(Enum): | |
... red = 1 | |
... green = 2 | |
... blue = 3 | |
... | |
*Enumeration members have human readable string representations: | |
>>> print(Color.red) | |
Color.red | |
*Enum members also have a property that contains just their item name: | |
>>> print(Color.red.name) | |
red | |
*Enumerations support iteration, in definition order: | |
>>> class Shake(Enum): | |
... vanilla = 7 | |
... chocolate = 4 | |
... cookies = 9 | |
... mint = 3 | |
... | |
>>> for shake in Shake: | |
... print(shake) | |
... | |
Shake.vanilla | |
Shake.chocolate | |
Shake.cookies | |
Shake.mint | |
Enumeration members are hashable, so they can be used in dictionaries and sets: | |
>>> apples = {} | |
>>> apples[Color.red] = 'red delicious' | |
>>> apples[Color.green] = 'granny smith' | |
>>> apples == {Color.red: 'red delicious', Color.green: 'granny smith'} | |
True | |
// | |
#grep on the elements of a list: | |
>>> names = ['aet2000','ppt2000', 'aet2001', 'ppt2001'] | |
>>> filter(lambda x:'aet' in x, names) | |
['aet2000', 'aet2001'] # in python 2, | |
#In python 3: | |
list(filter(lambda x:'aet' in x, names)) | |
// | |
#function to check if a string represents a number: | |
def is_number(s):
    """Return True if ``s`` can be converted to a float, False otherwise.

    Anything accepted by ``float()`` counts as a number, including
    strings such as "1e3", "nan" and "inf".

    Parameters
    ----------
    s : object
        Usually a string; other types are accepted as well.

    Returns
    -------
    bool
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: e.g. None or a list
        return False
    return True
// | |
#Creating an iterator object | |
class Fib:
    """Iterator over the Fibonacci numbers that are <= ``max``.

    Parameters
    ----------
    max : int
        Upper bound (inclusive) for the generated values.
    """

    def __init__(self, max):
        self.max = max
        # initialise the state here too, so next() works before iter()
        self.a = 0
        self.b = 1

    def __iter__(self):
        # restart the sequence every time a new iteration begins
        self.a = 0
        self.b = 1
        return self

    def __next__(self):
        fib = self.a
        if fib > self.max:
            # signals the end of the iteration to the for loop
            raise StopIteration
        self.a, self.b = self.b, self.a + self.b
        return fib
>>> from fibonacci2 import Fib | |
>>> for n in Fib(1000): | |
... print(n, end=' ') | |
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 | |
// | |
*Containers in python: | |
Containers are any object that holds an arbitrary number of other objects. Generally, containers provide a way to access the contained objects and to iterate over them. | |
Examples of containers include tuple, list, set, dict; these are the built-in containers. More container types are available in the collections module. | |
// | |
*Getting the absolute value of a number | |
abs(a) | |
// | |
#glob in python | |
>>> import glob | |
>>> glob.glob('*.pl') | |
or : | |
>>> glob.glob("/path/to/file/TSI*") | |
// | |
#glob and sort | |
for file in sorted(glob.glob("*.fastq*")): | |
print(file) | |
// | |
#iterating over 2 lists at the same time: | |
a=[1,2,3] | |
b=[2,4,6] | |
for i, j in zip(a, b): | |
print(i,j) | |
// | |
#opening a gzipped file: | |
import gzip | |
with gzip.open('ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.hg38.autosomes.maf0.01_call_rate_0.95.recoded.vcf.gz','r') as fin: | |
for line in fin: | |
print('got line', line) | |
#in this case, the line returned is in bytes format | |
#opening now and returning a text: | |
with gzip.open(sys.argv[1], 'rt') as f: | |
for line in f: | |
if line.startswith("#CHROM"): | |
print(line) | |
#note that in 'rt' (text) mode each line is a str, so no b prefix is needed; | |
#use b"#CHROM" only when the file is opened in binary mode ('r' or 'rb') | |
/ | |
#converting a bytes object into str: | |
>>> b"abcde" | |
b'abcde' | |
# utf-8 is used here because it is a very common encoding, but you | |
# need to use the encoding your data is actually in. | |
>>> b"abcde".decode("utf-8") | |
'abcde' | |
/ | |
#encoding to bytes a certain str | |
b = mystring.encode('utf-8') | |
/ | |
#create a compressed gzip file | |
import gzip | |
content = b"Lots of content here" # bytes are required because the file is opened in binary mode ('wb') | |
with gzip.open('file.txt.gz', 'wb') as f: | |
f.write(content) | |
// | |
#logging: | |
A good tutorial at http://www.blog.pythonlibrary.org/2012/08/02/python-101-an-intro-to-logging/ | |
// | |
#logging with format: | |
logging.basicConfig(filename="sample.log", filemode="w", level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') | |
// | |
*Enumeration in Python | |
from enum import Enum | |
class Color(Enum): | |
red = 1 | |
green = 2 | |
blue = 3 | |
>>>print(Color.red.name) | |
red | |
>>>print(Color.red.value) | |
1 | |
# supports iteration | |
for color in Color: | |
print color.value | |
// | |
*group a list into inclusive sequential n-tuples | |
>>> lst = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] | |
>>> zip(lst, lst[1:]) | |
[('A', 'B'), ('B', 'C'), ('C', 'D'), ('D', 'E'), ('E', 'F'), ('F', 'G')] | |
// | |
#iterating and printing all instances in a class | |
class A(object):
    """Toy class with two instance attributes, used to show attribute iteration."""

    def __init__(self):
        self.myinstatt1 = 'one'
        self.myinstatt2 = 'two'

    def mymethod(self):
        pass


a = A()
# vars(a) is a.__dict__; .items() works in python 2 and python 3
# (the python 2 only .iteritems() was removed in python 3)
for attr, value in vars(a).items():
    print("%s %s" % (attr, value))
// | |
*checking if an element is on a list: | |
>>> a_list=['a','b','c','d'] | |
>>> 'c' in a_list | |
True | |
// | |
*How to get list index and element simultaneously: | |
for k,i in enumerate(mylist): | |
#do something with index k | |
#do something with element i | |
#creating venn diagrams: | |
#2way | |
from matplotlib import pyplot as plt | |
from matplotlib_venn import venn2, venn2_circles | |
set1 = set(['A', 'B', 'C', 'D']) | |
set2 = set(['B', 'C', 'D', 'E']) | |
venn2([set1, set2], ('Set1', 'Set2')) | |
#3way | |
from matplotlib import pyplot as plt | |
from matplotlib_venn import venn3, venn3_circles | |
set1 = set(['A', 'B', 'C', 'D']) | |
set2 = set(['B', 'C', 'D', 'E']) | |
set3 = set(['C', 'D', 'E', 'F', 'G']) | |
venn3([set1, set2, set3], ('Set1', 'Set2', 'Set3')) | |
// | |
#Venn with counts: | |
from collections import Counter | |
import matplotlib.pyplot as plt | |
from matplotlib_venn import venn2, venn3 | |
%matplotlib inline | |
sets = Counter() | |
sets['01'] = 10 | |
sets['11'] = 3 | |
sets['10'] = 5 | |
setLabels = ['set1', 'set2'] | |
plt.figure() | |
ax = plt.gca() | |
v = venn2(subsets = sets, set_labels = setLabels, ax = ax) | |
plt.title('Venn Diagram') | |
plt.show() | |
// | |
#creating hist: | |
n, bins, patches = plt.hist(data, 50, density=False, facecolor='green', | |
alpha=0.75, range=[0, 990]) | |
// | |
#adding y-label to a pyplot | |
plt.ylabel('some numbers') | |
// | |
#adding title to pyplot | |
plt.title('Histogram of IQ') | |
// | |
#adding a grid | |
plt.grid(True) | |
// | |
#importing an excel spreadsheet in Python: | |
import xlrd | |
#---------------------------------------------------------------------- | |
def open_file(path):
    """
    Open and read an Excel file, printing a few things about it.

    Prints the number of sheets, the sheet names and, for the first
    worksheet, the first row, the top-left cell (object and value) and a
    slice of the first row (columns 0 and 1).

    Parameters
    ----------
    path : str
        Path to the Excel workbook.
    """
    book = xlrd.open_workbook(path)
    # print number of sheets
    print(book.nsheets)
    # print sheet names
    print(book.sheet_names())
    # get the first worksheet
    first_sheet = book.sheet_by_index(0)
    # read a row
    print(first_sheet.row_values(0))
    # read a cell
    cell = first_sheet.cell(0, 0)
    print(cell)
    print(cell.value)
    # read a row slice
    print(first_sheet.row_slice(rowx=0,
                                start_colx=0,
                                end_colx=2))
#---------------------------------------------------------------------- | |
if __name__ == "__main__": | |
path = "test.xls" | |
open_file(path) | |
#using panda to load an excel file into a dataframe | |
# import modules | |
import pandas as pd | |
# Import the excel file and call it xls_file | |
xls_file = pd.ExcelFile('../data/example.xls') | |
xls_file | |
<pandas.io.excel.ExcelFile at 0x111912be0> | |
# View the excel file's sheet names | |
xls_file.sheet_names | |
['Sheet1'] | |
# Load the xls file's Sheet1 as a dataframe | |
df = xls_file.parse('Sheet1') | |
df | |
/ | |
#Load the xls file's Sheet as a dataframe, skipping the first row: | |
df = xls_file.parse('Final QC Results',skiprows=1) | |
#Load the xls file's Sheet as a dataframe, declaring a column as the index | |
df = xls_file.parse('Sheet1', index_col='Sample') | |
#Now declaring more than 1 column as the index: | |
df = xls_file.parse('Sheet1', index_col=[0,1]) | |
* Now enforcing the type of some of the columns: | |
df = xls_file.parse('Sheet1', index_col=[0,1],converters={'A': str}) | |
// | |
>>> writer = pd.ExcelWriter('output.xlsx') | |
>>> df1.to_excel(writer,'Sheet1') | |
>>> df2.to_excel(writer,'Sheet2') | |
>>> writer.save() | |
// | |
# In python 2.7, the integer division truncates the result. For example: | |
>>> 3 / 2 | |
1 | |
# Use this instead: | |
>>> from __future__ import division | |
>>> 53740/3 | |
17913.333333333332 | |
// | |
#checking if a str is a float or int: | |
import sys | |
def numeric_type(x):
    """Tell whether the string ``x`` represents an int, a float or neither.

    Parameters
    ----------
    x : str
        Text to classify.

    Returns
    -------
    str
        "int" if ``float(x)`` has no fractional part (e.g. "3", "3.0", "1e3"),
        "float" for any other value accepted by ``float()``,
        "str" if ``x`` cannot be parsed as a number.
    """
    try:
        number = float(x)
    except ValueError:
        return "str"
    return "int" if number.is_integer() else "float"
s=sys.argv[1] | |
print numeric_type(s) | |
import sys | |
def numeric_type(x):
    """Variant that returns False (instead of "str") for non-numeric input.

    Parameters
    ----------
    x : str
        Text to classify.

    Returns
    -------
    str or bool
        "int" if ``float(x)`` has no fractional part, "float" for any other
        value accepted by ``float()``, False if ``x`` is not a number.
    """
    try:
        number = float(x)
    except ValueError:
        return False
    return "int" if number.is_integer() else "float"
s=sys.argv[1] | |
print numeric_type(s) | |
// | |
#python one-liner: | |
python -c "for r in range(10): print r" | |
// | |
#what is an __init__.py file: | |
Read documentation at: | |
https://pythontips.com/2013/07/28/what-is-__init__-py/ | |
// | |
*DUMMIFY (or Convert A Categorical Variable Into Dummy Variables): | |
# import modules | |
import pandas as pd | |
# Create a dataframe | |
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], | |
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], | |
'sex': ['male', 'female', 'male', 'female', 'female']} | |
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'sex']) | |
df | |
first_name last_name sex | |
0 Jason Miller male | |
1 Molly Jacobson female | |
2 Tina Ali male | |
3 Jake Milner female | |
4 Amy Cooze female | |
# Create a set of dummy variables from the sex variable | |
df_sex = pd.get_dummies(df['sex']) | |
# Join the dummy variables to the main dataframe | |
df_new = pd.concat([df, df_sex], axis=1) | |
df_new | |
first_name last_name sex female male | |
0 Jason Miller male 0 1 | |
1 Molly Jacobson female 1 0 | |
2 Tina Ali male 0 1 | |
3 Jake Milner female 1 0 | |
4 Amy Cooze female 1 0 | |
// | |
#serializing a dataframe (store) | |
df.to_pickle(file_name) # where to save it, usually as a .pkl | |
#Then you can load it back using: | |
df = pd.read_pickle(file_name) | |
// | |
#ipython. Embedding images: | |
from IPython.display import Image | |
Image("/Users/ernesto/Desktop/20170301124551634.png") | |
// | |
*Compute the skewness of a dataset | |
scipy.stats.skew(a, axis=0, bias=True) | |
// | |
#generate a random string in python | |
>>> import string | |
>>> import random | |
>>> def random_generator(size=6, chars=string.ascii_uppercase + string.digits): | |
... return ''.join(random.choice(chars) for x in range(size)) | |
... | |
>>> random_generator() | |
'G5G74W' | |
>>> random_generator(3, "6793YUIO") | |
'Y3U' | |
// | |
#Converting a string representation of a list into an actual list object | |
>>> fruits = "['apple', 'orange', 'banana']" | |
>>> import ast | |
>>> fruits = ast.literal_eval(fruits) | |
>>> fruits | |
['apple', 'orange', 'banana'] | |
>>> fruits[1] | |
'orange' | |
// | |
#How to use the __init__.py | |
Files named __init__.py are used to mark directories on disk as Python package directories. If you have the files | |
mydir/spam/__init__.py | |
mydir/spam/module.py | |
and mydir is on your path, you can import the code in module.py as | |
import spam.module | |
or | |
from spam import module | |
#good explanation on what is a @classmethod | |
https://stackoverflow.com/questions/12179271/meaning-of-classmethod-and-staticmethod-for-beginner | |
# | |
#compare 2 lists and return matches | |
>>> a = [1, 2, 3, 4, 5] | |
>>> b = [9, 8, 7, 6, 5] | |
>>> set(a) & set(b) | |
// | |
#check if 2 lists are equal (order does not matter): | |
>>> a = [1, 2, 3, 4, 5] | |
>>> b = [9, 8, 7, 6, 5] | |
>>> set(a)==set(b) | |
// | |
#creating a temporary file and write something to it and read it from it | |
#the file is destroyed when temp.close() is called | |
import os | |
import tempfile | |
temp = tempfile.TemporaryFile()
try:
    # TemporaryFile opens in binary mode ('w+b') by default, so write bytes
    temp.write(b'Some data')
    # rewind to the beginning before reading the data back
    temp.seek(0)
    print(temp.read())
finally:
    temp.close()
#creating a temporary file with a name (path) associated to it: | |
import os | |
import tempfile | |
# NamedTemporaryFile exposes the path of the file through its .name attribute.
# Leaving the with-block closes the file, which automatically cleans it up.
with tempfile.NamedTemporaryFile() as temp:
    print('temp:', temp)
    print('temp.name:', temp.name)
# The path no longer exists once the file has been closed
print('Exists after close:', os.path.exists(temp.name))
// | |
#parse file with entries (each entry on a newline) and create a list: | |
# The with-block guarantees the file handle is closed after reading.
# splitlines() drops the trailing newline of every entry.
with open('chros.txt', 'r') as crimefile:
    lines = crimefile.read().splitlines()
// | |
temp = tempfile.NamedTemporaryFile(dir='testdir/',delete=False,prefix='caca') | |
// | |
* Running pylint: | |
Run pylint and generate a report: | |
pylint --reports=y VcfFilter.py | |
// | |
#pytest | |
###################################### | |
/ | |
#creating a simple test with a fixture | |
@pytest.fixture | |
def some_data(): | |
return 42 | |
def test_some_data(some_data): | |
assert some_data == 42 | |
#some_data() can be used by different test functions | |
/ | |
#creating tmp files | |
import os | |
def test_create_file(tmpdir): | |
p = tmpdir.mkdir("sub").join("hello.txt") | |
p.write("content") | |
assert p.read() == "content" | |
assert len(tmpdir.listdir()) == 1 | |
assert 0 | |
/ | |
#in this fixture, the code before the yield runs before the test, | |
#the yield passes 42 to the test, and the code after the yield | |
#runs once the test has finished: | |
import pytest | |
import warnings | |
@pytest.fixture | |
def some_data(): | |
warnings.warn("before the test") | |
yield 42 | |
warnings.warn("executed after the test") | |
def test_some_data(some_data): | |
warnings.warn("executed in the test") | |
assert some_data == 42 | |
/ | |
pytest -s #will invoke pytest and write the output of the print commands to the terminal | |
// | |
#pytest: check that a piece of code raises an Exception: | |
def test_passes(): | |
with pytest.raises(Exception) as e_info: | |
x = 1 / 0 | |
######################################### | |
#dropping trailing 0s | |
a = "1.00000"
text = str(a)
# Strip every trailing zero after the decimal point, then the dangling '.'.
# Values without a decimal point are left alone so that "100" stays "100".
trimmed = text.rstrip('0').rstrip('.') if '.' in text else text
print(trimmed)  # 1
// | |
# asserting for equality of a returned list in pytest: | |
returned_list = ['a', 'b', 'c']
# Compare the lists directly: == checks length and element order, whereas
# pairing items with zip() stops at the shorter list and would silently
# accept missing or extra elements.
assert returned_list == ['a', 'b', 'c']
// | |
# how to check if a file is a directory or regular file in python? | |
os.path.isfile("bob.txt") # Does bob.txt exist? Is it a file, or a directory? | |
os.path.isdir("bob") | |
// | |
* Python: access class property from string | |
x = getattr(self, source) | |
// | |
# parse a string representing a list into a real list | |
>>> import ast | |
>>> mylist = ast.literal_eval("['foo', ['cat', ['ant', 'bee'], 'dog'], 'bar', 'baz']") | |
>>> mylist | |
['foo', ['cat', ['ant', 'bee'], 'dog'], 'bar', 'baz'] | |
// | |
#Create a List that contain each Line of a File | |
# The with-block closes the file once its lines have been read;
# readlines() keeps the trailing newline of each line.
with open("filename.txt") as handle:
    List = handle.readlines()
// | |
# creating a dir if it does not exist: | |
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the race between checking os.path.exists() and creating the dir.
# NOTE: unlike the exists() check, this raises FileExistsError when the
# path exists but is not a directory.
os.makedirs(directory, exist_ok=True)
// | |
# convert a list into a string: | |
list1 = ['1', '2', '3'] | |
str1 = ''.join(list1) | |
// | |
#good tutorial on Python decorators: | |
https://realpython.com/primer-on-python-decorators/ | |
// | |
# requests module | |
# Get a website from the internet: | |
# | |
#Create a program called anexample.py: | |
def simple_get(url):
    """
    Attempt to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to fetch.

    Returns
    -------
    The raw response content when `is_good_response` accepts the response,
    otherwise None. None is also returned (after logging through
    `log_error`) when the request raises a RequestException.
    """
    # Imported locally so this snippet works when pasted on its own;
    # without these names the function fails with NameError.
    from contextlib import closing
    from requests import get
    from requests.exceptions import RequestException

    try:
        # closing() releases the streamed connection when the block exits
        with closing(get(url, stream=True)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    Parameters
    ----------
    resp : response object
        Must expose `status_code` and a mapping-like `headers`.

    Returns
    -------
    bool
        True only when the status code is 200 and the Content-Type header
        contains 'html' (case-insensitive).
    """
    # .get() keeps a response without a Content-Type header from raising
    # KeyError; such a response is simply reported as not HTML.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type
def log_error(e):
    """
    Report an error.

    Parameters
    ----------
    e
        The error (or message) to report; it is printed to standard output.
        Replace the body to send errors somewhere else.
    """
    print(e)
#Then: | |
>>> from anexample import simple_get | |
>>> raw_html = simple_get('https://realpython.com/blog/') | |
>>> len(raw_html) | |
33878 | |
// | |
#adding timestamp to filename: | |
import time | |
# Local time formatted as YYYYMMDD-HHMMSS, e.g. to append to a filename
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)
#Packaging: | |
#To create a release, your source code needs to be packaged into a single archive file. This can be done with the sdist command: | |
python setup.py sdist | |
This command will create the *.tar.gz file inside ./dist/ | |
#Then we can install it by doing: | |
pip install ./dist/package.tar.gz | |
// | |
#getters and setters (the pythonic way): | |
# Excellent post at: https://www.programiz.com/python-programming/property | |
class Celsius:
    """Temperature in degrees Celsius with validated assignment.

    Parameters
    ----------
    temperature : number
        Initial temperature; must not be below -273.

    Raises
    ------
    ValueError
        If a temperature below -273 is passed to the constructor or
        assigned to `temperature`.
    """

    def __init__(self, temperature=0):
        # Assign through the property so the constructor applies the same
        # validation as any later assignment.
        self.temperature = temperature

    def to_fahrenheit(self):
        """Return the temperature converted to degrees Fahrenheit."""
        return (self.temperature * 1.8) + 32

    @property
    def temperature(self):
        """Getter: prints a trace message and returns the stored value."""
        print("Getting value")
        return self._temperature

    @temperature.setter
    def temperature(self, value):
        # Reject values below -273 before storing anything
        if value < -273:
            raise ValueError("Temperature below -273 is not possible")
        print("Setting value")
        self._temperature = value
// | |
#Sphinx . Generating documentation: | |
/ | |
#Generate automatically doc for your source dir | |
sphinx-apidoc -o /homes/ernesto/lib/igsr_analysis/docs/ /homes/ernesto/lib/igsr_analysis/ | |
# In this case, -o sets where to put the generated doc, and the second parameter sets the path to the source code | |
// | |
#sorting a list of tuples by the second element, then by the third | |
unsorted = [('a', 4, 2), ('a', 4, 3), ('a', 7, 2), ('a', 7, 3), ('b', 4, 2), ('b', 4, 3), ('b', 7, 2), ('b', 7, 3)] | |
print(sorted(unsorted, key=lambda element: (element[1], element[2]))) | |
// | |
# Getting information on the different numeric Python data types | |
import numpy as np | |
int_types = ["uint8", "int8", "int16"] | |
for it in int_types: | |
print(np.iinfo(it)) | |
float_types = ["float16", "float32", "float64"] | |
for ft in float_types: | |
print(np.finfo(ft)) | |
// | |
# Memory profiling | |
#Install required module: | |
pip install -U memory_profiler | |
#Create a test function and decorate it with @profile: | |
@profile | |
def my_func(): | |
a = [1] * (10 ** 6) | |
b = [2] * (2 * 10 ** 7) | |
del b | |
return a | |
if __name__ == '__main__': | |
my_func() | |
#Finally run the test script in the following way: | |
python -m memory_profiler test.py | |
// | |
# initializing several lists at the same time: | |
alist, blist, clist, dlist, elist = ([] for i in range(5)) | |
// | |
# Generating tables in python | |
from prettytable import PrettyTable | |
x = PrettyTable() | |
# we set the header | |
x.field_names = ["City name", "Area", "Population", "Annual Rainfall"] | |
# we add values | |
x.add_row(["Adelaide", 1295, 1158259, 600.5]) | |
x.add_row(["Brisbane", 5905, 1857594, 1146.4]) | |
print(x) | |
+-----------+------+------------+-----------------+ | |
| City name | Area | Population | Annual Rainfall | | |
+-----------+------+------------+-----------------+ | |
| Adelaide | 1295 | 1158259 | 600.5 | | |
| Brisbane | 5905 | 1857594 | 1146.4 | | |
| Darwin | 112 | 120900 | 1714.7 | | |
| Hobart | 1357 | 205556 | 619.5 | | |
| Sydney | 2058 | 4336374 | 1214.8 | | |
| Melbourne | 1566 | 3806092 | 646.9 | | |
| Perth | 5386 | 1554769 | 869.4 | | |
+-----------+------+------------+-----------------+ | |
// | |
#checking if an int is within 2 numbers: | |
if 10000 <= number <= 30000: | |
// | |
* check if something is list | |
x = {'a', 'b', 'c', 'd'}
# isinstance() is the idiomatic type check and also accepts subclasses of
# list. x is a set here, so the condition is False and nothing is printed.
if isinstance(x, list):
    print("h")
// | |
# How to check if an object has an attribute in Python: | |
hasattr(self, 'start') | |
True | |
// | |
#append elements to beginning of list : | |
>>> a = ['a','b'] | |
>>> k = ['nice', '-n', '10'] | |
>>> a[0:0] = k | |
>>> a | |
['nice', '-n', '10', 'a', 'b'] | |
// | |
#remove several elements from a list by value | |
item_list = ['item', 5, 'foo', 3.14, True] | |
item_list = [e for e in item_list if e not in ('item', 5)] | |
// | |
# generating ranges in python | |
#Create a sequence of numbers from 3 to 5, and print each item in the sequence: | |
x = range(3, 6) | |
for n in x: | |
print(n) | |
#Create a sequence of numbers from 3 to 19, but increment by 2 instead of 1: | |
x = range(3, 20, 2) | |
for n in x: | |
print(n) | |
* Article on how to manage configuration files in python: | |
https://hackernoon.com/4-ways-to-manage-the-configuration-in-python-4623049e841b | |
// | |
# remove several elements from a list | |
indices = [0, 2] | |
somelist = [i for j, i in enumerate(somelist) if j not in indices] | |
// | |
# Iterating over every two elements in a list | |
l = [1,2,3,4,5,6] | |
def pairwise(iterable):
    """Group consecutive items into non-overlapping pairs.

    s -> (s0, s1), (s2, s3), (s4, s5), ...

    A trailing item without a partner is dropped, because zip() stops as
    soon as one of its inputs is exhausted.
    """
    # The list holds the SAME iterator twice, so zip() pulls two successive
    # items from it for every tuple it builds.
    return zip(*[iter(iterable)] * 2)
for x, y in pairwise(l): | |
print("{0} + {1} = {2}".format(x, y, x + y)) | |
// | |
# Making a Python script executable: | |
1) Add this line as the first line in the script: #!/usr/bin/env python3. | |
2) At the unix command prompt, type the following to make myscript.py executable: $ chmod +x myscript.py. | |
// | |
* Previous and next values inside a loop | |
Example extracted from: | |
https://stackoverflow.com/questions/1011938/python-previous-and-next-values-inside-a-loop | |
from itertools import tee, islice, chain


def previous_and_next(some_iterable):
    """Yield (previous, item, next) triples for every item of an iterable.

    `previous` is None for the first item and `next` is None for the last.
    An empty iterable yields nothing.
    """
    # Three independent iterators over the same data
    prevs, items, nexts = tee(some_iterable, 3)
    # Shift one copy right by prepending None ...
    prevs = chain([None], prevs)
    # ... and another one left by skipping its first item and padding with None
    nexts = chain(islice(nexts, 1, None), [None])
    # itertools.izip only exists in Python 2; the builtin zip() is already
    # lazy in Python 3.
    return zip(prevs, items, nexts)


mylist = ['banana', 'orange', 'apple', 'kiwi', 'tomato']
for previous, item, nxt in previous_and_next(mylist):
    print("Item is now", item, "next is", nxt, "previous is", previous)
The results: | |
Item is now banana next is orange previous is None | |
Item is now orange next is apple previous is banana | |
Item is now apple next is kiwi previous is orange | |
Item is now kiwi next is tomato previous is apple | |
Item is now tomato next is None previous is kiwi | |
// | |
# Excellent article on decorators: | |
https://realpython.com/primer-on-python-decorators/#simple-decorators | |
// | |
#knowing PYTHONPATH from python script | |
import os | |
# Read PYTHONPATH once; None means the variable is not set at all
raw_pythonpath = os.environ.get('PYTHONPATH')
if raw_pythonpath is None:
    user_paths = []
else:
    # Entries are separated by os.pathsep
    user_paths = raw_pythonpath.split(os.pathsep)
    print(user_paths)
// | |
# Excellent article on how to submit a Python project to PyPI | |
https://dzone.com/articles/executable-package-pip-install | |
// | |
# How to use glob() to find files recursively? | |
from pathlib import Path | |
for filename in Path('src').rglob('*.c'): | |
print(filename) | |
// | |
# Convert Python2 code to Python3 | |
# Where test.py contains the code to convert; with -w the changes | |
# are written back to the same file instead of only being shown as a diff | |
2to3 -w test.py | |
// | |
# Excellent article on using the main entry point in Python | |
https://realpython.com/python-main-function/ | |
// | |
# Python shebang or hash bang: | |
#!/usr/bin/python: writing the absolute path | |
#!/usr/bin/env python: using the operating system env command, which locates | |
and executes Python by searching the PATH environment variable | |
// | |
main entry point with args parsing: | |
import argparse | |
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('file_in', help='input file') | |
parser.add_argument('file_out', help='output file') | |
args = parser.parse_args() | |
execute_code(args.file_in, args.file_out) | |
if __name__ == '__main__': | |
main() | |
// | |
# Splitting a string by one or more whitespaces | |
import re | |
# Named `text` rather than `str` so the builtin str type is not shadowed
text = '63 41 92 81 69 70'
# Split on runs of one or more whitespace characters (spaces, tabs,
# newlines). For plain strings, text.split() gives the same result and
# also ignores leading/trailing whitespace.
chunks = re.split(r'\s+', text)
print(chunks)
// | |
# Testing if a binary exists | |
def is_tool(name):
    """Tell whether an executable called `name` can be found on PATH."""
    # Alternative source of the same helper: from whichcraft import which
    import shutil

    located = shutil.which(name)
    return located is not None
// | |
% modulo operator | |
a = 9 | |
b = 3 | |
answer = a % b | |
print(answer) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python program to explain shutil.which() method | |
# check if python is in PATH | |
# importing os module | |
import os | |
# importing shutil module | |
import shutil | |
# cmd | |
cmd = 'python' | |
# Using shutil.which() method | |
locate = shutil.which(cmd) | |
# Print result | |
print(locate) # outputs: /usr/bin/python |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment