This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas ungroup operation | |
import pandas as pd | |
# create example dataframe; df1.loc[3,'b'] contains duplicate values that will be dropped | |
df1 = pd.DataFrame({'a':[1,2,3,4],'b':['a;b;c','g','j;w','h;j;h']}) | |
print(df1) | |
''' | |
a b | |
0 1 a;b;c | |
1 2 g |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# * and ** operators in python | |
data1 = [1,2,3] | |
print(data1) | |
print(*data1) | |
''' | |
[1, 2, 3] | |
1 2 3 | |
''' | |
# join two lists |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# regression/scatter plot in seaborn | |
import numpy as np | |
import seaborn as sns | |
import pandas as pd | |
df = pd.DataFrame(np.random.randn(100,2),columns=['a','b']) | |
df['b'] = df['a'] + df['b']/2. | |
sns.regplot(x='a', y='b', data = df) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas groupby, create a dictionary with groups using a comprehension | |
import pandas as pd | |
import numpy as np | |
df = pd.DataFrame({'key1': ['a','a','b','b','a'], | |
'key2': ['one', 'two', 'one', 'two', 'three'], | |
'data1': np.random.randn(5), | |
'data2': np.random.random(5)}) | |
# by default the groupby selects all remaining columns | |
groups = df.groupby('key1') | |
res = {key1: group for key1, group in groups} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# groupby over all columns that are strings and truncate them; useful in case we plan to output to excel | |
import pandas as pd | |
import numpy as np | |
maxChars = 2 # maximum number of characters to retain | |
df = pd.DataFrame({'key1': ['a','a','b','b','a'], | |
'key2': ['one', 'two', 'one', 'two', 'three'], | |
'data1': np.random.randn(5), | |
'data2': np.random.random(5)}) | |
print(df) | |
# iterate over columns that are type object |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas options | |
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html | |
import pandas as pd | |
# do not fold dataframes | |
pd.set_option('expand_frame_repr',False) | |
# maximum number of columns | |
pd.set_option("display.max_columns",20) | |
# maximum number of rows |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# time built-in and custom-written aggregation functions | |
import pandas as pd | |
import numpy as np | |
N = 1000000 | |
df = pd.DataFrame({'a': np.random.randn(N), 'key1':['a']*int(N/2)+['b']*int(N/2)}) | |
def aggrTest1():
    """Aggregate the module-level ``df`` per 'key1' group with the built-in ``sum``.

    The aggregated frame is discarded and the function returns ``None``;
    per the snippet's header comment it exists to be timed against the
    custom-function variant.
    """
    # Built-in aggregation: pandas dispatches to its own groupby sum.
    df.groupby('key1').sum()
def aggrTest2():
    """Aggregate the module-level ``df`` per 'key1' group with a custom callable.

    The same sum as the built-in variant, but routed through ``agg`` with
    a Python lambda. The aggregated frame is discarded and the function
    returns ``None``; per the snippet's header comment it exists to be timed.
    """
    # Custom aggregation: the lambda is invoked by ``agg`` for the grouped data.
    df.groupby('key1').agg(lambda x: x.sum())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# linear regression per group | |
# create dataset | |
import pandas as pd | |
import numpy as np | |
ns = np.random.randn(100) | |
df1 = pd.DataFrame({'x': ns + 0.05*np.random.randn(100), 'y':2.5*ns -1 + 0.05*np.random.randn(100)}) | |
df2 = pd.DataFrame({'x': ns + 0.05*np.random.randn(100), 'y':1.5*ns + 1 + 0.05*np.random.randn(100)}) | |
df = pd.concat([df1, df2], axis='index',keys=['a', 'b']).droplevel(level=1).reset_index().rename({'index':'key'}, axis='columns') | |
# the two groups have approximately the following (slope, intercept): (2.5, -1), (1.5, 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# copy lists in python | |
a = [1, 2, [3],[[4]]] | |
print('a'.ljust(20, ' '),id(a), id(a[2]), id(a[2][0]), id(a[3][0])) | |
# not a copy | |
b = a | |
print('b (not a copy)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0])) | |
# a shallow copy I | |
b = a[:] | |
print('b (shallow copy I)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0])) | |
# a shallow copy II |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# working with timezones | |
from datetime import datetime | |
# pytz is required for timezone aware datetime and time objects | |
import pytz | |
# helper function | |
def checkAwareness(dt): | |
if dt.tzinfo is None: | |
print(f'datetime is naive (tzinfo is {dt.tzinfo})') |
OlderNewer