Skip to content

Instantly share code, notes, and snippets.

@karpanGit
karpanGit / pandas_ungroup.py
Last active November 3, 2019 17:23
pandas: ungrouping concatenated cells in a column of a pandas dataframe
# pandas ungroup operation
import pandas as pd
# Example dataframe: each cell of column 'b' holds one or more ';'-separated values.
# df1.loc[3, 'b'] ('h;j;h') contains duplicate values that will be dropped.
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 4],
        'b': ['a;b;c', 'g', 'j;w', 'h;j;h'],
    }
)
print(df1)
'''
a b
0 1 a;b;c
1 2 g
@karpanGit
karpanGit / starOperators.py
Last active January 20, 2020 20:00
* and ** operators in python (star, double star operators)
# * and ** operators in python
# A leading * in a call unpacks an iterable into separate positional arguments.
data1 = [1,2,3]
# passed as one argument: the list itself is printed, brackets included
print(data1)
# unpacked: equivalent to print(1, 2, 3), so the items are printed space-separated
print(*data1)
# expected output of the two print calls above
'''
[1, 2, 3]
1 2 3
'''
# join two lists
@karpanGit
karpanGit / regression_scatter plot in seaborn.py
Last active November 2, 2019 08:12
regression/scatter plot in seaborn
# regression/scatter plot in seaborn
import numpy as np
import seaborn as sns
import pandas as pd
# 100 rows of two independent standard-normal columns, 'a' and 'b'.
df = pd.DataFrame(data=np.random.randn(100, 2), columns=['a', 'b'])
# Overwrite 'b' so it depends on 'a': b = a + half of the original 'b' noise.
df['b'] = df['a'] + 0.5 * df['b']
# Plot 'b' against 'a' with seaborn's regplot.
sns.regplot(data=df, x='a', y='b')
@karpanGit
karpanGit / dictionary from groups.py
Last active November 22, 2023 00:58
pandas: create a dictionary containing groups following a groupby operation
# pandas groupby, create a dictionary with groups using a comprehension
import pandas as pd
import numpy as np
# Small example frame: two key columns and two columns of random numbers.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'three'],
        'data1': np.random.randn(5),
        'data2': np.random.random(5),
    }
)
# by default the groupby selects all remaining columns
groups = df.groupby('key1')
# Iterating a GroupBy yields (key, sub-frame) pairs; dict() consumes them directly.
# iter() is required: GroupBy has a `keys` attribute, so dict(groups) would
# wrongly treat it as a mapping.
res = dict(iter(groups))
@karpanGit
karpanGit / truncate string columns.py
Last active November 3, 2019 17:22
pandas: truncate string columns of pandas dataframe
# groupby over all columns that are strings and truncate them; useful in case we plan to output to excel
import pandas as pd
import numpy as np
# maximum number of characters to retain
maxChars = 2
# Example frame: 'key1' and 'key2' hold strings, 'data1' and 'data2' hold random floats.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'three'],
        'data1': np.random.randn(5),
        'data2': np.random.random(5),
    }
)
print(df)
# iterate over columns that are type object
@karpanGit
karpanGit / set pandas options.py
Created November 3, 2019 19:13
pandas: useful options
# pandas options
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html
import pandas as pd
# do not fold dataframes
# The full option name is used here: pandas resolves abbreviated keys by pattern
# match and warns that they can break if a similarly named option is added later.
pd.set_option('display.expand_frame_repr', False)
# maximum number of columns
pd.set_option('display.max_columns', 20)
# maximum number of rows
@karpanGit
karpanGit / aggregation timings.py
Created November 3, 2019 19:23
pandas: timing different aggregation implementations
# time built-in and custom-written aggregation functions
import pandas as pd
import numpy as np
# Number of rows in the benchmark frame.
N = 1_000_000
# 'a' holds N standard-normal samples; 'key1' labels the first half of the rows
# 'a' and the second half 'b'.
df = pd.DataFrame(
    {
        'a': np.random.randn(N),
        'key1': ['a'] * (N // 2) + ['b'] * (N // 2),
    }
)
def aggrTest1():
    """Sum the module-level ``df`` per ``key1`` group with the built-in ``sum``.

    Returns:
        The aggregated frame, one row per ``key1`` value. It is returned
        rather than bound to an unused local, so callers can inspect it;
        timing code can ignore it.
    """
    return df.groupby('key1').sum()
def aggrTest2():
    """Sum the module-level ``df`` per ``key1`` group via ``agg`` with a lambda.

    This is the custom-function counterpart of the built-in ``sum``
    aggregation: the lambda calls ``x.sum()`` on what ``agg`` passes it.

    Returns:
        The aggregated frame, one row per ``key1`` value. It is returned
        rather than bound to an unused local, so callers can inspect it;
        timing code can ignore it.
    """
    return df.groupby('key1').agg(lambda x: x.sum())
@karpanGit
karpanGit / linear regression groups.py
Created November 9, 2019 17:47
pandas: apply linear regression to groups
# linear regression per group
# create dataset
import pandas as pd
import numpy as np
# Shared base signal: both groups derive their x and y values from it.
ns = np.random.randn(100)
# First group: y = 2.5*ns - 1, with small independent noise added to x and y.
df1 = pd.DataFrame(
    {
        'x': ns + 0.05 * np.random.randn(100),
        'y': 2.5 * ns - 1 + 0.05 * np.random.randn(100),
    }
)
# Second group: y = 1.5*ns + 1, with small independent noise added to x and y.
df2 = pd.DataFrame(
    {
        'x': ns + 0.05 * np.random.randn(100),
        'y': 1.5 * ns + 1 + 0.05 * np.random.randn(100),
    }
)
# Stack both frames, labelling their rows 'a' and 'b' through the concat keys,
# drop the original row index, and expose the label as a regular 'key' column.
df = (
    pd.concat([df1, df2], axis='index', keys=['a', 'b'])
    .droplevel(level=1)
    .reset_index()
    .rename(columns={'index': 'key'})
)
# the two groups have approximately the following (slope, intercept): (2.5, -1), (1.5, 1)
@karpanGit
karpanGit / shallowDeepCopy.py
Last active October 25, 2020 08:21
shallow vs deep copies in python
# copy lists in python
# Each print shows a left-justified label followed by the identities of:
# the outer list, the nested list at index 2, that nested list's first element,
# and the inner list stored inside the nested list at index 3.
a = [1, 2, [3],[[4]]]
print('a'.ljust(20, ' '),id(a), id(a[2]), id(a[2][0]), id(a[3][0]))
# not a copy
# plain assignment binds b to the very same list object, so every id equals a's
b = a
print('b (not a copy)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
# a shallow copy I
# slicing builds a new outer list (id(b) differs from id(a)) whose items are the
# same objects as a's, so the ids of the nested objects still match
b = a[:]
print('b (shallow copy I)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
# a shallow copy II
@karpanGit
karpanGit / timezones python.py
Created November 17, 2019 14:24
timezones in python
# working with timezones
from datetime import datetime
# pytz is required for timezone aware datetime and time objects
import pytz
# help function
def checkAwareness(dt):
if dt.tzinfo is None:
print(f'datetime is naive (tzinfo is {dt.tzinfo})')