karpan karpanGit

## pandas_ungroup.py
# pandas ungroup operation

import pandas as pd
# create example dataframe; df1.loc[3,'b'] contains duplicate values that will be dropped
df1 = pd.DataFrame({'a':[1,2,3,4],'b':['a;b;c','g','j;w','h;j;h']})
print(df1)
'''
   a      b
0  1  a;b;c
1  2      g

## starOperators.py
# * and ** operators in python

# a plain list used to demonstrate the * (unpacking) operator
data1 = [1,2,3]
# passing the list itself prints it as a single object, brackets included
print(data1)
# *data1 unpacks the list, so print receives 1, 2 and 3 as separate positional arguments
print(*data1)
# expected output of the two print calls above
'''
[1, 2, 3]
1 2 3
'''
# join two lists

## regression_scatter plot in seaborn.py
# regression/scatter plot in seaborn

import numpy as np
import seaborn as sns
import pandas as pd
# 100 rows of standard-normal noise in two columns
df = pd.DataFrame(data=np.random.randn(100, 2), columns=['a', 'b'])
# make b depend on a: b becomes a plus half of the original b noise
df['b'] = df['a'] + 0.5 * df['b']
# scatter plot of b against a with a fitted regression line
sns.regplot(data=df, x='a', y='b')

## dictionary from groups.py
# pandas groupby: create a dictionary of groups using a comprehension
import pandas as pd
import numpy as np
# example frame: two string key columns plus two random numeric columns
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'three'],
    'data1': np.random.randn(5),
    'data2': np.random.random(5),
})
# no column selection is made, so by default the groupby covers all remaining columns
groups = df.groupby(by='key1')
# iterating the groupby yields (group label, sub-frame) pairs
res = {label: frame for label, frame in groups}

## truncate string columns.py
# groupby over all columns that are strings and truncate them; useful in case we plan to output to excel
import pandas as pd
import numpy as np
maxChars = 2  # maximum number of characters to retain
# sample frame: key1/key2 hold strings, data1/data2 hold random floats
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'three'],
    data1=np.random.randn(5),
    data2=np.random.random(5),
))
print(df)
# iterate over columns that are type object

## set pandas options.py
# pandas options
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html

import pandas as pd

# do not fold wide dataframes across several lines
pd.options.display.expand_frame_repr = False
# maximum number of columns
pd.options.display.max_columns = 20
# maximum number of rows

## aggregation timings.py
# time built in and custom written aggregation functions

import pandas as pd
import numpy as np
N = 1_000_000  # total number of rows
# first half of the rows belongs to group 'a', second half to group 'b'
df = pd.DataFrame({
    'a': np.random.randn(N),
    'key1': ['a'] * (N // 2) + ['b'] * (N // 2),
})
def aggrTest1():
    """Run the built-in groupby sum on the module-level df; the result is discarded."""
    per_key = df.groupby('key1')
    per_key.sum()
def aggrTest2():
    """Run the per-group sum via a custom callable passed to agg; the result is discarded."""
    per_key = df.groupby('key1')
    per_key.agg(lambda column: column.sum())

## linear regression groups.py
# linear regression per group

# create dataset
import pandas as pd
import numpy as np
# shared signal; both groups are noisy linear functions of it
ns = np.random.randn(100)
# group 'a': y = 2.5*ns - 1, with small Gaussian noise added to x and to y
df1 = pd.DataFrame({
    'x': ns + 0.05 * np.random.randn(100),
    'y': 2.5 * ns - 1 + 0.05 * np.random.randn(100),
})
# group 'b': y = 1.5*ns + 1, with small Gaussian noise added to x and to y
df2 = pd.DataFrame({
    'x': ns + 0.05 * np.random.randn(100),
    'y': 1.5 * ns + 1 + 0.05 * np.random.randn(100),
})
# stack both frames, labelling the rows of df1 with 'a' and those of df2 with 'b';
# the label is then moved out of the index into a regular column named 'key'
df = (
    pd.concat([df1, df2], axis=0, keys=['a', 'b'])
    .droplevel(1)
    .reset_index()
    .rename(columns={'index': 'key'})
)
# the two groups have approximately the following (slope, intercept): (2.5, -1), (1.5, 1)

## shallowDeepCopy.py
# copy lists in python
# nested list: a[2] and a[3] are themselves lists
a = [1, 2, [3], [[4]]]
print(f"{'a':<20}", id(a), id(a[2]), id(a[2][0]), id(a[3][0]))
# not a copy: plain assignment binds a second name to the same list object
b = a
print(f"{'b (not a copy)':<20}", id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
# a shallow copy I: a full slice builds a new outer list holding the same inner objects
b = a[:]
print(f"{'b (shallow copy I)':<20}", id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
# a shallow copy II

## timezones python.py
# working with timezones
from datetime import datetime
# pytz is required for timezone aware datetime and time objects
import pytz


# help function
def checkAwareness(dt):
    """Print a notice when *dt* carries no timezone information.

    dt: any object exposing a ``tzinfo`` attribute (presumably a
        ``datetime`` instance, given the import above -- TODO confirm).

    NOTE(review): only the naive branch is visible in this excerpt; what
    happens for timezone-aware values cannot be told from here.
    """
    # a tzinfo of None is what this helper reports as "naive"
    if dt.tzinfo is None:
        print(f'datetime is naive (tzinfo is {dt.tzinfo})')
	# pandas ungroup operation

	import pandas as pd
	# create example dataframe; df1.loc[3,'b'] contains duplicate values that will be dropped
	df1 = pd.DataFrame({'a':[1,2,3,4],'b':['a;b;c','g','j;w','h;j;h']})
	print(df1)
	'''
	a b
	0 1 a;b;c
	1 2 g
	# * and ** operators in python

	data1 = [1,2,3]
	print(data1)
	print(*data1)
	'''
	[1, 2, 3]
	1 2 3
	'''
	# join two lists
	# regression/scatter plot in seaborn

	import numpy as np
	import seaborn as sns
	import pandas as pd
	df = pd.DataFrame(np.random.randn(100,2),columns=['a','b'])
	df['b'] = df['a'] + df['b']/2.
	sns.regplot(x='a', y='b', data = df)
	# pandas groupby: create a dictionary of groups using a comprehension
	import pandas as pd
	import numpy as np
	df = pd.DataFrame({'key1': ['a','a','b','b','a'],
	'key2': ['one', 'two', 'one', 'two', 'three'],
	'data1': np.random.randn(5),
	'data2': np.random.random(5)})
	# by default the groupby selects all remaining columns
	groups = df.groupby('key1')
	res = {key1: group for key1, group in groups}
	# groupby over all columns that are strings and truncate them; useful in case we plan to output to excel
	import pandas as pd
	import numpy as np
	maxChars = 2 # maximum number of characters to retain
	df = pd.DataFrame({'key1': ['a','a','b','b','a'],
	'key2': ['one', 'two', 'one', 'two', 'three'],
	'data1': np.random.randn(5),
	'data2': np.random.random(5)})
	print(df)
	# iterate over columns that are type object
	# pandas options
	# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html

	import pandas as pd

	# do not fold dataframes
	pd.set_option('expand_frame_repr',False)
	# maximum number of columns
	pd.set_option("display.max_columns",20)
	# maximum number of rows
	# time built in and custom written aggregation functions

	import pandas as pd
	import numpy as np
	N = 1000000
	df = pd.DataFrame({'a': np.random.randn(N), 'key1':['a']*int(N/2)+['b']*int(N/2)})
	def aggrTest1():
	    res = df.groupby('key1').sum()
	def aggrTest2():
	    res = df.groupby('key1').agg(lambda x: x.sum())
	# linear regression per group

	# create dataset
	import pandas as pd
	import numpy as np
	ns = np.random.randn(100)
	df1 = pd.DataFrame({'x': ns + 0.05*np.random.randn(100), 'y':2.5*ns -1 + 0.05*np.random.randn(100)})
	df2 = pd.DataFrame({'x': ns + 0.05*np.random.randn(100), 'y':1.5*ns + 1 + 0.05*np.random.randn(100)})
	df = pd.concat([df1, df2], axis='index',keys=['a', 'b']).droplevel(level=1).reset_index().rename({'index':'key'}, axis='columns')
	# the two groups have approximately the following (slope, intercept): (2.5, -1), (1.5, 1)
	# copy lists in python
	a = [1, 2, [3],[[4]]]
	print('a'.ljust(20, ' '),id(a), id(a[2]), id(a[2][0]), id(a[3][0]))
	# not a copy
	b = a
	print('b (not a copy)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
	# a shallow copy I
	b = a[:]
	print('b (shallow copy I)'.ljust(20, ' '), id(b), id(b[2]), id(b[2][0]), id(b[3][0]))
	# a shallow copy II
	# working with timezones
	from datetime import datetime
	# pytz is required for timezone aware datetime and time objects
	import pytz


	# help function
	def checkAwareness(dt):
	    if dt.tzinfo is None:
	        print(f'datetime is naive (tzinfo is {dt.tzinfo})')