Skip to content

Instantly share code, notes, and snippets.

@captainsafia
Last active April 24, 2017 20:01
Show Gist options
  • Save captainsafia/49d475947ffbeb118e4c1dd756a2201a to your computer and use it in GitHub Desktop.
Save captainsafia/49d475947ffbeb118e4c1dd756a2201a to your computer and use it in GitHub Desktop.
In [1]: # What's the fastest way to convert a column of minutes to hours?
In [2]: import pandas as pd
In [3]: import numpy as np
In [4]: minutes = pd.Series(np.random.randint(0, 1440, size=(20000)))
In [5]: minutes.shape
Out[5]: (20000,)
In [6]: minutes.head(5)
Out[6]:
0 901
1 263
2 726
3 971
4 313
dtype: int64
In [7]: 901 / 60
Out[7]: 15
In [8]: 901 / 60.0
Out[8]: 15.016666666666667
In [9]: minutes.apply(lambda x: x / 60.0)
Out[9]:
0 15.016667
1 4.383333
2 12.100000
3 16.183333
4 5.216667
5 16.533333
6 0.433333
7 19.500000
8 23.750000
9 13.150000
10 22.216667
11 20.883333
12 11.683333
13 18.816667
14 8.916667
15 19.416667
16 9.500000
17 4.366667
18 4.750000
19 19.366667
20 18.783333
21 18.183333
22 23.783333
23 6.533333
24 8.633333
25 17.866667
26 12.833333
27 19.950000
28 8.533333
29 3.433333
...
19970 12.266667
19971 19.183333
19972 1.683333
19973 4.466667
19974 20.666667
19975 19.083333
19976 5.200000
19977 4.733333
19978 9.383333
19979 9.483333
19980 8.300000
19981 2.083333
19982 4.583333
19983 6.916667
19984 3.616667
19985 21.083333
19986 4.416667
19987 1.933333
19988 19.800000
19989 18.350000
19990 15.233333
19991 12.366667
19992 7.250000
19993 21.583333
19994 1.983333
19995 4.566667
19996 23.433333
19997 13.750000
19998 4.433333
19999 10.216667
dtype: float64
In [10]: %timeit minutes.apply(lambda x: x / 60.0)
100 loops, best of 3: 5.49 ms per loop
In [11]: minutes.map(lambda x: x / 60.0)
Out[11]:
0 15.016667
1 4.383333
2 12.100000
3 16.183333
4 5.216667
5 16.533333
6 0.433333
7 19.500000
8 23.750000
9 13.150000
10 22.216667
11 20.883333
12 11.683333
13 18.816667
14 8.916667
15 19.416667
16 9.500000
17 4.366667
18 4.750000
19 19.366667
20 18.783333
21 18.183333
22 23.783333
23 6.533333
24 8.633333
25 17.866667
26 12.833333
27 19.950000
28 8.533333
29 3.433333
...
19970 12.266667
19971 19.183333
19972 1.683333
19973 4.466667
19974 20.666667
19975 19.083333
19976 5.200000
19977 4.733333
19978 9.383333
19979 9.483333
19980 8.300000
19981 2.083333
19982 4.583333
19983 6.916667
19984 3.616667
19985 21.083333
19986 4.416667
19987 1.933333
19988 19.800000
19989 18.350000
19990 15.233333
19991 12.366667
19992 7.250000
19993 21.583333
19994 1.983333
19995 4.566667
19996 23.433333
19997 13.750000
19998 4.433333
19999 10.216667
dtype: float64
In [12]: %timeit minutes.map(lambda x: x / 60.0)
100 loops, best of 3: 5.59 ms per loop
In [13]: minutes / 60.0
Out[13]:
0 15.016667
1 4.383333
2 12.100000
3 16.183333
4 5.216667
5 16.533333
6 0.433333
7 19.500000
8 23.750000
9 13.150000
10 22.216667
11 20.883333
12 11.683333
13 18.816667
14 8.916667
15 19.416667
16 9.500000
17 4.366667
18 4.750000
19 19.366667
20 18.783333
21 18.183333
22 23.783333
23 6.533333
24 8.633333
25 17.866667
26 12.833333
27 19.950000
28 8.533333
29 3.433333
...
19970 12.266667
19971 19.183333
19972 1.683333
19973 4.466667
19974 20.666667
19975 19.083333
19976 5.200000
19977 4.733333
19978 9.383333
19979 9.483333
19980 8.300000
19981 2.083333
19982 4.583333
19983 6.916667
19984 3.616667
19985 21.083333
19986 4.416667
19987 1.933333
19988 19.800000
19989 18.350000
19990 15.233333
19991 12.366667
19992 7.250000
19993 21.583333
19994 1.983333
19995 4.566667
19996 23.433333
19997 13.750000
19998 4.433333
19999 10.216667
dtype: float64
In [14]: %timeit minutes / 60.0
10000 loops, best of 3: 109 µs per loop
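In [15]: # Series.map and Series.apply both call the Python function once per element (hence the near-identical timings above); DataFrame.apply works row-/column-wise. The vectorized `minutes / 60.0` avoids the per-element call entirely.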
In [16]: hours = minutes / 60.0
In [17]: hours.head(5)
Out[17]:
0 15.016667
1 4.383333
2 12.100000
3 16.183333
4 5.216667
dtype: float64
In [18]: minutes = pd.Series(np.random.randint(0, 1440, size=(1000000)))
In [19]: %timeit minutes / 60.0
100 loops, best of 3: 4.58 ms per loop
In [20]: %timeit minutes.apply(lambda x: x / 60.0)
1 loop, best of 3: 280 ms per loop
In [21]: %timeit minutes.map(lambda x: x / 60.0)
1 loop, best of 3: 279 ms per loop
In [22]: %timeit minutes = minutes / 60.0
---------------------------------------------------------------------------
UnboundLocalError Traceback (most recent call last)
<ipython-input-22-f16539e491d2> in <module>()
----> 1 get_ipython().magic(u'timeit minutes = minutes / 60.0')
/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2161 magic_name, _, magic_arg_s = arg_s.partition(' ')
2162 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163 return self.run_line_magic(magic_name, magic_arg_s)
2164
2165 #-------------------------------------------------------------------------
/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2082 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2083 with self.builtin_trap:
-> 2084 result = fn(*args,**kwargs)
2085 return result
2086
<decorator-gen-59> in timeit(self, line, cell)
/usr/local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
/usr/local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in timeit(self, line, cell)
1039 number = 1
1040 for _ in range(1, 10):
-> 1041 time_number = timer.timeit(number)
1042 worst_tuning = max(worst_tuning, time_number / number)
1043 if time_number >= 0.2:
/usr/local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in timeit(self, number)
135 gc.disable()
136 try:
--> 137 timing = self.inner(it, self.timer)
138 finally:
139 if gcold:
<magic-timeit> in inner(_it, _timer)
UnboundLocalError: local variable 'minutes' referenced before assignment
In [23]: %timeit hours = minutes / 60.0
100 loops, best of 3: 4.86 ms per loop
In [24]: %timeit minutes / 60.0
100 loops, best of 3: 4.65 ms per loop
In [25]: minutes
Out[25]:
0 570
1 592
2 570
3 688
4 36
5 243
6 252
7 1194
8 1240
9 1048
10 945
11 204
12 143
13 503
14 734
15 1356
16 462
17 996
18 304
19 354
20 1235
21 1106
22 780
23 14
24 178
25 424
26 1402
27 211
28 141
29 291
...
999970 1243
999971 462
999972 1081
999973 1272
999974 89
999975 551
999976 1278
999977 1188
999978 1403
999979 1158
999980 1429
999981 480
999982 1333
999983 586
999984 847
In [28]: minutes
Out[28]:
0 570
1 592
2 570
3 688
4 36
5 243
6 252
7 1194
8 1240
9 1048
10 945
11 204
12 143
13 503
14 734
15 1356
16 462
17 996
18 304
19 354
20 1235
21 1106
22 780
23 14
24 178
25 424
26 1402
27 211
28 141
29 291
...
999970 1243
999971 462
999972 1081
999973 1272
999974 89
999975 551
999976 1278
999977 1188
999978 1403
999979 1158
999980 1429
999981 480
999982 1333
999983 586
999984 847
999985 209
999986 147
999987 354
999988 232
999989 621
999990 659
999991 1047
999992 923
999993 42
999994 916
999995 834
999996 336
999997 1388
999998 1410
999999 1083
dtype: int64
In [29]: minutes = pd.Series(np.random.randint(0, 1440, size=(500000)))
In [30]: minutes <= 800
Out[30]:
0 True
1 False
2 True
3 False
4 True
5 True
6 True
7 True
8 False
9 False
10 True
11 False
12 False
13 False
14 True
15 True
16 False
17 True
18 False
19 True
20 False
21 False
22 False
23 False
24 True
25 True
26 False
27 True
28 False
29 False
...
499970 True
499971 True
499972 False
499973 True
499974 False
499975 True
499976 False
499977 False
499978 True
499979 True
499980 True
499981 False
499982 True
499983 False
499984 False
499985 False
499986 True
499987 True
499988 False
499989 True
499990 True
499991 False
499992 True
499993 False
499994 False
499995 False
499996 True
499997 False
499998 True
499999 True
dtype: bool
In [31]: minutes[minutes <= 800]
Out[31]:
0 777
2 462
4 699
5 291
6 420
7 418
10 169
14 216
15 438
17 176
19 3
24 55
25 762
27 343
31 485
32 258
34 779
37 334
38 766
39 666
42 267
43 131
45 646
46 92
48 144
49 563
50 254
53 237
54 650
61 269
...
499952 532
499954 757
499955 326
499956 396
499957 64
499960 747
499961 761
499962 587
499964 671
499965 662
499966 335
499967 113
499968 138
499969 605
499970 358
499971 157
499973 600
499975 208
499978 519
499979 705
499980 140
499982 733
499986 195
499987 35
499989 304
499990 738
499992 666
499996 567
499998 93
499999 38
dtype: int64
In [32]: minutes.drop(minutes <= 800)
Out[32]:
2 462
3 1297
4 699
5 291
6 420
7 418
8 1075
9 895
10 169
11 1338
12 855
13 1306
14 216
15 438
16 1152
17 176
18 1352
19 3
20 1014
21 1322
22 1171
23 912
24 55
25 762
26 1257
27 343
28 950
29 1151
30 1262
31 485
...
499970 358
499971 157
499972 855
499973 600
499974 833
499975 208
499976 1289
499977 987
499978 519
499979 705
499980 140
499981 1140
499982 733
499983 1009
499984 1019
499985 1388
499986 195
499987 35
499988 1100
499989 304
499990 738
499991 1253
499992 666
499993 1274
499994 970
499995 1183
499996 567
499997 1049
499998 93
499999 38
dtype: int64
In [36]: minutes = pd.DataFrame(np.random.randint(0, 1440, size=(500000, 3)), columns='ABC')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-36-97597bebafc5> in <module>()
----> 1 minutes = pd.DataFrame(np.random.randint(0, 1440, size=(500000, 3)), columns='ABC')
/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
253 else:
254 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 255 copy=copy)
256 elif isinstance(data, (list, types.GeneratorType)):
257 if isinstance(data, types.GeneratorType):
/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_ndarray(self, values, index, columns, dtype, copy)
421 raise_with_traceback(e)
422
--> 423 index, columns = _get_axes(*values.shape)
424 values = values.T
425
/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _get_axes(N, K, index, columns)
388 columns = _default_index(K)
389 else:
--> 390 columns = _ensure_index(columns)
391 return index, columns
392
/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in _ensure_index(index_like, copy)
3407 index_like = copy(index_like)
3408
-> 3409 return Index(index_like)
3410
3411
/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
266 **kwargs)
267 elif data is None or lib.isscalar(data):
--> 268 cls._scalar_data_error(data)
269 else:
270 if (tupleize_cols and isinstance(data, list) and data and
/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in _scalar_data_error(cls, data)
481 raise TypeError('{0}(...) must be called with a collection of some '
482 'kind, {1} was passed'.format(cls.__name__,
--> 483 repr(data)))
484
485 @classmethod
TypeError: Index(...) must be called with a collection of some kind, 'ABC' was passed
In [37]: minutes = pd.DataFrame(np.random.randint(0, 1440, size=(500000, 3)), columns=['A', 'B', 'C'])
In [38]: minutes
Out[38]:
A B C
0 563 1201 987
1 858 570 926
2 1235 277 60
3 597 1326 971
4 634 1286 1092
5 681 675 1085
6 489 433 1415
7 226 904 1351
8 593 547 414
9 1086 256 1070
10 137 827 188
11 1127 684 779
12 476 888 1041
13 398 205 865
14 531 1042 996
15 605 909 304
16 1178 1372 195
17 24 817 314
18 90 1078 1332
19 3 525 736
20 715 372 951
21 1111 842 231
22 397 809 1033
23 188 1353 790
24 308 220 244
25 586 804 664
26 31 1430 900
27 593 691 874
28 179 1320 633
29 1268 1380 663
... ... ... ...
499970 997 404 499
499971 106 93 606
499972 1130 1187 1155
499973 163 1287 416
499974 261 88 1111
499975 1335 1357 969
499976 711 1301 213
499977 1369 705 882
499978 536 697 1205
499979 709 1011 868
499980 120 494 1314
499981 604 1323 381
499982 1398 598 1292
499983 16 1244 1091
499984 249 1008 426
499985 1 181 386
499986 123 1277 1140
499987 531 870 1086
499988 372 906 353
499989 545 824 489
499990 1127 1422 436
499991 994 188 552
499992 835 1343 609
499993 905 1430 483
499994 124 1260 1159
499995 139 1 48
499996 829 277 542
499997 870 1208 1173
499998 1083 599 527
499999 1057 940 87
[500000 rows x 3 columns]
In [39]: start_date = '04-24-2016'
In [40]: end_data = '04-24-2017'
In [41]: date_index = pd.date_range(start_date, end_date)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-41-e81a21d8cc55> in <module>()
----> 1 date_index = pd.date_range(start_date, end_date)
NameError: name 'end_date' is not defined
In [42]: end_date = '04-24-2017'
In [43]: date_index = pd.date_range(start_date, end_date)
In [44]: date_index
Out[44]:
DatetimeIndex(['2016-04-24', '2016-04-25', '2016-04-26', '2016-04-27',
'2016-04-28', '2016-04-29', '2016-04-30', '2016-05-01',
'2016-05-02', '2016-05-03',
...
'2017-04-15', '2017-04-16', '2017-04-17', '2017-04-18',
'2017-04-19', '2017-04-20', '2017-04-21', '2017-04-22',
'2017-04-23', '2017-04-24'],
dtype='datetime64[ns]', length=366, freq='D')
In [45]: current_data = pd.Series({'04-28-2016': 6, '05-28-2016': 5, '12-27-2016': 7, '04-23-2017': 5})
In [46]: current_data
Out[46]:
04-23-2017 5
04-28-2016 6
05-28-2016 5
12-27-2016 7
dtype: int64
In [47]: current_data.index = pd.DateTimeIndex(current_data.index)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-47-0fd208f594aa> in <module>()
----> 1 current_data.index = pd.DateTimeIndex(current_data.index)
AttributeError: 'module' object has no attribute 'DateTimeIndex'
In [48]: current_data.index = pd.DatetimeIndex(current_data.index)
In [49]: current_Data
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-49-6c816f2643d3> in <module>()
----> 1 current_Data
NameError: name 'current_Data' is not defined
In [50]: current_data
Out[50]:
2017-04-23 5
2016-04-28 6
2016-05-28 5
2016-12-27 7
dtype: int64
In [51]: current_data = current_data.reindex(date_index, fill_value=0)
In [52]: current_data
Out[52]:
2016-04-24 0
2016-04-25 0
2016-04-26 0
2016-04-27 0
2016-04-28 6
2016-04-29 0
2016-04-30 0
2016-05-01 0
2016-05-02 0
2016-05-03 0
2016-05-04 0
2016-05-05 0
2016-05-06 0
2016-05-07 0
2016-05-08 0
2016-05-09 0
2016-05-10 0
2016-05-11 0
2016-05-12 0
2016-05-13 0
2016-05-14 0
2016-05-15 0
2016-05-16 0
2016-05-17 0
2016-05-18 0
2016-05-19 0
2016-05-20 0
2016-05-21 0
2016-05-22 0
2016-05-23 0
..
2017-03-26 0
2017-03-27 0
2017-03-28 0
2017-03-29 0
2017-03-30 0
2017-03-31 0
2017-04-01 0
2017-04-02 0
2017-04-03 0
2017-04-04 0
2017-04-05 0
2017-04-06 0
In [54]: # column = value; parity = 1/0, where 1 means the data is OK; output: remove the parity column and set the value to NaN where necessary
In [55]: cleaning = pd.DataFame.from_dict({'values
File "<ipython-input-55-2eb6a38dc553>", line 1
cleaning = pd.DataFame.from_dict({'values
^
SyntaxError: EOL while scanning string literal
In [56]: cleaning = pd.DataFame.from_dict({'values': [30, 20, 40, 50, 60], 'parity': [1, 0, 1, 1, 0]})
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-56-60e27d20fc3d> in <module>()
----> 1 cleaning = pd.DataFame.from_dict({'values': [30, 20, 40, 50, 60], 'parity': [1, 0, 1, 1, 0]})
AttributeError: 'module' object has no attribute 'DataFame'
In [57]: cleaning = pd.DataFrame.from_dict({'values': [30, 20, 40, 50, 60], 'parity': [1, 0, 1, 1, 0]})
In [58]: cleaning
Out[58]:
parity values
0 1 30
1 0 20
2 1 40
3 1 50
4 0 60
In [59]: cleaning['values'][cleaning.parity == 1]
Out[59]:
0 30
2 40
3 50
Name: values, dtype: int64
In [60]: cleaning['values'][cleaning.parity == 1] = None
In [61]: cleaning
Out[61]:
parity values
0 1 NaN
1 0 20.0
2 1 NaN
3 1 NaN
4 0 60.0
In [62]: cleaning = cleaning.drop('partiy', 1)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-62-392d092b74f7> in <module>()
----> 1 cleaning = cleaning.drop('partiy', 1)
/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in drop(self, labels, axis, level, inplace, errors)
1875 new_axis = axis.drop(labels, level=level, errors=errors)
1876 else:
-> 1877 new_axis = axis.drop(labels, errors=errors)
1878 dropped = self.reindex(**{axis_name: new_axis})
1879 try:
/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in drop(self, labels, errors)
3049 if errors != 'ignore':
3050 raise ValueError('labels %s not contained in axis' %
-> 3051 labels[mask])
3052 indexer = indexer[~mask]
3053 return self.delete(indexer)
ValueError: labels ['partiy'] not contained in axis
In [63]: cleaning = cleaning.drop('parity', 1)
In [64]: cleaning
Out[64]:
values
0 NaN
1 20.0
2 NaN
3 NaN
4 60.0
In [65]: cleaning = pd.DataFrame.from_dict({'values': [30, 20, 40, 50, 60], 'parity': [1, 0, 1, 1, 0]})
In [66]: cleaning['values'][cleaning.parity == 1]
Out[66]:
0 30
2 40
3 50
Name: values, dtype: int64
In [67]: cleaning.loc[cleaning.parity == 1, 'values']
Out[67]:
0 30
2 40
3 50
Name: values, dtype: int64
In [68]: %timeit cleaning['values'][cleaning.parity == 1]
1000 loops, best of 3: 582 µs per loop
In [69]: %timeit cleaning.loc[cleaning.parity == 1, 'values']
1000 loops, best of 3: 314 µs per loop
In [70]: type(cleaning.loc[cleaning.parity == 1, 'values'])
Out[70]: pandas.core.series.Series
In [71]: type(cleaning.loc[cleaning.parity == 1])
Out[71]: pandas.core.frame.DataFrame
In [72]: class AwesomeData:
....: def __init__(self, data):
....: self.df = pd.DataFrame(data)
....:
In [73]: a_data = AwesomeData([1, 2, 3, 4])
In [74]: a_data.df.my_special_df_function()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-74-83fbe86d639f> in <module>()
----> 1 a_data.df.my_special_df_function()
/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
2670 if name in self._info_axis:
2671 return self[name]
-> 2672 return object.__getattribute__(self, name)
2673
2674 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'my_special_df_function'
In [75]: a_data.my_special_df_function()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-75-40ac7c0b9cef> in <module>()
----> 1 a_data.my_special_df_function()
AttributeError: AwesomeData instance has no attribute 'my_special_df_function'
In [76]: class CoolData():
....: def __init__(self,data):
....: self.df = pd.DataFrame(data)
....: def get_size(self):
....: return self.df.shape
....:
In [77]: a_data = CoolData([1, 2, 3, 4])
In [78]: a_data.get_size()
Out[78]: (4, 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment