Skip to content

Instantly share code, notes, and snippets.

$ python data.py --nrows 1000
@ADGEfficiency
ADGEfficiency / pd_subset.py
Last active December 4, 2020 09:07
Mistakes Data Scientists Make
# Read at most the first 1000 rows of data.csv.
data = pd.read_csv(filepath_or_buffer='data.csv', nrows=1000)
@ADGEfficiency
ADGEfficiency / cli.py
Last active December 4, 2020 09:04
Mistakes Data Scientists Make
# data.py
# Load data.csv, optionally limited to the first N rows given on the command
# line, e.g. `python data.py --nrows 1000`.
import argparse

parser = argparse.ArgumentParser()
# type=int: argparse passes option values through as strings by default, but
# pd.read_csv needs nrows as an integer. Omitting --nrows (or giving it with
# no value) leaves args.nrows as None, which reads the whole file.
parser.add_argument('--nrows', nargs='?', type=int)
args = parser.parse_args()
data = pd.read_csv('data.csv', nrows=args.nrows)
print(f'loaded {data.shape[0]} rows')
# Row cap kept in a named variable and passed through to the CSV reader.
nrows = 1000
data = pd.read_csv(filepath_or_buffer='data.csv', nrows=nrows)
@ADGEfficiency
ADGEfficiency / home.py
Last active December 4, 2020 08:29
Mistakes Data Scientists Make
import os
# Save `data` under an 'adg' directory inside the user's home directory.
home = os.environ['HOME']  # raises KeyError when HOME is not set
path = os.path.join(home, 'adg')
# exist_ok=True makes this safe to re-run when the directory already exists.
os.makedirs(path, exist_ok=True)
# Write to a file inside the directory just created. Passing the directory
# path itself to np.save would append '.npy' to it and write the array next
# to the directory instead of into it.
np.save(os.path.join(path, 'data.npy'), data)
@ADGEfficiency
ADGEfficiency / target.py
Last active December 4, 2020 08:26
Mistakes Data Scientists Make
# bad: drop() returns a new object by default; the result is discarded here,
# so `data` still contains the 'target' column
data.drop('target', axis=1)
# good: rebind `data` to the object returned by drop()
data = data.drop('target', axis=1)
@ADGEfficiency
ADGEfficiency / curse.py
Last active December 4, 2020 08:20
Mistakes Data Scientists Make
import itertools
def calc_num_combinations(data):
    """Return the number of full-length permutations of *data*.

    Args:
        data: A sized iterable. Elements are treated as distinct by
            position, so repeated values are still counted separately.

    Returns:
        The count of orderings of all ``len(data)`` elements.
    """
    # Count lazily instead of building a list of every permutation, which
    # would hold all of them in memory at once just to take its length.
    return sum(1 for _ in itertools.permutations(data, len(data)))
def test_calc_num_combinations():
test_data = (
((0, ), 1), ((0, 1), 2), ((0, 1, 2), 6)
)
for data, length in test_data:
@ADGEfficiency
ADGEfficiency / classification.py
Last active December 4, 2020 08:20
Mistakes Data Scientists Make
import pandas as pd
# Three labels with very different frequencies (1000 / 500 / 50), drawn as a
# bar chart of per-label counts.
data = ['awake'] * 1000 + ['asleep'] * 500 + ['dreaming'] * 50
(
    pd.Series(data)
    .value_counts()
    .plot(kind='bar')
)
@ADGEfficiency
ADGEfficiency / standardizer.py
Last active December 19, 2019 05:17
Mistakes Data Scientists Make
# Subtract the mean of `data` and divide by its standard deviation.
# NOTE(review): if every value in `data` is identical the standard deviation
# is zero and this divides by zero — confirm inputs always vary.
standardized = (data - np.mean(data)) / np.std(data)
@ADGEfficiency
ADGEfficiency / int_index.py
Created December 19, 2019 05:14
Mistakes Data Scientists Make
# Rebind `data` to its [:1000] slice.
# NOTE(review): presumably keeps the first 1000 rows/items by position —
# confirm against the actual type of `data`.
data = data[:1000]