Skip to content

Instantly share code, notes, and snippets.

Avatar

cs95 Coldsp33d

  • Mountain View, CA
View GitHub Profile
View dict_column_explosion_benchmark.py
import pandas as pd
import numpy as np
import perfplot
from string import ascii_lowercase as LOWER, ascii_uppercase as UPPER
import random
# Note: the copy() calls are needed here because `pop()` mutates the dataframe in place,
# so each function must work on a copy(); otherwise the mutation would carry over across runs.
def apply_drop(df):
    """Expand the ``val`` column of *df* into one column per key.

    Every entry of ``df['val']`` is converted with ``pd.Series`` (the entries
    are presumably dicts -- confirm against the benchmark setup), the
    resulting frame is joined back onto *df* by index, and the original
    ``val`` column is dropped.

    Args:
        df: DataFrame that has a ``val`` column.

    Returns:
        A new DataFrame with the expanded columns in place of ``val``;
        *df* itself is left untouched.
    """
    expanded = df['val'].apply(pd.Series)
    return df.join(expanded).drop(columns='val')
View shifted_calculations_benchmark.py
import pandas as pd
import numpy as np
import perfplot
import numba
def with_df_loc(df):
    """Fill column ``b`` of *df* in place using a row-by-row recurrence.

    ``b[0]`` is set to 5, and every later row receives
    ``b[i] = (a[i - 1] + b[i - 1]) / 2``. Cells are read and written one at
    a time through ``DataFrame.at``.

    Args:
        df: DataFrame with columns ``a`` and ``b``. Rows are addressed by the
            labels ``0 .. len(df) - 1``, so a default integer index is
            assumed (TODO confirm against the caller).

    Returns:
        None; the result is written into *df*.
    """
    df.at[0, 'b'] = 5
    # Each value depends on the previous row's freshly written result, so the
    # rows have to be processed in order.
    for i in range(1, len(df)):
        df.at[i, 'b'] = (df.at[i - 1, 'a'] + df.at[i - 1, 'b']) / 2
View reverse_dataframe_benchmarks.py
import pandas as pd
import numpy as np
import perfplot
# NOTE: this name shadows the builtin ``slice`` at module level; it is kept
# unchanged so existing references to this function keep working.
def slice(df):
    """Return *df* with its rows in reverse order, using ``df[::-1]``."""
    return df[::-1]
def slice_loc(df):
    """Return *df* with its rows in reverse order, using ``df.loc[::-1]``."""
    return df.loc[::-1]
View read_clipboard beginner's guide [DRAFT]
## Beginner's Guide to `pd.read_clipboard`
[`read_clipboard`](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#id45) is truly a saving grace for anyone starting out to answer questions in the [tag:pandas] tag. Unfortunately, pandas veterans also know that the data provided in questions isn't always easy to read into a terminal session, due to various complications such as MultiIndexes, spaces in header names, datetimes, and Python objects.
Thankfully, `read_clipboard` has arguments that make handling most of these cases possible (and easy). The purpose of this answer is to document some of those cases in finer detail.
---
### Spaces in column headers
@Coldsp33d
Coldsp33d / inplace_funcs.txt
Last active Dec 9, 2019
List of functions that support in-place modification
View inplace_funcs.txt
# Reference: https://github.com/pandas-dev/pandas/issues/16529
(Series/DataFrame).drop
(Series/DataFrame).drop_duplicates
(Series/DataFrame).dropna
DataFrame.set_index (with drop=False it wouldn't change the data, but that doesn't seem to be the main use case)
DataFrame.query
DataFrame.eval
@Coldsp33d
Coldsp33d / comment_flagger.py
Created Jul 31, 2019
Partially automated comment flagging on Stack Overflow.
View comment_flagger.py
import requests
import re
import pandas as pd
import html
import time
import sys
# https://stackapps.com/q/8364/
client_id = '15705'
View conditional_col_creation_benchmark.py
import pandas as pd
import perfplot
def numpy_where(df):
    """Return a copy of *df* with an ``is_rich`` column built by ``np.where``.

    Rows whose ``salary`` is at least 50 are labelled ``'yes'``; all other
    rows are labelled ``'no'``.

    Args:
        df: DataFrame that has a ``salary`` column.

    Returns:
        A new DataFrame carrying the extra ``is_rich`` column.
    """
    labels = np.where(df['salary'] >= 50, 'yes', 'no')
    return df.assign(is_rich=labels)
def list_comp(df):
    """Return a copy of *df* with an ``is_rich`` column built by a list comprehension.

    Rows whose ``salary`` is at least 50 are labelled ``'yes'``; all other
    rows are labelled ``'no'``.

    Args:
        df: DataFrame that has a ``salary`` column.

    Returns:
        A new DataFrame carrying the extra ``is_rich`` column.
    """
    labels = ['yes' if x >= 50 else 'no' for x in df['salary']]
    return df.assign(is_rich=labels)
def loc(df):
View ohe_truncate_cols.py
def load_data(datafile, encoder=None):
data = pd.read_csv(datafile, header=0, low_memory=False)
data_y = data[['job_performance']]
data_x = data.drop(['job_performance'], axis=1)
data_x.replace([np.inf, -np.inf], np.nan, inplace=True)
data_x.fillna(data_x.mean(), inplace=True)
if not encoder:
@Coldsp33d
Coldsp33d / list_interleave_benchmark.py
Created Jun 27, 2019
Interleave two or more lists
View list_interleave_benchmark.py
from itertools import chain
import perfplot
def cs1(l):
    """Interleave every item of *l* with a ``'<item>_<position>'`` label.

    Positions are 1-based, e.g. ``['a', 'b'] -> ['a', 'a_1', 'b', 'b_2']``.

    Args:
        l: Iterable of items to interleave.

    Returns:
        A list holding each item followed by its label.
    """

    def _cs1(items):
        # Emit the item itself, then its label, for every element in turn.
        for i, x in enumerate(items, 1):
            yield x
            yield f'{x}_{i}'

    return list(_cs1(l))
@Coldsp33d
Coldsp33d / create_df.py
Last active Apr 11, 2021
Benchmarking different methods for creating empty dataframes from scratch
View create_df.py
import pandas as pd
import perfplot
def append(n):
    """Build an ``n``-row DataFrame by growing it one row at a time.

    This is the deliberately slow approach: the whole frame is copied on
    every iteration.

    Args:
        n: Number of rows to add.

    Returns:
        A DataFrame with columns ``A``, ``B`` and ``C`` and ``n`` identical
        rows (an empty frame with those columns when ``n`` is 0).
    """
    df = pd.DataFrame(columns=['A', 'B', 'C'])
    row = {'A': 1, 'B': 12.3, 'C': 'xyz'}
    for _ in range(n):
        # DataFrame.append was removed in pandas 2.0; pd.concat is its
        # documented replacement and still re-copies the frame per row.  # yuck
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    return df
def list_append(n):