cs95 Coldsp33d

## create_df.py
import pandas as pd
import perfplot

def append(n):
  """Build an ``n``-row DataFrame by growing it one row at a time.

  This is the deliberately slow baseline of the benchmark: every iteration
  produces a brand-new frame containing all previous rows plus one more.

  ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0, so
  the row is added with ``pd.concat``, its documented replacement.

  Parameters
  ----------
  n : int
      Number of rows to add.

  Returns
  -------
  pandas.DataFrame
      Frame with columns ``A``, ``B``, ``C`` and ``n`` identical rows.
  """
  df = pd.DataFrame(columns=['A', 'B', 'C'])
  for _ in range(n):
    # Built per iteration on purpose: the per-row cost is what is being measured.
    row = pd.DataFrame([{'A': 1, 'B': 12.3, 'C': 'xyz'}])
    df = pd.concat([df, row], ignore_index=True) # yuck
  return df

def list_append(n):

## list_interleave_benchmark.py
from itertools import chain
import perfplot

def cs1(l):
    """Interleave each item of ``l`` with a ``"<item>_<position>"`` label.

    Positions are 1-based. Returns a new list twice as long as ``l``.
    """
    interleaved = []
    for position, item in enumerate(l, start=1):
        interleaved.append(item)
        interleaved.append(f'{item}_{position}')
    return interleaved

## ohe_truncate_cols.py
def load_data(datafile, encoder=None):
    data = pd.read_csv(datafile, header=0, low_memory=False)

    data_y = data[['job_performance']]
    data_x = data.drop(['job_performance'], axis=1)

    data_x.replace([np.inf, -np.inf], np.nan, inplace=True)
    data_x.fillna(data_x.mean(), inplace=True)

    if not encoder:

## conditional_col_creation_benchmark.py
import pandas as pd
import perfplot

def numpy_where(df):
  """Return a copy of ``df`` with an ``is_rich`` column computed by ``np.where``.

  ``is_rich`` is ``'yes'`` where ``salary >= 50`` and ``'no'`` otherwise.
  """
  rich_mask = df['salary'] >= 50
  labels = np.where(rich_mask, 'yes', 'no')
  return df.assign(is_rich=labels)

def list_comp(df):
  """Return a copy of ``df`` with an ``is_rich`` column built by a list comprehension.

  ``is_rich`` is ``'yes'`` where ``salary >= 50`` and ``'no'`` otherwise.
  """
  labels = ['yes' if pay >= 50 else 'no' for pay in df['salary']]
  return df.assign(is_rich=labels)

def loc(df):

## comment_flagger.py
import requests
import re
import pandas as pd
import html
import time
import sys

# https://stackapps.com/q/8364/

# Client id, kept as a string. Presumably the id of the Stack Apps
# registration linked above — TODO confirm against the API calls that use it.
client_id = '15705'

## inplace_funcs.txt
# Reference: https://github.com/pandas-dev/pandas/issues/16529

(Series/DataFrame).drop
(Series/DataFrame).drop_duplicates
(Series/DataFrame).dropna
DataFrame.set_index (with drop=False wouldn't change the data, but that doesn't seem the main use case)
DataFrame.query
DataFrame.eval


## read_clipboard beginner's guide [DRAFT]
## Beginner's Guide to `pd.read_clipboard`

[`read_clipboard`](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#id45) is truly a saving grace for anyone starting out to answer questions in the [tag:pandas] tag. Unfortunately, pandas veterans also know that the data provided in questions isn't always easy to copy into a terminal due to various complications such as MultiIndexes, spaces in header names, datetimes, and python objects.

Thankfully, `read_clipboard` has arguments that make handling most of these cases possible (and easy). The purpose of this answer is to document some of those cases in finer detail.

---

### Spaces in column headers

## reverse_dataframe_benchmarks.py
import pandas as pd
import numpy as np
import perfplot

def slice(df):
    """Return ``df`` with its rows in reverse order, using a plain ``[::-1]`` slice."""
    reversed_rows = df[::-1]
    return reversed_rows

def slice_loc(df):
    """Return ``df`` with its rows in reverse order, slicing through ``.loc``."""
    by_label = df.loc
    return by_label[::-1]


## shifted_calculations_benchmark.py
import pandas as pd
import numpy as np
import perfplot
import numba

def with_df_loc(df):
    """Fill column ``'b'`` of ``df`` row by row using scalar ``.at`` access.

    The value at label 0 is set to 5; every later row ``i`` gets the mean of
    the previous row's ``'a'`` and ``'b'`` values. ``df`` is written to
    directly.
    """
    # Seed value for the recurrence.
    df.at[0, 'b'] = 5

    # Each row depends on the one before it, so the fill runs in order.
    # NOTE(review): ``.at`` is label-based, so this assumes the index labels
    # are 0..len(df)-1 — confirm against the benchmark setup.
    for i in range(1, len(df)):
        df.at[i, 'b'] = (df.at[i - 1, 'a'] + df.at[i - 1, 'b']) / 2

## dict_column_explosion_benchmark.py
import pandas as pd
import numpy as np
import perfplot
from string import ascii_lowercase as LOWER, ascii_uppercase as UPPER
import random

# Note: The copy() calls are needed here because `pop()` mutates the dataframe inplace
# so it is essential to make a copy() we don't want to mutate the output across runs
def apply_drop(df):
    """Expand the dict-valued ``'val'`` column into separate columns.

    Each entry of ``df['val']`` is converted to a ``pd.Series``; the resulting
    columns are joined onto ``df`` and the original ``'val'`` column is
    dropped. Returns a new frame.
    """
    expanded = df['val'].apply(pd.Series)
    widened = df.join(expanded)
    return widened.drop('val', axis=1)
	import pandas as pd
	import perfplot

	def append(n):
	    """Build an ``n``-row DataFrame by growing it one row at a time.

	    This is the deliberately slow baseline of the benchmark: every
	    iteration produces a brand-new frame containing all previous rows
	    plus one more.

	    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0,
	    so the row is added with ``pd.concat``, its documented replacement.

	    Parameters
	    ----------
	    n : int
	        Number of rows to add.

	    Returns
	    -------
	    pandas.DataFrame
	        Frame with columns ``A``, ``B``, ``C`` and ``n`` identical rows.
	    """
	    df = pd.DataFrame(columns=['A', 'B', 'C'])
	    for _ in range(n):
	        # Built per iteration on purpose: the per-row cost is what is measured.
	        row = pd.DataFrame([{'A': 1, 'B': 12.3, 'C': 'xyz'}])
	        df = pd.concat([df, row], ignore_index=True) # yuck
	    return df

	def list_append(n):
	from itertools import chain
	import perfplot

	def cs1(l):
	    """Interleave each item of ``l`` with a ``"<item>_<position>"`` label.

	    Positions are 1-based. Returns a new list twice as long as ``l``.
	    """
	    def _cs1(l):
	        # Yield the item itself, then its labelled companion.
	        for i, x in enumerate(l, 1):
	            yield x
	            yield f'{x}_{i}'

	    return list(_cs1(l))
	def load_data(datafile, encoder=None):
	data = pd.read_csv(datafile, header=0, low_memory=False)

	data_y = data[['job_performance']]
	data_x = data.drop(['job_performance'], axis=1)

	data_x.replace([np.inf, -np.inf], np.nan, inplace=True)
	data_x.fillna(data_x.mean(), inplace=True)

	if not encoder:
	import pandas as pd
	import perfplot

	def numpy_where(df):
	    """Return a copy of ``df`` with an ``is_rich`` column computed by ``np.where``.

	    ``is_rich`` is ``'yes'`` where ``salary >= 50`` and ``'no'`` otherwise.
	    """
	    return df.assign(is_rich=np.where(df['salary'] >= 50, 'yes', 'no'))

	def list_comp(df):
	    """Return a copy of ``df`` with an ``is_rich`` column built by a list comprehension.

	    ``is_rich`` is ``'yes'`` where ``salary >= 50`` and ``'no'`` otherwise.
	    """
	    return df.assign(is_rich=['yes' if x >= 50 else 'no' for x in df['salary']])

	def loc(df):
	import requests
	import re
	import pandas as pd
	import html
	import time
	import sys

	# https://stackapps.com/q/8364/

	# Client id, kept as a string. Presumably the id of the Stack Apps
	# registration linked above — TODO confirm against the API calls that use it.
	client_id = '15705'
	# Reference: https://github.com/pandas-dev/pandas/issues/16529

	(Series/DataFrame).drop
	(Series/DataFrame).drop_duplicates
	(Series/DataFrame).dropna
	DataFrame.set_index (with drop=False wouldn't change the data, but that doesn't seem the main use case)
	DataFrame.query
	DataFrame.eval
	## Beginner's Guide to `pd.read_clipboard`

	[`read_clipboard`](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#id45) is truly a saving grace for anyone starting out to answer questions in the [tag:pandas] tag. Unfortunately, pandas veterans also know that the data provided in questions isn't always easy to copy into a terminal due to various complications such as MultiIndexes, spaces in header names, datetimes, and python objects.

	Thankfully, `read_clipboard` has arguments that make handling most of these cases possible (and easy). The purpose of this answer is to document some of those cases in finer detail.

	---

	### Spaces in column headers
	import pandas as pd
	import numpy as np
	import perfplot

	def slice(df):
	    """Return ``df`` with its rows in reverse order, using a plain ``[::-1]`` slice."""
	    return df[::-1]

	def slice_loc(df):
	    """Return ``df`` with its rows in reverse order, slicing through ``.loc``."""
	    return df.loc[::-1]