@rikturr
rikturr / libsvm_to_numpy.py
Created March 31, 2018 18:17
Convert a folder of libsvm txt files into a SciPy sparse matrix and save it in .npz format
from glob import glob
import argparse
import os
import scipy.sparse as sp
import numpy as np
from sklearn.datasets import load_svmlight_file
def parse_args():
    parser = argparse.ArgumentParser()
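The preview cuts off here. A minimal sketch of the rest, assuming argument names in_dir and out_file and that every file already agrees on feature width:

    parser.add_argument('in_dir', help='folder of libsvm .txt files')
    parser.add_argument('out_file', help='output .npz path')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    matrices = []
    labels = []
    for path in sorted(glob(os.path.join(args.in_dir, '*.txt'))):
        # each file yields a CSR matrix of features and an array of labels
        X, y = load_svmlight_file(path)
        matrices.append(X)
        labels.append(y)
    # stack the per-file matrices into one sparse matrix and save it
    sp.save_npz(args.out_file, sp.vstack(matrices, format='csr'))
    np.save(args.out_file + '_labels.npy', np.concatenate(labels))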
@rikturr
rikturr / csv_to_h5.py
Created October 17, 2018 15:23
Convert a large CSV file with a binary class label to an HDF5 file. Helpful when the CSV is too big to fit in memory; HDF5 allows indexing straight from the file
import pandas as pd
from datetime import datetime
CHUNK_SIZE = 1000000
POS_KEY = 'positive'
NEG_KEY = 'negative'
CLASS_COLUMN = 'class'
FILE = '<FILEPATH>'
OUTFILE = '<OUTPATH>'
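The preview stops at the constants. A minimal sketch of the chunked conversion, assuming the class column holds 1 for positive and 0 for negative and that dtypes stay consistent across chunks (writing HDF5 requires PyTables):

start = datetime.now()
with pd.HDFStore(OUTFILE, mode='w') as store:
    for i, chunk in enumerate(pd.read_csv(FILE, chunksize=CHUNK_SIZE)):
        pos = chunk[chunk[CLASS_COLUMN] == 1]
        neg = chunk[chunk[CLASS_COLUMN] == 0]
        # format='table' lets rows be appended and queried straight from the file
        store.append(POS_KEY, pos, format='table')
        store.append(NEG_KEY, neg, format='table')
        print(f'chunk {i} done after {datetime.now() - start}')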
@rikturr
rikturr / load_pandas.py
Created July 21, 2020 14:28
load pandas
import pandas as pd
import numpy as np
taxi = pd.read_csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
).sample(frac=0.1, replace=False)
@rikturr
rikturr / pandas_features.py
Created July 21, 2020 14:30
pandas create features
import datetime

taxi['pickup_weekday'] = taxi.tpep_pickup_datetime.dt.weekday
taxi['pickup_weekofyear'] = taxi.tpep_pickup_datetime.dt.weekofyear
taxi['pickup_hour'] = taxi.tpep_pickup_datetime.dt.hour
taxi['pickup_minute'] = taxi.tpep_pickup_datetime.dt.minute
# total seconds since the start of 2019 (.dt.seconds would wrap at one day)
taxi['pickup_year_seconds'] = (taxi.tpep_pickup_datetime - datetime.datetime(2019, 1, 1, 0, 0, 0)).dt.total_seconds()
taxi['pickup_week_hour'] = (taxi.pickup_weekday * 24) + taxi.pickup_hour
taxi['passenger_count'] = taxi.passenger_count.astype(float).fillna(-1)
taxi = taxi.fillna(value={'VendorID': 'missing', 'RatecodeID': 'missing', 'store_and_fwd_flag': 'missing'})

# keep track of column names for pipeline steps
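The preview ends at this comment. A sketch of the column lists the later gists reference (numeric_feat, categorical_feat, features, y_col); the exact membership and target are assumptions:

numeric_feat = [
    'pickup_weekday', 'pickup_weekofyear', 'pickup_hour', 'pickup_minute',
    'pickup_year_seconds', 'pickup_week_hour', 'passenger_count',
]
categorical_feat = ['VendorID', 'RatecodeID', 'store_and_fwd_flag']
features = numeric_feat + categorical_feat
y_col = 'total_amount'  # assumed regression target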
@rikturr
rikturr / scikit_grid_search.py
Created July 21, 2020 14:32
scikit grid search
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline(steps=[
    ('preprocess', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_feat),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_feat),
    ])),
    # the preview cuts off above; the estimator step and its name 'clf' are assumptions
    ('clf', ElasticNet()),
])
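The next gist calls grid_search.fit, so the search object presumably looks something like this; the parameter grid and CV settings are assumptions:

params = {
    'clf__alpha': [0.1, 0.5, 1.0],
    'clf__l1_ratio': [0.1, 0.5, 0.9],
}
grid_search = GridSearchCV(pipeline, params, cv=3, n_jobs=-1)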
@rikturr
rikturr / run_grid.py
Created July 21, 2020 14:33
run grid search!
grid_search.fit(taxi[features], taxi[y_col])
print(grid_search.best_score_)
@rikturr
rikturr / init_dask.py
Created July 21, 2020 14:34
init dask
from dask.distributed import Client
from dask_saturn import SaturnCluster
cluster = SaturnCluster(n_workers=20)
client = Client(cluster)
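The gists do not show how the cluster is used for training. One common pattern (a sketch, not necessarily what the author did) is to point scikit-learn's joblib at the Dask cluster so the earlier grid search fans out across the workers:

import joblib

# every parallel scikit-learn call inside this block runs on the Dask workers
with joblib.parallel_backend('dask'):
    grid_search.fit(taxi[features], taxi[y_col])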
@rikturr
rikturr / load_dask.py
Created July 21, 2020 14:34
load dask
import dask.dataframe as dd
taxi = dd.read_csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
).sample(frac=0.1, replace=False)
@rikturr
rikturr / init_spark.py
Created July 21, 2020 14:36
init spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
taxi = spark.read.csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd HH:mm:ss',
).sample(fraction=0.1, withReplacement=False)
@rikturr
rikturr / spark_features.py
Created July 21, 2020 14:37
spark features
import datetime

import pyspark.sql.functions as F
import pyspark.sql.types as T

# note: Spark's dayofweek runs 1 (Sunday) to 7, while pandas .dt.weekday runs 0 (Monday) to 6
taxi = taxi.withColumn('pickup_weekday', F.dayofweek(taxi.tpep_pickup_datetime).cast(T.DoubleType()))
taxi = taxi.withColumn('pickup_weekofyear', F.weekofyear(taxi.tpep_pickup_datetime).cast(T.DoubleType()))
taxi = taxi.withColumn('pickup_hour', F.hour(taxi.tpep_pickup_datetime).cast(T.DoubleType()))
taxi = taxi.withColumn('pickup_minute', F.minute(taxi.tpep_pickup_datetime).cast(T.DoubleType()))
taxi = taxi.withColumn('pickup_year_seconds',
                       (F.unix_timestamp(taxi.tpep_pickup_datetime) -
                        F.unix_timestamp(F.lit(datetime.datetime(2019, 1, 1, 0, 0, 0)))).cast(T.DoubleType()))
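The preview cuts off before the remaining features. A sketch of Spark equivalents for the rest of the pandas feature block, assuming the ID columns load as integers and need casting before the string fill:

taxi = taxi.withColumn('pickup_week_hour', (taxi.pickup_weekday * 24 + taxi.pickup_hour).cast(T.DoubleType()))
taxi = taxi.withColumn('passenger_count', F.coalesce(taxi.passenger_count.cast(T.DoubleType()), F.lit(-1.0)))
for c in ['VendorID', 'RatecodeID', 'store_and_fwd_flag']:
    taxi = taxi.withColumn(c, F.col(c).cast(T.StringType()))
taxi = taxi.fillna({'VendorID': 'missing', 'RatecodeID': 'missing', 'store_and_fwd_flag': 'missing'})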