Skip to content

Instantly share code, notes, and snippets.

View violivei's full-sized avatar

Victor Oliveira Antonino violivei

View GitHub Profile
@violivei
violivei / evaluation.py
Created April 22, 2018 18:08
Evaluation for Multi-Label Classification
from pyspark.mllib.evaluation import MultilabelMetrics
# Evaluate multi-label predictions with Spark MLlib's MultilabelMetrics.
#
# MultilabelMetrics expects an RDD of (predictions, labels) pairs in which
# each array is a *set of label IDs* with unique elements -- not a binary
# indicator vector. Passing indicator vectors such as [0.0, 1.0, 1.0]
# directly makes Spark treat 0.0 and 1.0 themselves as the label IDs, which
# yields misleading metrics. The indicator vectors are therefore converted
# to lists of active label indices before the RDD is built.
#
# NOTE(review): `sc` is not defined in this snippet; presumably it is an
# existing SparkContext (e.g. the one provided by the pyspark shell).


def _active_labels(indicator):
    """Return the indices of the non-zero entries of *indicator* as floats.

    :param indicator: binary indicator vector, one entry per label
    :type indicator: list[float]
    :return: label IDs (as floats) whose indicator entry is non-zero
    :rtype: list[float]
    """
    return [float(idx) for idx, flag in enumerate(indicator) if flag != 0.0]


# One (predicted indicator vector, true indicator vector) pair per instance,
# in the same order as the tuples handed to MultilabelMetrics.
_indicator_pairs = [
    ([0.0, 1.0, 1.0], [1.0, 0.0, 0.0]),
    ([0.0, 0.0, 1.0], [0.0, 0.0, 1.0]),
    ([1.0, 0.0, 0.0], [1.0, 0.0, 0.0]),
    ([0.0, 1.0, 0.0], [0.0, 1.0, 0.0]),
]

scoreAndLabels = sc.parallelize([
    (_active_labels(predicted), _active_labels(actual))
    for predicted, actual in _indicator_pairs
])
metrics = MultilabelMetrics(scoreAndLabels)
@violivei
violivei / classifier.py
Created April 22, 2018 14:57
Prediction for Multi-Label Classification
def transform(self, df):
"""
Make predictions for each instance
:param df: dataframe with a `features` column
:type df: pyspark.sql.DataFrame
:return: prediction vectors for each instance
:rtype: pyspark.sql.DataFrame
"""
model_preds = []
for i, model in enumerate(self.fitted_mlc.stages[0].models):
@violivei
violivei / classifier.py
Last active April 22, 2018 17:11
Training for Multi-Label Classification
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import OneVsRest, OneVsRestModel
from pyspark.ml.pipeline import Pipeline, PipelineModel
from pyspark.sql.functions import (
udf,
lit,
monotonically_increasing_id,
collect_list,
desc
)
@violivei
violivei / preprocess.py
Last active April 22, 2018 17:08
Preprocessing for Multi-Label Classification
def preprocess(df):
"""
Prepare data for multi-label classifier
:param df: dataframe with features and labels for each instance
:type df: pyspark.sql.DataFrame
"""
cat_cols = df.schema.fieldNames()[:-1]
indexers = [
StringIndexer(