[ER] scoring iteration model
from pyspark.sql import functions as f

# Load the human-labeled candidate pairs, keeping only rows that actually received a label
# (header=True is assumed so the named columns, e.g. human_label, are available)
human_labels = spark.read.csv("YOUR_STORAGE_PATH/candidate_pair_sample_LABELED.csv", header=True)\
    .filter(f.col('human_label').isNotNull())\
    .distinct()

# Build the feature set from the pairwise distance DataFrame: pre-filter obvious non-matches,
# apply rules-based labels for near-certain cases, then layer the human labels on top
feature_df = distance_df.filter(f.col('overall_sim') > 0.06)\
    .withColumn('rules_label',
                f.when((f.col('name_tfidf_sim') >= 0.999) | (f.col('overall_sim') >= 0.999), 1)
                .when(f.col('overall_sim') < 0.12, 0)
                .otherwise(None)
                )\
    .withColumn('src_id', f.col('edge.src'))\
    .withColumn('dst_id', f.col('edge.dst'))\
    .withColumn('diff_source', (f.col('src.source') != f.col('dst.source')).cast('integer'))\
    .join(human_labels, ['src_id', 'dst_id'], 'left')\
    .withColumn('label', f.coalesce(f.col('rules_label'), f.col('human_label')).cast('integer'))
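# Quick sanity check (not in the original gist): inspect the class balance of the combined
# rules-based + human labels before training
feature_df.groupBy('label').count().show()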
# Assemble the individual similarity features into a single array column
features = ['manufacturer_lev', 'description_lev', 'name_lev', 'price_sim',
            'name_tfidf_sim', 'description_tfidf_sim', 'manufacturer_tfidf_sim',
            'name_token_sim', 'manufacturer_token_sim', 'description_token_sim',
            'name_encoding_sim', 'description_encoding_sim', 'diff_source']
feature_df = feature_df.withColumn('features', f.array(*[f.col(c) for c in features]))

# Keep only the pairs that have a (rules-based or human) label for training
training_set = feature_df.select('src_id', 'dst_id', 'features', 'label')\
    .filter(f.col('label').isNotNull())
training_collected = training_set.collect()
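# Note (an assumption about scale, not stated in the gist): collect() pulls the labeled pairs
# to the driver, so this assumes the labeled sample fits in driver memory; a quick size check:
print(len(training_collected))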
# Fit a random forest classifier on the labeled pairs, tuning hyperparameters with a grid search
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 75, 100], 'max_depth': [11, 12, 13], 'max_features': ['log2', 'sqrt', None]}
rf = ensemble.RandomForestClassifier(random_state=42)
gs_rf = GridSearchCV(rf, param_grid, scoring='balanced_accuracy', n_jobs=-1, verbose=3, return_train_score=True)

X_train = [r['features'] for r in training_collected]
Y_train = [r['label'] for r in training_collected]
gs_rf.fit(X_train, Y_train)

print(gs_rf.best_params_)
print(gs_rf.best_score_)

# Rank features by importance under the best estimator
feat_importance = list(zip(features, gs_rf.best_estimator_.feature_importances_))
feat_importance.sort(key=lambda x: x[1], reverse=True)
print(feat_importance)
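# Optional (not in the original gist): inspect the full grid-search results to compare parameter
# settings beyond the single best score; return_train_score=True above makes mean_train_score
# available alongside mean_test_score
import pandas as pd
cv_results = pd.DataFrame(gs_rf.cv_results_)
print(cv_results[['params', 'mean_test_score', 'mean_train_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())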
# Evaluate the tuned model on the training data
# (note: in scikit-learn >= 1.2 the plot_* helpers were removed in favor of
# PrecisionRecallDisplay.from_estimator / RocCurveDisplay.from_estimator)
from sklearn import metrics
from sklearn.metrics import plot_precision_recall_curve, plot_roc_curve

y_pred = gs_rf.best_estimator_.predict(X_train)
pnr = plot_precision_recall_curve(gs_rf.best_estimator_, X_train, Y_train)
roc = plot_roc_curve(gs_rf.best_estimator_, X_train, Y_train)
print(metrics.classification_report(Y_train, y_pred))
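# Caveat (not in the original gist): the report above is computed on the same data the model
# was tuned on, so it will read optimistically. A minimal sketch of a held-out evaluation,
# assuming the X_train/Y_train lists built above:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X_train, Y_train, test_size=0.2,
                                          random_state=42, stratify=Y_train)
holdout_rf = ensemble.RandomForestClassifier(random_state=42, **gs_rf.best_params_)
holdout_rf.fit(X_tr, y_tr)
print(metrics.classification_report(y_te, holdout_rf.predict(X_te)))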