Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Recommendation system application built on the medical drug review dataset from https://www.kaggle.com/jessicali9530/kuc-hackathon-winter-2018. This dataset was used for the Winter 2018 Kaggle University Club Hackathon and is now publicly available.
#!/usr/bin/env python
# coding: utf-8
# Step 1
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
# Label encoder used below to turn each condition name into an integer id.
encoder = LabelEncoder()
# Step 2
# Read the raw reviews, then replace every missing cell (in every column)
# with the literal string 'NA'.
# NOTE(review): a missing `rating` would also become the string 'NA' here;
# confirm the rating column has no gaps before relying on the pivot below.
df = pd.read_csv('drugsComTest_raw.csv')
df = df.fillna('NA')
# One integer id per distinct condition string.
df['condition_id'] = encoder.fit_transform(df['condition'].to_numpy())
# Keep only the columns the recommender works with.
df_medical = df.filter(
    items=['drugName', 'condition', 'rating', 'condition_id'],
    axis='columns',
)
# Drug-by-condition table of ratings (pivot_table's default aggregation);
# drug/condition pairs with no review are filled with 0.
df_medical_ratings_pivot = df_medical.pivot_table(
    values='rating',
    index='drugName',
    columns='condition_id',
)
df_medical_ratings_pivot = df_medical_ratings_pivot.fillna(0)
# Sparse copy of the same table, used to fit the nearest-neighbour model.
df_medical_ratings_pivot_matrix = csr_matrix(df_medical_ratings_pivot.to_numpy())
# Step 3
# Metrics listed by the author as options:
#   'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'
# Algorithms listed by the author as options:
#   'auto', 'ball_tree', 'kd_tree', 'brute', 'cuml'
# Brute-force search with cosine distance, fitted on the sparse
# drug-by-condition rating matrix (fit() returns the estimator itself).
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(
    df_medical_ratings_pivot_matrix
)
# Step 4
# Draw one row position at random; each row of the pivot table is one drug.
sample_index = np.random.choice(len(df_medical_ratings_pivot))
# Lift the chosen drug's rating row to a 2-D array of shape (1, n_columns),
# the query form passed to kneighbors() below.
sample_condition = df_medical_ratings_pivot.iloc[sample_index].to_numpy()[np.newaxis, :]
# Step 5
# Print the drugs whose rating profiles are closest to the sampled drug.
#
# The sampled drug is itself part of the fitted data, so one extra neighbour
# is requested and the sample is then removed by index. It is NOT assumed to
# come back first: drugs with proportional rating vectors tie at cosine
# distance 0 and may be returned ahead of the sample.
n_recommendations = 5
# kneighbors() rejects n_neighbors larger than the number of fitted rows,
# so cap the request for very small datasets.
n_neighbors = min(n_recommendations + 1, df_medical_ratings_pivot.shape[0])
distances, indices = model_knn.kneighbors(sample_condition, n_neighbors=n_neighbors)

# Pair each neighbour's row position with its distance, drop the sampled
# drug, and keep at most `n_recommendations` entries.
neighbours = [
    (neighbour_index, neighbour_distance)
    for neighbour_index, neighbour_distance in zip(indices.flatten(), distances.flatten())
    if neighbour_index != sample_index
][:n_recommendations]

print('Recommendations for {0}:\n'.format(df_medical_ratings_pivot.index[sample_index]))
for rank, (neighbour_index, distanceFromSample) in enumerate(neighbours, start=1):
    recommendation = df_medical_ratings_pivot.index[neighbour_index]
    print('{0}: {1}, with distance of {2}:'.format(rank, recommendation, distanceFromSample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment