Skip to content

Instantly share code, notes, and snippets.

View ramnov's full-sized avatar

Ramkumar Chandrasekaran ramnov

View GitHub Profile
@ramnov
ramnov / Data_Preparation_1.sql
Last active September 27, 2016 22:04
Load csv data into a sql table
CREATE PROCEDURE [dbo].[LoadData]
AS
BEGIN
DROP TABLE IF EXISTS [dbo].[LoanStats]
CREATE TABLE [dbo].[LoanStats](
[id] [int] NULL,
[member_id] [int] NULL,
[loan_amnt] [int] NULL,
[funded_amnt] [int] NULL,
[funded_amnt_inv] [int] NULL,
@ramnov
ramnov / Data_Preparation_2.sql
Created September 27, 2016 21:16
Clean the Loan Data
--DROP unnecessary column desc
ALTER TABLE [dbo].[LoanStats] DROP COLUMN [desc]
--Remove % from int_rate and convert its type to float
UPDATE [dbo].[LoanStats] SET [int_rate] = REPLACE([int_rate], '%', '')
ALTER TABLE [dbo].[LoanStats] ALTER COLUMN [int_rate] float
--Remove % from revol_util and convert its type to float
UPDATE [dbo].[LoanStats] SET [revol_util] = REPLACE([revol_util], '%', '')
ALTER TABLE [dbo].[LoanStats] ALTER COLUMN [revol_util] float
@ramnov
ramnov / Feature_Selection.sql
Created September 27, 2016 21:22
Stored Procedure to plot distribution
CREATE PROCEDURE [dbo].[PlotDistribution]
AS
BEGIN
SET NOCOUNT ON;
EXECUTE sp_execute_external_script
@language = N'R',
@script = N'
library("reshape2")
library("ggplot2")
@ramnov
ramnov / Data_Partition.sql
Last active September 29, 2016 06:06
Stored Procedure to split the data into train(75%) and test(25%)
CREATE PROCEDURE [dbo].[SplitLoans]
AS
BEGIN
SET NOCOUNT ON;
-- 75% Training data
DROP TABLE IF EXISTS [dbo].[LoanStatsTrain]
SELECT * INTO [dbo].[LoanStatsTrain] FROM (SELECT * FROM [dbo].[LoanStats] WHERE (ABS(CAST((BINARY_CHECKSUM(id, NEWID())) as int)) % 100) < 75)a
-- 25% Test data
DROP TABLE IF EXISTS [dbo].[LoanStatsTest]
SELECT * INTO [dbo].[LoanStatsTest] FROM (SELECT * FROM [dbo].[LoanStats] WHERE [id] NOT IN (SELECT [id] FROM [dbo].[LoanStatsTrain]))a
@ramnov
ramnov / Modelling.sql
Last active September 29, 2016 06:14
Build a Random Forest Model and store it in a sql table
CREATE PROCEDURE [dbo].[BuildModel]
AS
BEGIN
DECLARE @inquery nvarchar(max) = N'SELECT * FROM [dbo].[LoanStatsTrain]'
DROP TABLE IF EXISTS [dbo].[models]
CREATE TABLE [dbo].[models]([model] [varbinary](max) NOT NULL)
INSERT INTO [dbo].[models]
EXEC sp_execute_external_script
@ramnov
ramnov / Scoring.sql
Created September 27, 2016 21:39
Stored Procedure to score test data using RandomForest Model
CREATE PROCEDURE [dbo].[ScoreLoans]
AS
BEGIN
DECLARE @inquery nvarchar(max) = N'SELECT * FROM [dbo].[LoanStatsTest]'
DECLARE @model varbinary(max) = (SELECT TOP 1 model FROM models)
DROP TABLE IF EXISTS [dbo].[LoanStatsPredictions]
CREATE TABLE [dbo].[LoanStatsPredictions]([is_bad_Pred] [float] NULL, [id] [int] NULL)
INSERT INTO [dbo].[LoanStatsPredictions]
@ramnov
ramnov / Model_Evaluation.sql
Last active September 29, 2016 06:11
Stored Procedure to plot ROC Curve and calculate AROC
CREATE PROCEDURE [dbo].[PlotROCCurve]
AS
BEGIN
EXEC sp_execute_external_script
@language = N'R',
@script = N'
suppressMessages(library("ROCR"))
# create output directory
mainDir <- ''C:\\temp\\plots''
@ramnov
ramnov / SummaryInDataset.R
Last active November 17, 2016 20:00
Create rollup/aggregate variables in the same dataset
install.packages("devtools")
install.packages("dplyr")
library("devtools")
library("dplyr")
install_github("RevolutionAnalytics/dplyrXdf")
airline_demo_xdf_file <- file.path(rxGetOption("sampleDataDir"), "AirlineDemoSmall.xdf")
airline <- RxXdfData(airline_demo_xdf_file)
smry <- airline %>% group_by(DayOfWeek) %>%
summarise(DayOfWeekMeans=mean(CRSDepTime),
DayOfWeekStdDev=sd(CRSDepTime),
@ramnov
ramnov / AirlineDemoSmall.R
Created November 6, 2016 21:55
Print the first several rows in AirlineDemoSmall Dataset
airline_demo_xdf_file <- file.path(rxGetOption("sampleDataDir"), "AirlineDemoSmall.xdf")
airline_demo_xdf <- RxXdfData(airline_demo_xdf_file)
airline_demo_data <- rxImport(airline_demo_xdf)
head(airline_demo_data)
@ramnov
ramnov / MinMax.R
Last active November 18, 2016 19:22
Columnar operations like Min/Max of rows/columns in xdf file
airline_demo_xdf_file <- file.path(rxGetOption("sampleDataDir"), "AirlineDemoSmall.xdf")
# Find min/max value of a particular column CRSDepTime in airlineXdfData
single_column_minmax <- rxDataStep(airline_demo_xdf_file, transformFunc=function(varlst) {
rng <- range(varlst$CRSDepTime, na.rm = TRUE)
.min <<- min(rng[1], .min)
.max <<- max(rng[2], .max)
NULL
},
transformVars="CRSDepTime",