
Chris van den Berg (Bergvca)

@Bergvca
Bergvca / stratified_sampling.R
Created April 23, 2017 15:25
Example of how to do stratified sampling in caret. This is useful for imbalanced datasets and can be used to give more weight to a minority class.
len_pos <- nrow(example_dataset[example_dataset$target==1,])
len_neg <- nrow(example_dataset[example_dataset$target==0,])
train_model <- function(training_data, labels, model_type, ...) {
    experiment_control <- trainControl(method = "repeatedcv",
                                       number = 10,
                                       repeats = 2,
                                       classProbs = TRUE,
                                       summaryFunction = custom_summary_function)
    train(x = training_data,
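The gist itself uses R and caret, but the underlying idea — draw an equal-sized sample from each class so the minority class is not drowned out — can be sketched in plain Python. This is a minimal standard-library sketch; the helper name `stratified_sample` and the toy `target` column are hypothetical, not part of the gist:

```python
import random

def stratified_sample(rows, label_key, n_per_class, seed=0):
    """Draw up to n_per_class rows from each class (hypothetical helper)."""
    rng = random.Random(seed)
    by_class = {}
    for row in rows:
        by_class.setdefault(row[label_key], []).append(row)
    sample = []
    for label, members in sorted(by_class.items()):
        sample.extend(rng.sample(members, min(n_per_class, len(members))))
    return sample

# Imbalanced toy data: 8 negatives, 2 positives.
data = [{"target": 0}] * 8 + [{"target": 1}] * 2
balanced = stratified_sample(data, "target", n_per_class=2)
# balanced now holds 2 rows per class, 4 rows total.
```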
@Bergvca
Bergvca / unnittestexample.py
Last active December 22, 2022 20:54
Some unittest + Mock examples in Python. Includes examples of how to mock an entire class (ZipFile), mock an iterator object, and how to use side_effect properly.
import unittest
import os
from zipfile import ZipFile
from mock import MagicMock, patch, Mock, mock_open
# The functions that are tested:
def function_to_test_zipfile(example_arg):
with ZipFile(example_arg, 'r') as zip_in:
for input_file in zip_in.infolist():
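A condensed, self-contained version of the same mocking pattern, using the standard-library `unittest.mock` rather than the older external `mock` package. The function under test, `count_zip_entries`, is a hypothetical stand-in for the gist's `function_to_test_zipfile`:

```python
import unittest
from unittest.mock import patch

# Hypothetical function under test: counts the entries in a zip archive.
def count_zip_entries(path):
    from zipfile import ZipFile
    with ZipFile(path, "r") as zip_in:
        return len(zip_in.infolist())

class ZipFileTest(unittest.TestCase):
    @patch("zipfile.ZipFile")
    def test_count_entries(self, mock_zipfile):
        # The "with" statement binds zip_in to __enter__'s return value,
        # so that is where infolist must be stubbed.
        mock_instance = mock_zipfile.return_value.__enter__.return_value
        mock_instance.infolist.return_value = ["a.txt", "b.txt"]
        self.assertEqual(count_zip_entries("archive.zip"), 2)
        mock_zipfile.assert_called_once_with("archive.zip", "r")
```

Patching `"zipfile.ZipFile"` (where the class lives) rather than a local alias is what makes the lookup inside the function hit the mock.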
@Bergvca
Bergvca / Name matching in SQL Server.sql
Last active April 4, 2018 11:51
Name matching in SQL Server example
-- First create matches using a UDF; here I am using a combination of Jaro-Winkler and (a normalized version of) Levenshtein
--
-- Input: cleaned_table: a table with "cleaned" names
-- Output: tmp_groups: a table with (uid, group_id) tuples. Each group_id contains all uids belonging to names that match.
DROP TABLE #matches
SELECT a.clean_Name,
a.uid,
b.clean_Name clean_name_2,
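The normalized-Levenshtein half of that similarity score can be sketched in Python as a reference implementation. This is not the SQL UDF the gist calls, just the standard dynamic-programming edit distance scaled into a [0, 1] similarity:

```python
def levenshtein(a, b):
    """Classic dynamic-programming edit distance between two strings."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                  # deletion
                            curr[j - 1] + 1,              # insertion
                            prev[j - 1] + (ca != cb)))    # substitution
        prev = curr
    return prev[-1]

def normalized_levenshtein(a, b):
    """Similarity in [0, 1]; 1.0 means the strings are identical."""
    if not a and not b:
        return 1.0
    return 1.0 - levenshtein(a, b) / max(len(a), len(b))
```

Dividing by the longer string's length is one common normalization; matching pipelines like the gist's then keep pairs whose score clears a chosen threshold.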
@Bergvca
Bergvca / Pyspark_LDA_Example.py
Created February 3, 2016 13:59
Example of how to do LDA in Spark ML and MLlib with Python
import findspark
findspark.init("[spark install location]")
import pyspark
import string
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
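The preprocessing chain these imports set up (tokenize, remove stop words, count-vectorize) is what produces the document-term counts LDA consumes. The same stages can be mimicked in plain Python to show what each one feeds into the next; this is a toy sketch, not the Spark pipeline itself, and the stop-word list is a hypothetical stand-in:

```python
import re
from collections import Counter

STOP_WORDS = {"the", "a", "is", "of"}  # hypothetical stand-in list

def tokenize(text):
    """Lowercase and split on non-word characters (like RegexTokenizer)."""
    return [t for t in re.split(r"\W+", text.lower()) if t]

def remove_stop_words(tokens):
    """Drop stop words (like StopWordsRemover)."""
    return [t for t in tokens if t not in STOP_WORDS]

def count_vectorize(docs_tokens):
    """Build a vocabulary and per-document term-count vectors
    (like CountVectorizer)."""
    vocab = sorted({t for tokens in docs_tokens for t in tokens})
    index = {t: i for i, t in enumerate(vocab)}
    vectors = []
    for tokens in docs_tokens:
        vec = [0] * len(vocab)
        for term, n in Counter(tokens).items():
            vec[index[term]] = n
        vectors.append(vec)
    return vocab, vectors

docs = ["The cat sat", "A cat is a cat"]
tokens = [remove_stop_words(tokenize(d)) for d in docs]
vocab, vectors = count_vectorize(tokens)
```

In the Spark version each stage is a distributed transformer, but the shape of the data flowing between stages is the same: token lists in, sparse count vectors out.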