Skip to content

Instantly share code, notes, and snippets.

View TheLoneNut's full-sized avatar

Pascal Potvin TheLoneNut

View GitHub Profile
def my_list_processing(l):
    '''
    Map function for the multiprocessing demo: sum one sublist of elements.

    NOTE(review): deliberately raises ZeroDivisionError whenever 666 appears
    in the sublist -- this gist demonstrates how errors surface from worker
    processes, so the crash is intentional, not a bug to fix.
    '''
    if 666 in l:
        return 666 / 0  # intentional crash to exercise error propagation
    return sum(l)
# One thousand integers for the pool to chunk and process.
elements = list(range(1000))

# Accumulates one entry per processed chunk, filled by the reduce callback.
results = []


def reducing(e):
    '''Reduce callback: stash one chunk result as it comes back.'''
    results.append(e)
def my_list_processing(l):
    '''Map function: return the sum of one sublist of elements.'''
    return sum(l)


# Work items: the integers 0..999, to be split into chunks by the pool.
elements = list(range(1000))

# Per-chunk sums land here via the reduce callback below.
results = []


def reducing(e):
    '''Reduce callback: collect one mapped chunk result.'''
    results.append(e)
simpleMultiprocessing(elements, my_list_processing, reducing, verbose=True)
def my_list_processing(l):
    '''Map function: sum a single sublist of the work items.'''
    return sum(l)


elements = list(range(1000))  # 0..999, to be chunked across processes
results = []                  # filled by the reduce callback
# list.append used directly as the reduce callback -- no wrapper function
# needed.  simpleMultiprocessing is defined in a later gist on this page.
simpleMultiprocessing(elements, my_list_processing, results.append, verbose=True)
print(results)
# Combine the per-chunk sums into the grand total.
result = sum(results)
# NOTE(review): this class is truncated in this view -- the __init__ body and
# the rest of the class are not shown, so only the visible signature and
# docstring are annotated here.
class simpleMultiprocessing:
'''
This class makes multiprocessing easy.
:param elements: A list of elements that can be split in smaller chunks and processed in parallel.
:param f_map: A function which takes a list of elements (normally a sublist of "elements") and process it.
:param f_reduce: [Optional] A callback function called each time f_map return from processing sublist of elements. The function takes the return value of f_map as input.
:param nProcesses: [Optional] Number of processes to spawn, default is twice the number of available processors.
:param verbose: [Optional] When set to True, displays the steps of multiprocessing.
'''
# NOTE(review): the nProcesses default is evaluated ONCE, at class-definition
# time, and float(os.getenv('CPU_LIMIT')) raises TypeError when the
# CPU_LIMIT environment variable is unset -- consider
# os.getenv('CPU_LIMIT', '1') or computing the default inside __init__.
def __init__(self, elements, f_map, f_reduce=None, nProcesses=max(1, int(2.*float(os.getenv('CPU_LIMIT')))), verbose=True):
@TheLoneNut
TheLoneNut / bug.py
Created March 24, 2021 23:10
Just an example software bug
def append_y_words(y_words, base_list=None):
    '''
    Return a list of the words from base_list (if any) followed by the
    words in y_words that start with 'y'.

    :param y_words: iterable of candidate words.
    :param base_list: optional list to extend in place; when omitted a
        fresh list is created for each call.
    :return: base_list extended with the selected 'y' words.
    '''
    # Bug fix: the original signature used base_list=[], a mutable default
    # that is created once and shared across calls, so previous results
    # leaked into later calls.  A None sentinel gives a fresh list per call.
    if base_list is None:
        base_list = []
    y_words = [word for word in y_words if word.startswith('y')]
    base_list += y_words  # in-place extend kept for callers passing a list
    return base_list
print append_y_words(["yoyo", "player"]) # should print ['yoyo']
from sklearn.cluster import MiniBatchKMeans
# Bug fix: the metric was renamed calinski_harabaz_score ->
# calinski_harabasz_score in scikit-learn 0.20 and the old (misspelled)
# name has since been removed, making this import fail on modern sklearn.
from sklearn.metrics import calinski_harabasz_score

# Sweep candidate cluster counts and score each clustering; X (the feature
# matrix) is assumed to be defined earlier in the notebook -- TODO confirm.
num_clusters = range(10, 600, 10)
scores = []
for num_cluster in num_clusters:
    # init_size must exceed n_clusters for MiniBatchKMeans.
    km = MiniBatchKMeans(n_clusters=num_cluster, init_size=max(300, 3*num_cluster)).fit(X)
    labels = km.labels_
    # NOTE(review): the visible snippet never filled `scores`; the append
    # below restores the evident intent of the sweep (higher is better).
    scores.append(calinski_harabasz_score(X, labels))
# Project the feature matrix X to 2-D with supervised LDA, using the
# k-means cluster labels as the class targets (e.g. to visualize clusters).
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf1 = LinearDiscriminantAnalysis(n_components=2)
# X1 has shape (n_samples, 2) -- one point per row of X.
X1 = clf1.fit_transform(X, labels)
from sklearn.cluster import MiniBatchKMeans
# Final clustering at the chosen number of clusters.
num_clusters = 80
# init_size must exceed n_clusters for MiniBatchKMeans.
kn = MiniBatchKMeans(n_clusters=num_clusters, init_size=max(300, 3*num_clusters)).fit(X)
labels = kn.labels_
# NOTE(review): discounting a -1 label is a DBSCAN "noise" convention;
# k-means never emits -1, so this reduces to len(set(labels)) here --
# likely copied from a DBSCAN example.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Build the positive PMI matrix from the dataframe (pmi() is defined in a
# later gist on this page), L2-normalize it in place, then index the rows
# in a BallTree using the Euclidean metric (p=2).
X = pmi(df)
X = normalize(X, copy=False)
tree = BallTree(X, p=2)
# Fetch the 10 nearest rows (and their distances) to one example row.
# NOTE(review): <specific_example> is a placeholder, not valid Python --
# substitute the integer index of the row you want neighbours for.
knn_d, knn_ix = tree.query([X[<specific_example>]], k=10, return_distance=True)
@TheLoneNut
TheLoneNut / pmi.py
Created February 6, 2019 17:48
Calculation of the positive pointwise mutual information matrix from a pandas dataframe.
# NOTE(review): this function is truncated in this view -- only its opening
# lines are shown and annotated here.
def pmi(df):
'''
Calculate the positive pointwise mutual information score for each entry
https://en.wikipedia.org/wiki/Pointwise_mutual_information
We use the log( p(y|x)/p(y) ), y being the column, x being the row
'''
# NOTE(review): DataFrame.as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0 -- modern code should use df.to_numpy() (or df.values).
# Get numpy array from pandas df
arr = df.as_matrix()
# p(y|x) probability of each t1 overlap within the row