Skip to content

Instantly share code, notes, and snippets.

def mapper1(_, line):
    """Emit ``((word_length, first_letter), 1)`` for every word in *line*.

    Args:
        _: Ignored input key.
        line: Either a text line (split on whitespace into words) or an
            already-tokenized iterable of words.

    Yields:
        ``((len(word), word[0]), 1)`` for each non-empty word.
    """
    # Iterating a str directly walks it character by character, which would
    # make every key (1, <char>); split it into words first.
    words = line.split() if isinstance(line, str) else line
    for word in words:
        if not word:
            # An empty token has no first letter; word[0] would raise.
            continue
        yield ((len(word), word[0]), 1)
def reducer1(key, values):
    """Sum all values observed for *key*.

    Args:
        key: The grouping key; passed through unchanged.
        values: Iterable of numbers to add up.

    Yields:
        A single ``(key, total)`` pair.
    """
    yield key, sum(values)
def mapper2(key, value):
    """Re-key a ``((length, letter), value)`` record by length alone.

    Args:
        key: A two-item ``(length, letter)`` pair.
        value: The value associated with that pair.

    Yields:
        A single ``(length, (value, letter))`` pair.
    """
    length, letter = key
    yield (length, (value, letter))
import sys
import threading
import time
from collections import Counter
from os import listdir
from queue import Queue

# Module-level queue (presumably shared with worker threads -- confirm
# against the rest of the script, which is not visible here).
q = Queue()
# The directory to process is the first command-line argument.
directory = sys.argv[1]
# Names of the entries found in that directory.
files = listdir(directory)
from multiprocessing import Pool
from time import sleep
import numpy as np
def sum_list(numbers):
    """Return the sum of all items in *numbers*.

    Args:
        numbers: Iterable of numbers to add up.

    Returns:
        The total, or ``0`` for an empty iterable.
    """
    return sum(numbers)
# The integers 0..99,999,999 arranged as 2 rows of 50,000,000 values each.
# NOTE(review): presumably each row is handed to a separate Pool worker --
# confirm against the rest of the script, which is not visible here.
master_list = np.arange(100000000).reshape(2,50000000)
# Small sample list of integers.
l1 = [3, 5, 7]

SUMMARY: Hierarchical cluster analysis

  1. Hierarchical cluster analysis of n objects is defined by a stepwise algorithm that, at each step, merges the two objects (or clusters of objects) with the least dissimilarity.

  2. Dissimilarities between clusters of objects can be defined in several ways; for example, the maximum dissimilarity (complete linkage), minimum dissimilarity (single linkage) or average dissimilarity (average linkage).

  3. Either rows or columns of a matrix can be clustered – in each case we choose the

import pandas as pd
import unicodedata
import string
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Corpus of raw documents, seeded with a single sample sentence.
docs = ['code PYTHON code.']
def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.

    E.g. for use with categorical_crossentropy.

    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.

    # Returns
        A binary matrix representation of the input.

    NOTE(review): only the signature and docstring are visible in this
    excerpt; the function body is not shown here.
    """
@michaelguia
michaelguia / poly.py
Created March 6, 2017 22:26
Polynomial features labeled in a dataframe
def PolynomialFeatures_labeled(input_df,power):
'''Basically this is a cover for the sklearn preprocessing function.
The problem with that function is if you give it a labeled dataframe, it ouputs an unlabeled dataframe with potentially
a whole bunch of unlabeled columns.
Inputs:
input_df = Your labeled pandas dataframe (list of x's not raised to any power)
power = what order polynomial you want variables up to. (use the same power as you want entered into pp.PolynomialFeatures(power) directly)
Ouput: