Skip to content

Instantly share code, notes, and snippets.

View kushalchauhan98's full-sized avatar
🔥
Setting TPUs on fire!

Kushal Chauhan kushalchauhan98

🔥
Setting TPUs on fire!
View GitHub Profile
@kushalchauhan98
kushalchauhan98 / email_summarization.py
Created August 1, 2018 16:40
A module for E-mail Summarization which uses clustering of skip-thought sentence embeddings.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module for E-mail Summarization
*****************************************************************************
Input Parameters:
emails: A list of strings containing the emails
Returns:
summary: A list of strings containing the summaries.
*****************************************************************************
"""
from talon.signature.bruteforce import extract_signature

# NOTE(review): the gist describes a clean() helper that is "a modified
# version of extract_signature() found in bruteforce.py in the GitHub
# repository linked above"; clean() is not defined in this fragment, so the
# stock talon extract_signature() is used here. Swap in clean() if available.
# Strip the signature block from the raw e-mail; the second return value
# (the signature itself) is discarded.
cleaned_email, _ = extract_signature(email)
# Collapse the body onto a single line: split on newlines, drop empties,
# re-join with spaces so downstream sentence tokenization sees flat text.
lines = cleaned_email.split('\n')
lines = [line for line in lines if line != '']
cleaned_email = ' '.join(lines)
@kushalchauhan98
kushalchauhan98 / detect_lang.py
Created August 2, 2018 14:07
Detect Language
# Detect the language of the cleaned e-mail body so the sentence tokenizer
# downstream can be configured per language.
from langdetect import detect
# `cleaned_email` is produced by the preprocessing snippet above; langdetect
# returns an ISO 639-1 code string.
lang = detect(cleaned_email) # lang = 'en' for an English email
@kushalchauhan98
kushalchauhan98 / split_sentences.py
Created August 2, 2018 14:08
Sentence Tokenizer
from nltk.tokenize import sent_tokenize

# langdetect yields ISO 639-1 codes ('en'), but nltk's sent_tokenize expects
# full lowercase language names ('english') to locate its punkt model —
# passing 'en' directly raises a LookupError at runtime.
_ISO_TO_NLTK = {
    'en': 'english',
    'de': 'german',
    'fr': 'french',
    'es': 'spanish',
    'it': 'italian',
    'pt': 'portuguese',
    'nl': 'dutch',
}

# Split the e-mail into sentences with the detected language's tokenizer,
# falling back to English for languages without a mapped punkt model.
sentences = sent_tokenize(email, language=_ISO_TO_NLTK.get(lang, 'english'))
@kushalchauhan98
kushalchauhan98 / skipthought_encode.py
Created August 2, 2018 14:09
Skip-Thought Encoder
# The 'skipthoughts' module can be found at the root of the GitHub repository linked above
import skipthoughts
# You would need to download pre-trained models first
# load_model() reads the pre-trained skip-thought weights from disk.
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
# Encode every sentence into a fixed-length skip-thought embedding.
# NOTE(review): presumably `encoded` is a 2-D array of shape
# (len(sentences), embedding_dim) — confirm against the skipthoughts docs.
encoded = encoder.encode(sentences)
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Heuristic: summary length grows as sqrt of the number of sentences.
# KMeans requires an integer cluster count and range() requires an int —
# np.ceil returns a float, so cast explicitly.
n_clusters = int(np.ceil(len(encoded) ** 0.5))
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)

# For each cluster, record the mean original position of its member
# sentences so clusters can be emitted in (roughly) document order.
avg = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
# Representative of each cluster: the sentence embedding nearest its centroid.
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
# NOTE(review): the original indexed `email` (a raw string) here, which would
# join single characters; the cluster representatives live in `sentences`.
summary = ' '.join([sentences[closest[idx]] for idx in ordering])
@kushalchauhan98
kushalchauhan98 / pad_packed_demo.py
Created May 20, 2019 12:33 — forked from Tushar-N/pad_packed_demo.py
How to use pad_packed_sequence in pytorch
# Demo: packing variable-length sequences for an RNN with
# pack_padded_sequence / pad_packed_sequence (fragment — continues past this view).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# Three toy character sequences of different lengths.
seqs = ['gigantic_string','tiny_str','medium_str']
# make <pad> idx 0
# Character vocabulary over all sequences, with '<pad>' reserved at index 0
# so padded positions embed to index 0.
vocab = ['<pad>'] + sorted(set(''.join(seqs)))
# make model