Skip to content

Instantly share code, notes, and snippets.

View kushalchauhan98's full-sized avatar
🔥
Setting TPUs on fire!

Kushal Chauhan kushalchauhan98

🔥
Setting TPUs on fire!
View GitHub Profile
@kushalchauhan98
kushalchauhan98 / email_summarization.py
Created August 1, 2018 16:40
A module for E-mail Summarization which uses clustering of skip-thought sentence embeddings.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module for E-mail Summarization
*****************************************************************************
Input Parameters:
emails: A list of strings containing the emails
Returns:
summary: A list of strings containing the summaries.
*****************************************************************************
"""
from talon.signature.bruteforce import extract_signature

# NOTE(review): the gist describes a clean() helper that is "a modified
# version of extract_signature() found in bruteforce.py in the GitHub
# repository linked above"; clean() is not defined in this fragment, so the
# stock talon extract_signature() is used here. Swap in clean() if available.
# Strip the signature block from the raw e-mail; the second return value
# (the signature itself) is discarded.
cleaned_email, _ = extract_signature(email)
# Collapse the body onto a single line: split on newlines, drop empties,
# re-join with spaces so downstream sentence tokenization sees flat text.
lines = cleaned_email.split('\n')
lines = [line for line in lines if line != '']
cleaned_email = ' '.join(lines)
@kushalchauhan98
kushalchauhan98 / detect_lang.py
Created August 2, 2018 14:07
Detect Language
# Detect the language of the cleaned e-mail body so the sentence tokenizer
# downstream can be configured per language.
from langdetect import detect
# `cleaned_email` is produced by the preprocessing snippet above; langdetect
# returns an ISO 639-1 code string.
lang = detect(cleaned_email) # lang = 'en' for an English email
@kushalchauhan98
kushalchauhan98 / split_sentences.py
Created August 2, 2018 14:08
Sentence Tokenizer
from nltk.tokenize import sent_tokenize

# langdetect yields ISO 639-1 codes ('en'), but nltk's sent_tokenize expects
# full lowercase language names ('english') to locate its punkt model —
# passing 'en' directly raises a LookupError at runtime.
_ISO_TO_NLTK = {
    'en': 'english',
    'de': 'german',
    'fr': 'french',
    'es': 'spanish',
    'it': 'italian',
    'pt': 'portuguese',
    'nl': 'dutch',
}

# Split the e-mail into sentences with the detected language's tokenizer,
# falling back to English for languages without a mapped punkt model.
sentences = sent_tokenize(email, language=_ISO_TO_NLTK.get(lang, 'english'))
@kushalchauhan98
kushalchauhan98 / skipthought_encode.py
Created August 2, 2018 14:09
Skip-Thought Encoder
# The 'skipthoughts' module can be found at the root of the GitHub repository linked above
import skipthoughts
# You would need to download pre-trained models first
# load_model() reads the pre-trained skip-thought weights from disk.
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
# Encode every sentence into a fixed-length skip-thought embedding.
# NOTE(review): presumably `encoded` is a 2-D array of shape
# (len(sentences), embedding_dim) — confirm against the skipthoughts docs.
encoded = encoder.encode(sentences)
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Heuristic: summary length grows as sqrt of the number of sentences.
# KMeans requires an integer cluster count and range() requires an int —
# np.ceil returns a float, so cast explicitly.
n_clusters = int(np.ceil(len(encoded) ** 0.5))
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)

# For each cluster, record the mean original position of its member
# sentences so clusters can be emitted in (roughly) document order.
avg = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
# Representative of each cluster: the sentence embedding nearest its centroid.
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
# NOTE(review): the original indexed `email` (a raw string) here, which would
# join single characters; the cluster representatives live in `sentences`.
summary = ' '.join([sentences[closest[idx]] for idx in ordering])
@kushalchauhan98
kushalchauhan98 / pad_packed_demo.py
Created May 20, 2019 12:33 — forked from Tushar-N/pad_packed_demo.py
How to use pad_packed_sequence in pytorch
# Demo: packing variable-length sequences for an RNN with
# pack_padded_sequence / pad_packed_sequence (fragment — continues past this view).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# Three toy character sequences of different lengths.
seqs = ['gigantic_string','tiny_str','medium_str']
# make <pad> idx 0
# Character vocabulary over all sequences, with '<pad>' reserved at index 0
# so padded positions embed to index 0.
vocab = ['<pad>'] + sorted(set(''.join(seqs)))
# make model