Mageshwaran R (magesh-technovator)
magesh-technovator / preprocess_text.py
Last active February 16, 2020 11:58
nlp_preprocessing
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords as nltk_stopwords

wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stopwords = set(nltk_stopwords.words("english"))

def tokenize_lemma_stopwords(text):
    tokens = nltk.tokenize.word_tokenize(text.lower())  # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()]  # keep tokens that contain only letters
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]  # reduce words to a common stem
    tokens = [t for t in tokens if len(t) > 2]  # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in stopwords]  # remove stopwords
    return tokens
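
The imports above assume NLTK's WordNet lemmatizer, Porter stemmer, and English stopword list, which is what the function body implies. A quick usage sketch (the sample sentence is made up; the punkt, wordnet, and stopwords corpora must be downloaded once beforehand):

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

# Illustrative input; prints the cleaned token list
print(tokenize_lemma_stopwords("The quick brown foxes were jumping over the lazy dogs."))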
magesh-technovator / text2bow.py
Last active December 25, 2019 11:15
pre-processed text to bag of words with gensim
import gensim

# X: list of token lists produced by the pre-processing step
# Create a dictionary mapping each vocabulary word to an id and document count
dictionary = gensim.corpora.Dictionary(X)
# Filter out words that occur in fewer than 5 documents or in more than 50% of all documents,
# then keep only the 100000 most frequent of the remaining words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Create the bag-of-words corpus ==> one list of (word_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(doc) for doc in X]
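
For intuition, a tiny toy example of what doc2bow returns (documents and ids are illustrative, not from the original):

docs = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
toy_dictionary = gensim.corpora.Dictionary(docs)
print(toy_dictionary.doc2bow(["cat", "cat", "dog"]))
# e.g. [(0, 2), (3, 1)] -- "cat" appears twice, "dog" once; exact ids depend on the dictionary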
magesh-technovator / tf_idf.py
Created February 16, 2020 13:37
TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData["content"])
vectorised_test_documents = vectorizer.transform(cleanedTestData["content"])
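
Because the vectorizer is fitted on the training split only and merely applied to the test split, the test documents are projected onto the training vocabulary and no test-set statistics leak into the features. A quick shape check, assuming the two DataFrames above each have a "content" column:

print(vectorised_train_documents.shape)  # (num_train_docs, vocabulary_size), a sparse matrix
print(vectorised_test_documents.shape)   # (num_test_docs, vocabulary_size), same vocabulary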
magesh-technovator / word_frequency_dist.py
Created February 16, 2020 13:54
Yellowbrick frequency distribution of words
from yellowbrick.text import FreqDistVisualizer
features = vectorizer.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(vectorised_train_documents)
visualizer.show()
magesh-technovator / umap.py
Created February 16, 2020 14:00
visualizing corpus with umap
from yellowbrick.text import UMAPVisualizer
from sklearn.cluster import KMeans
umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents)
umap.show()
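
The KMeans import hints at coloring the projection by cluster; a sketch of that variant (the cluster count of 5 is arbitrary, not from the original):

clusters = KMeans(n_clusters=5).fit(vectorised_train_documents)
labels = ["cluster {}".format(c) for c in clusters.labels_]
umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents, labels)
umap.show()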
magesh-technovator / fac_segmentation_data_exploration.py
Created October 23, 2020 06:08
face segmentation dataset exploration
import os
import cv2
from PIL import Image
import json
import shutil
# path where the dataset is downloaded
inputPath = r"../../../headsegmentation_dataset_ccncsa/samples/"
labelsPath = r"../../../headsegmentation_dataset_ccncsa/labels"
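
The preview ends at the paths; a minimal sketch of the kind of exploration the imports suggest (directory layout and file names are assumptions):

sampleFiles = sorted(os.listdir(inputPath))
labelFiles = sorted(os.listdir(labelsPath))
print("samples:", len(sampleFiles), "labels:", len(labelFiles))

# Peek at the first sample to check resolution and channels
img = cv2.imread(os.path.join(inputPath, sampleFiles[0]))
print("sample shape:", img.shape)  # (height, width, channels)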
magesh-technovator / detect_and_remove_duplicates.py
Created October 23, 2020 07:03
Near duplicates detection and removal
from imutils import paths
import numpy as np
import argparse
import cv2
import os
def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # compute the horizontal gradient between adjacent column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    # pack the boolean difference image into a single integer hash
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

# folder containing the full dataset
allFilesPath = r"../../faceSegmentation/dataset/"
# folder to store sub-samples
subSamplesPath = r"../../dataset/subsamples/"

# Iterate through the images and masks folders in parallel (sorted so pairs line up),
# check whether every image has a mask label,
# and count the number of samples in each category
oldFolders = {}
for imageFile, maskFile in zip(sorted(os.listdir(os.path.join(allFilesPath, "images"))), sorted(os.listdir(os.path.join(allFilesPath, "masks")))):
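
The loop body is cut off in the gist preview. For the near-duplicate removal the description promises, a common pattern built on dhash looks like this (a sketch, not necessarily the author's exact logic):

# Group images by hash; every image after the first in a group is a near-duplicate
hashes = {}
for imagePath in paths.list_images(allFilesPath):
    h = dhash(cv2.imread(imagePath))
    hashes.setdefault(h, []).append(imagePath)

for h, hashedPaths in hashes.items():
    for p in hashedPaths[1:]:
        os.remove(p)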
magesh-technovator / datahandler.py
Last active May 20, 2021 10:39
segmentation_DataLoader
from torch.utils.data import Dataset, DataLoader
import glob
import os
import numpy as np
import cv2
import torch
from torchvision import transforms, utils
from PIL import Image
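
The preview stops at the imports; a minimal sketch of what such a segmentation Dataset typically looks like (class name, folder layout, and sample format are assumptions, not the gist's actual code):

class SegmentationDataset(Dataset):
    # Hypothetical image/mask dataset; the real datahandler.py may differ
    def __init__(self, imageDir, maskDir, transform=None):
        self.imagePaths = sorted(glob.glob(os.path.join(imageDir, "*")))
        self.maskPaths = sorted(glob.glob(os.path.join(maskDir, "*")))
        self.transform = transform

    def __len__(self):
        return len(self.imagePaths)

    def __getitem__(self, idx):
        image = cv2.cvtColor(cv2.imread(self.imagePaths[idx]), cv2.COLOR_BGR2RGB)
        mask = cv2.imread(self.maskPaths[idx], cv2.IMREAD_GRAYSCALE)
        sample = {"image": image, "mask": mask}
        if self.transform:
            sample = self.transform(sample)
        return sample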
magesh-technovator / load_model.py
Last active May 20, 2021 10:40
Load DeepLabV3 model
from torchvision import models
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
def createDeepLabv3(outputchannels=1):
    model = models.segmentation.deeplabv3_resnet101(
        pretrained=True, progress=True)
    # Replace the classifier head so the model predicts `outputchannels` channels
    model.classifier = DeepLabHead(2048, outputchannels)
    # Set the model in training mode
    model.train()
    return model
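
A quick smoke test of the loader (the input size is arbitrary; torchvision's segmentation models return a dict whose "out" key holds the logits):

import torch

model = createDeepLabv3(outputchannels=1)
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))["out"]
print(out.shape)  # torch.Size([1, 1, 224, 224])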