Mageshwaran R (magesh-technovator)
magesh-technovator / preprocess_text.py
Last active February 16, 2020 11:58
nlp_preprocessing
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords as nltk_stopwords

wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stopwords = set(nltk_stopwords.words("english"))

def tokenize_lemma_stopwords(text):
    tokens = nltk.tokenize.word_tokenize(text.lower())  # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()]  # keep tokens that contain only letters
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]  # reduce words to a common stem
    tokens = [t for t in tokens if len(t) > 2]  # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in stopwords]  # remove stopwords
    return tokens
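
The imports above assume NLTK's WordNet lemmatizer, Porter stemmer, and English stopword list, which is what the function body implies. A quick usage sketch (the sample sentence is made up; the punkt, wordnet, and stopwords corpora must be downloaded once beforehand):

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

# Illustrative input; prints the cleaned token list
print(tokenize_lemma_stopwords("The quick brown foxes were jumping over the lazy dogs."))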
magesh-technovator / text2bow.py
Last active December 25, 2019 11:15
pre-processed text to bag of words with gensim
import gensim

# X: list of token lists produced by the pre-processing step
# Create a dictionary mapping each vocabulary word to an id and document count
dictionary = gensim.corpora.Dictionary(X)
# Filter out words that occur in fewer than 5 documents or in more than 50% of all documents,
# then keep only the 100000 most frequent of the remaining words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Create the bag-of-words corpus ==> one list of (word_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(doc) for doc in X]
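
For intuition, a tiny toy example of what doc2bow returns (documents and ids are illustrative, not from the original):

docs = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
toy_dictionary = gensim.corpora.Dictionary(docs)
print(toy_dictionary.doc2bow(["cat", "cat", "dog"]))
# e.g. [(0, 2), (3, 1)] -- "cat" appears twice, "dog" once; exact ids depend on the dictionary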
magesh-technovator / tf_idf.py
Created February 16, 2020 13:37
TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData["content"])
vectorised_test_documents = vectorizer.transform(cleanedTestData["content"])
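
Because the vectorizer is fitted on the training split only and merely applied to the test split, the test documents are projected onto the training vocabulary and no test-set statistics leak into the features. A quick shape check, assuming the two DataFrames above each have a "content" column:

print(vectorised_train_documents.shape)  # (num_train_docs, vocabulary_size), a sparse matrix
print(vectorised_test_documents.shape)   # (num_test_docs, vocabulary_size), same vocabulary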
magesh-technovator / word_frequency_dist.py
Created February 16, 2020 13:54
Yellowbrick frequency distribution of words
from yellowbrick.text import FreqDistVisualizer
features = vectorizer.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(vectorised_train_documents)
visualizer.show()
magesh-technovator / umap.py
Created February 16, 2020 14:00
visualizing corpus with umap
from yellowbrick.text import UMAPVisualizer
from sklearn.cluster import KMeans
umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents)
umap.show()
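
The KMeans import hints at coloring the projection by cluster; a sketch of that variant (the cluster count of 5 is arbitrary, not from the original):

clusters = KMeans(n_clusters=5).fit(vectorised_train_documents)
labels = ["cluster {}".format(c) for c in clusters.labels_]
umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents, labels)
umap.show()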
magesh-technovator / fac_segmentation_data_exploration.py
Created October 23, 2020 06:08
face segmentation dataset exploration
import os
import cv2
from PIL import Image
import json
import shutil
# path where the dataset is downloaded
inputPath = r"../../../headsegmentation_dataset_ccncsa/samples/"
labelsPath = r"../../../headsegmentation_dataset_ccncsa/labels"
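
The preview ends at the paths; a minimal sketch of the kind of exploration the imports suggest (directory layout and file names are assumptions):

sampleFiles = sorted(os.listdir(inputPath))
labelFiles = sorted(os.listdir(labelsPath))
print("samples:", len(sampleFiles), "labels:", len(labelFiles))

# Peek at the first sample to check resolution and channels
img = cv2.imread(os.path.join(inputPath, sampleFiles[0]))
print("sample shape:", img.shape)  # (height, width, channels)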
magesh-technovator / detect_and_remove_duplicates.py
Created October 23, 2020 07:03
Near duplicates detection and removal
from imutils import paths
import numpy as np
import argparse
import cv2
import os
def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # compute the horizontal gradient between adjacent column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    # pack the boolean difference image into a single integer hash
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

# folder containing the full dataset
allFilesPath = r"../../faceSegmentation/dataset/"
# folder to store sub-samples
subSamplesPath = r"../../dataset/subsamples/"

# Iterate through the images and masks folders in parallel (sorted so pairs line up),
# check whether every image has a mask label,
# and count the number of samples in each category
oldFolders = {}
for imageFile, maskFile in zip(sorted(os.listdir(os.path.join(allFilesPath, "images"))), sorted(os.listdir(os.path.join(allFilesPath, "masks")))):
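
The loop body is cut off in the gist preview. For the near-duplicate removal the description promises, a common pattern built on dhash looks like this (a sketch, not necessarily the author's exact logic):

# Group images by hash; every image after the first in a group is a near-duplicate
hashes = {}
for imagePath in paths.list_images(allFilesPath):
    h = dhash(cv2.imread(imagePath))
    hashes.setdefault(h, []).append(imagePath)

for h, hashedPaths in hashes.items():
    for p in hashedPaths[1:]:
        os.remove(p)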
magesh-technovator / datahandler.py
Last active May 20, 2021 10:39
segmentation_DataLoader
from torch.utils.data import Dataset, DataLoader
import glob
import os
import numpy as np
import cv2
import torch
from torchvision import transforms, utils
from PIL import Image
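
The preview stops at the imports; a minimal sketch of what such a segmentation Dataset typically looks like (class name, folder layout, and sample format are assumptions, not the gist's actual code):

class SegmentationDataset(Dataset):
    # Hypothetical image/mask dataset; the real datahandler.py may differ
    def __init__(self, imageDir, maskDir, transform=None):
        self.imagePaths = sorted(glob.glob(os.path.join(imageDir, "*")))
        self.maskPaths = sorted(glob.glob(os.path.join(maskDir, "*")))
        self.transform = transform

    def __len__(self):
        return len(self.imagePaths)

    def __getitem__(self, idx):
        image = cv2.cvtColor(cv2.imread(self.imagePaths[idx]), cv2.COLOR_BGR2RGB)
        mask = cv2.imread(self.maskPaths[idx], cv2.IMREAD_GRAYSCALE)
        sample = {"image": image, "mask": mask}
        if self.transform:
            sample = self.transform(sample)
        return sample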
magesh-technovator / load_model.py
Last active May 20, 2021 10:40
Load DeepLabV3 model
from torchvision import models
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
def createDeepLabv3(outputchannels=1):
    model = models.segmentation.deeplabv3_resnet101(
        pretrained=True, progress=True)
    # Replace the classifier head so the model predicts `outputchannels` channels
    model.classifier = DeepLabHead(2048, outputchannels)
    # Set the model in training mode
    model.train()
    return model
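
A quick smoke test of the loader (the input size is arbitrary; torchvision's segmentation models return a dict whose "out" key holds the logits):

import torch

model = createDeepLabv3(outputchannels=1)
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))["out"]
print(out.shape)  # torch.Size([1, 1, 224, 224])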