import nltk
from nltk.corpus import stopwords as nltk_stopwords

# assumed setup (not shown in the original gist): NLTK lemmatizer, Porter stemmer,
# and the English stopword list
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
stopwords = set(nltk_stopwords.words("english"))

def tokenize_lemma_stopwords(text):
    tokens = nltk.tokenize.word_tokenize(text.lower())  # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()]  # keep tokens made of alphabetic characters only
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]  # reduce words to their stems
    tokens = [t for t in tokens if len(t) > 2]  # remove short words; they're probably not useful
    tokens = [t for t in tokens if t not in stopwords]  # remove stopwords
    return tokens
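Before vectorizing, it is worth sanity-checking the cleaning function on a sample sentence. The NLTK resources it relies on (punkt, wordnet, stopwords) must be downloaded first; the output comment shows what the tokenizer, lemmatizer, and Porter stemmer typically produce for this input:

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

print(tokenize_lemma_stopwords("The quick brown foxes are jumping over the lazy dogs."))
# e.g. ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']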
import gensim

# Create a dictionary mapping each vocabulary word to an index and a count
dictionary = gensim.corpora.Dictionary(X)
# Filter out words that occur in fewer than 5 documents or in more than 50% of all documents;
# keep the 100,000 most frequent words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Create the bag-of-words representation ==> list of (index, count) pairs for words in the dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in X]
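A dictionary plus a bag-of-words corpus are the standard inputs for topic modelling in gensim; assuming that is the goal here, a minimal LDA sketch looks like this (the topic count and pass count are illustrative, not from the original):

lda_model = gensim.models.LdaMulticore(
    bow_corpus,          # bag-of-words corpus built above
    num_topics=10,       # illustrative choice; tune for your data
    id2word=dictionary,
    passes=2,
)
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx}: {topic}")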
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData["content"])
vectorised_test_documents = vectorizer.transform(cleanedTestData["content"])
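Note that the vectorizer is fitted on the training split only and merely applied to the test split, which keeps the test vocabulary from leaking into the model. A quick shape check confirms both matrices share the same feature space:

print(vectorised_train_documents.shape)  # (n_train_docs, vocabulary_size)
print(vectorised_test_documents.shape)   # (n_test_docs, vocabulary_size)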
from yellowbrick.text import FreqDistVisualizer

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() instead
features = vectorizer.get_feature_names_out()
visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(vectorised_train_documents)
visualizer.show()
from yellowbrick.text import UMAPVisualizer
from sklearn.cluster import KMeans  # used below to colour the projection by cluster

umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents)
umap.show()
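The KMeans import suggests the projection is meant to be coloured by cluster assignment, which is the pattern Yellowbrick's own documentation uses for UMAPVisualizer; a sketch of that variant (the cluster count is illustrative):

clusters = KMeans(n_clusters=5)  # illustrative cluster count
clusters.fit(vectorised_train_documents)

umap = UMAPVisualizer(metric="cosine")
umap.fit(vectorised_train_documents, ["c{}".format(c) for c in clusters.labels_])
umap.show()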
import os
import cv2
from PIL import Image
import json
import shutil

# paths where the dataset is downloaded
inputPath = r"../../../headsegmentation_dataset_ccncsa/samples/"
labelsPath = r"../../../headsegmentation_dataset_ccncsa/labels"
from imutils import paths
import numpy as np
import argparse
import cv2
import os

def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # compute the (relative) horizontal gradient between adjacent column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    # convert the boolean difference image to a single integer hash
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
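A typical way to use dhash, and presumably the reason imutils.paths is imported, is to hash every image in the dataset and group files that share a hash; a minimal sketch, assuming the images live under inputPath:

hashes = {}
for imagePath in paths.list_images(inputPath):
    image = cv2.imread(imagePath)
    h = dhash(image)
    # group image paths by hash; any bucket with more than one entry is a duplicate set
    hashes.setdefault(h, []).append(imagePath)

duplicates = {h: p for h, p in hashes.items() if len(p) > 1}
print(f"{len(duplicates)} duplicate hash buckets found")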
# current folder
allFilesPath = r"../../faceSegmentation/dataset/"
# folder to store sub-samples
subSamplesPath = r"../../dataset/subsamples/"

# Iterate through the images and masks folders in parallel,
# check whether every image has a mask label,
# and count the number of samples in each category.
oldFolders = {}
# sort both listings so images and masks pair up deterministically
for imageFile, maskFile in zip(sorted(os.listdir(os.path.join(allFilesPath, "images"))),
                               sorted(os.listdir(os.path.join(allFilesPath, "masks")))):
    # the original gist is truncated here; this check is a minimal reconstruction that
    # assumes each image and its mask share a filename stem
    assert os.path.splitext(imageFile)[0] == os.path.splitext(maskFile)[0], \
        f"image/mask mismatch: {imageFile} vs {maskFile}"
from torch.utils.data import Dataset, DataLoader
import glob
import os
import numpy as np
import cv2
import torch
from torchvision import transforms, utils
from PIL import Image
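These imports set up a custom PyTorch Dataset for the image/mask pairs, but the class definition itself is truncated in the gist. The following is only a minimal sketch of what such a dataset typically looks like; the class name, folder layout, and transform handling are assumptions, not the original code:

class SegmentationDataset(Dataset):
    # hypothetical minimal dataset: pairs images with masks by sorted filename order
    def __init__(self, imageDir, maskDir, transform=None):
        self.imagePaths = sorted(glob.glob(os.path.join(imageDir, "*")))
        self.maskPaths = sorted(glob.glob(os.path.join(maskDir, "*")))
        self.transform = transform

    def __len__(self):
        return len(self.imagePaths)

    def __getitem__(self, idx):
        image = Image.open(self.imagePaths[idx]).convert("RGB")
        mask = Image.open(self.maskPaths[idx]).convert("L")
        if self.transform is not None:
            image = self.transform(image)
            mask = self.transform(mask)
        return image, mask

Wrapped in a DataLoader (e.g. DataLoader(dataset, batch_size=4, shuffle=True)), this feeds batches of image/mask pairs to the training loop.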
from torchvision import models
from torchvision.models.segmentation.deeplabv3 import DeepLabHead

def createDeepLabv3(outputchannels=1):
    model = models.segmentation.deeplabv3_resnet101(
        pretrained=True, progress=True)
    # Replace the classifier head so the network predicts `outputchannels` classes
    # instead of the 21 COCO/VOC classes it was pretrained on
    model.classifier = DeepLabHead(2048, outputchannels)
    # Set the model in training mode
    model.train()
    return model
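A quick smoke test of the factory function; torchvision's segmentation models return a dict whose 'out' entry holds the logits, upsampled to the input resolution:

model = createDeepLabv3(outputchannels=1)
dummy = torch.rand(2, 3, 224, 224)  # batch of 2 RGB images
output = model(dummy)["out"]
print(output.shape)  # torch.Size([2, 1, 224, 224])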