Skip to content

Instantly share code, notes, and snippets.

Avatar

Laura Langdon LauraLangdon

View GitHub Profile
View gist:58707f6bc579b6f4c64a717571535737
def majority_vote(tweet_vector, train_set, k) -> str:
"""
Count how many of the k-NN tweets were written by Trump or not-Trump,
and return whichever is larger
:param tweet_vector: vector of given tweet
:param train_set: training set
:param k: desired number of nearest neighbors
:return: Whether tweet was authored by Trump, not Trump, or draw
View gist:e06d551e715ecd3774b9d8b63ae4265c
def knn(tweet_vector, train_set, k) -> list:
"""
Find k nearest neighbors of a given tweet
:param tweet_vector: vector of tweet whose neighbors we seek
:param train_set: training set
:param k: desired number of nearest neighbors
:return: list of indices in main tweet list of k nearest neighbors, and distances of those
neighbors to given tweet
View gist:3844439fc0151be27dda160801a3ec20
def get_distance(tweet1_vector, tweet2_vector) -> int:
"""
Implement Minkowski distance metric
:param tweet1_vector: vector of first tweet
:param tweet2_vector: vector of second tweet
:return: Minkowski distance between tweets
"""
distance = 0
View gist:32ad8ba990a1fb74a2c20f59551cf590
def split_train_test(tweet_vectors, randomized_tweet_vectors) -> tuple:
"""
Split into train and test sets
:param tweet_vectors: tweets in vector form
:param randomized_tweet_vectors: tweet vectors in randomly shuffled order
:return: train_set, test_set tuple of train set and test set
"""
x_train_dim = math.floor(0.8 * tweet_vectors.shape[0]) # Use 80% of data for train set
x_test_dim = math.ceil(0.2 * tweet_vectors.shape[0]) # Use 20% of data for test set
View gist:4a75924ddc67bbe28b185976bc810ca9
def randomize_vectors(tweet_vectors):
"""
:param tweet_vectors: a 2-D Numpy array of tweet vectors to be shuffled
:return: randomized_tweet_vectors: a Numpy array of tweet vectors that have
been randomly shuffled
"""
#Initialize randomized tweet vectors
randomized_tweet_vectors = np.zeros((tweet_vectors.shape[0], tweet_vectors.shape[1]), dtype=int)
View gist:a1a8e1deee02479a9c09892f326a51ef
def individual_tweet_vectorizer(corpus, tweet, index=0, author=''):
"""
Formats a single tweet as a vector
:param corpus: list of all words in tweets
:param tweet: tweet to be vectorized
:param index: index of tweet in main list of tweets
:param author: Trump or general
:return: Single tweet in vector form
View gist:633c1ce4213956562c1cfa452b5551df
def clean_text(corpus, input_string: str) -> list:
"""
Clean text data and add to corpus
:param corpus: list of all words in the data
:param input_string: string of words to be added to the corpus
:return: output_string_as_list: cleaned list of words from input string
"""
input_string = re.split(r'\W+', input_string)
View gist:a4cad4d4d13e9a3ec5b9a1a963e6d9da
def read_file(file_name: str, key_name='') -> list:
"""
Open and read a .csv.gz, .tsv, .csv, or JSON file; return as list
:param file_name: Name of file
:param key_name: Name of JSON key (optional)
:return: data_list: Data in list form
"""
View gist:2ac2d336e7257fd860a794bcd705269b
def csv_list_maker(data_file, delimiter=',') -> list:
"""
Turn data in csv form into list form
:param data_file: file containing the data
:param delimiter: character delimiting the data
:return: data_list: data in list form
"""
@LauraLangdon
LauraLangdon / gist:54a36a30993f85953fca6084bf89a3ca
Created Apr 1, 2020
AttributeError: 'AxesImage' object has no property 'unique'
View gist:54a36a30993f85953fca6084bf89a3ca
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-19-5ea46c4de867> in <module>
1 bears = bears.new(item_tfms=RandomResizedCrop(128, min_scale=0.3))
2 dls = bears.dataloaders(path)
----> 3 dls.train.show_batch(max_n=4, nrows=1, unique=True)
/opt/conda/envs/fastai/lib/python3.7/site-packages/fastai2/data/core.py in show_batch(self, b, max_n, ctxs, show, **kwargs)
90 if b is None: b = self.one_batch()
91 if not show: return self._pre_show_batch(b, max_n=max_n)