Skip to content

Instantly share code, notes, and snippets.

@Alex-Kopylov
Created February 7, 2019 14:40
Show Gist options
  • Save Alex-Kopylov/1017675379164d3e823fc57f06410ffd to your computer and use it in GitHub Desktop.
Save Alex-Kopylov/1017675379164d3e823fc57f06410ffd to your computer and use it in GitHub Desktop.
import hashlib
import time

import numpy as np
def _sample_hashes(dataset):
    """Return one SHA-1 digest per sample in *dataset*.

    Each sample is passed straight to ``hashlib.sha1``, so it must be a
    bytes-like object.
    """
    return [hashlib.sha1(sample).digest() for sample in dataset]


# perf_counter() is a monotonic interval timer, so the reported duration
# cannot be skewed by wall-clock adjustments the way time.time() can.
t1 = time.perf_counter()

# Fingerprint every sample so that exact duplicates can be detected by
# comparing fixed-size digests instead of the raw sample data.
train_hashes = _sample_hashes(train_dataset)
valid_hashes = _sample_hashes(valid_dataset)
test_hashes = _sample_hashes(test_dataset)

# Boolean masks with one entry per sample of the first argument: True where
# that sample's digest also occurs in the second split.
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
valid_in_train = np.isin(valid_hashes, train_hashes)
test_in_train = np.isin(test_hashes, train_hashes)
test_in_valid = np.isin(test_hashes, valid_hashes)

# Keep validation samples that are absent from train, and test samples that
# are absent from both train and validation.
valid_keep = ~valid_in_train
test_keep = ~(test_in_train | test_in_valid)

# Apply the same mask to data and labels so they stay aligned.
valid_dataset_clean = valid_dataset[valid_keep]
valid_labels_clean = valid_labels[valid_keep]
test_dataset_clean = test_dataset[test_keep]
test_labels_clean = test_labels[test_keep]

t2 = time.perf_counter()

print("Time: %0.2fs" % (t2 - t1))
print("valid -> train overlap: %d samples" % valid_in_train.sum())
print("test -> train overlap: %d samples" % test_in_train.sum())
print("test -> valid overlap: %d samples" % test_in_valid.sum())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment