Skip to content

Instantly share code, notes, and snippets.

@ygivenx
Last active July 14, 2023 22:05
Show Gist options
  • Save ygivenx/1064d8913c33d1689c14ccb2b3467395 to your computer and use it in GitHub Desktop.
Save ygivenx/1064d8913c33d1689c14ccb2b3467395 to your computer and use it in GitHub Desktop.
Gzip Text classification Algorithm
import concurrent.futures
import functools
import gzip
from collections import Counter

import numpy as np
from sklearn.datasets import fetch_20newsgroups
# dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
# format for GZIP: one (text, label) row per document.
# dtype=object keeps every text as an ordinary Python str and every label in
# its original integer form.  Without it NumPy coerces the mixed str/int rows
# into a fixed-width unicode array, which pads every cell to the length of the
# longest document and turns the labels into strings.
training_set = np.array(
    list(zip(newsgroups_train.data, newsgroups_train.target)), dtype=object
)
test_set = np.array(
    list(zip(newsgroups_test.data, newsgroups_test.target)), dtype=object
)
# model
@functools.lru_cache(maxsize=None)
def _gzip_len(text):
    """Return the byte length of the gzip-compressed encoding of *text*.

    Memoized: each training document is compared against every query, so
    without the cache its standalone compressed length would be recomputed
    on every call to ``get_gzip_preds``.  The cache is unbounded; it holds
    one entry per distinct training text it has seen.
    """
    return len(gzip.compress(text.encode()))


def get_gzip_preds(item, k=10, train=None):
    """Predict the label of one document by k-nearest-neighbours over gzip NCD.

    The distance between the query text ``x1`` and a training text ``x2`` is

        (C(x1 + " " + x2) - min(C(x1), C(x2))) / max(C(x1), C(x2))

    where ``C`` is the length of the gzip-compressed encoded text.

    Args:
        item: A ``(text, label)`` pair.  Only the text is used; the label is
            ignored here so that callers can pass dataset rows unchanged.
        k: Number of nearest training documents that vote.  Must be >= 1.
        train: Iterable of ``(text, label)`` pairs to search.  Defaults to
            the module-level ``training_set``.

    Returns:
        The most frequent label among the ``k`` nearest training documents.
        When several labels share the top count, the one belonging to the
        nearest document among them is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if train is None:
        train = training_set

    x1, _ = item
    cx1 = len(gzip.compress(x1.encode()))

    distances = []
    labels = []
    for x2, label in train:
        cx2 = _gzip_len(x2)
        cx1x2 = len(gzip.compress(" ".join([x1, x2]).encode()))
        distances.append((cx1x2 - min(cx1, cx2)) / max(cx1, cx2))
        labels.append(label)

    if not labels:
        raise ValueError("training set is empty")

    # A stable sort keeps equally distant documents in training-set order, so
    # the selected neighbours do not depend on the sort algorithm's internals.
    nearest_idx = np.argsort(np.asarray(distances), kind="stable")[:k]
    nearest_labels = [labels[i] for i in nearest_idx]

    # Counter preserves first-seen order among equal counts, and the labels
    # are fed in nearest-first, so ties resolve towards the closest neighbour
    # instead of depending on set iteration order.
    return Counter(nearest_labels).most_common(1)[0][0]
# parallel
# Score every test document in a process pool and report running accuracy.
# The guard matters: worker processes started with the "spawn" method
# re-import this module, and without it each worker would try to create a
# pool of its own instead of just picking up the definitions above.
if __name__ == "__main__":
    correct_predictions = 0
    total_predictions = 0
    # Pre-bound so the final print still works when test_set is empty.
    accuracy = 0.0
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Start the function for each item; map each future back to its item
        # so the true label can be looked up when the future completes.
        futures = {executor.submit(get_gzip_preds, item): item for item in test_set}
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            item = futures[future]
            total_predictions += 1
            if result == item[1]:  # item[1] is the actual class
                correct_predictions += 1
            # Calculate the accuracy
            accuracy = correct_predictions / total_predictions
            print(f'Accuracy so far: {accuracy}')
    # Print final accuracy
    print(f'Final accuracy: {accuracy}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment