# ygivenx/gzip-text-classification.py

## gzip-text-classification.py
import gzip
import numpy as np
import concurrent.futures

from sklearn.datasets import fetch_20newsgroups

# dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# format for GZIP: one (text, label) row per document.
# dtype=object keeps every cell as the object it was built from. Without it
# NumPy coerces the mixed (str, int) pairs into a fixed-width unicode array,
# which turns the labels into strings and pads every cell to the width of the
# longest document.
training_set = np.array(
    list(zip(newsgroups_train.data, newsgroups_train.target)), dtype=object
)
test_set = np.array(
    list(zip(newsgroups_test.data, newsgroups_test.target)), dtype=object
)


# model

def get_gzip_preds(item, k=10, train_data=None):
    """Predict the label of one sample with gzip-based k-nearest neighbours.

    The distance between two texts is the normalized compression distance
    (NCD) computed from gzip-compressed lengths. The prediction is the most
    common label among the ``k`` training samples with the smallest NCD.

    Args:
        item: A ``(text, target)`` pair. Only ``text`` is used; ``target`` is
            accepted so that test-set rows can be passed through unchanged.
        k: Number of nearest neighbours that vote.
        train_data: Iterable of ``(text, label)`` pairs to compare against.
            Defaults to the module-level ``training_set``.

    Returns:
        The winning label. When several labels have the same number of
        votes, the one whose first supporter is closest to ``text`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or there is no training data.
    """
    # Local import so the block does not rely on a new file-level import.
    from collections import Counter

    if k < 1:
        raise ValueError("k must be at least 1")
    if train_data is None:
        train_data = training_set

    text, _ = item
    # Encode once; the bytes are reused for every pairwise compression.
    text_bytes = text.encode()
    c_text = len(gzip.compress(text_bytes))

    labels = []
    distances = []
    for other, label in train_data:
        other_bytes = other.encode()
        c_other = len(gzip.compress(other_bytes))
        # Joining the encoded parts gives the same bytes as encoding
        # " ".join([text, other]).
        c_both = len(gzip.compress(b" ".join((text_bytes, other_bytes))))
        distances.append((c_both - min(c_text, c_other)) / max(c_text, c_other))
        labels.append(label)

    if not labels:
        raise ValueError("train_data must contain at least one sample")

    # A stable sort keeps equally distant samples in training order, so the
    # neighbour list is the same on every run.
    nearest = np.argsort(np.array(distances), kind="stable")[:k]

    # Counter preserves first-seen order among equal counts, which makes the
    # tie-break deterministic (the previous set-based vote depended on set
    # iteration order).
    votes = Counter(labels[idx] for idx in nearest)
    return votes.most_common(1)[0][0]


# parallel
# The entry-point guard is required for process pools: under the "spawn" start
# method each worker re-imports this module, and an unguarded pool would be
# created again inside every worker.
if __name__ == "__main__":
    correct_predictions = 0
    total_predictions = 0
    # Defined up front so the final report also works for an empty test set.
    accuracy = 0.0

    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Start the function for each item, remembering which item each
        # future belongs to so the prediction can be checked on completion.
        futures = {executor.submit(get_gzip_preds, item): item for item in test_set}

        for future in concurrent.futures.as_completed(futures):
            predicted = future.result()
            actual = futures[future][1]  # second column is the true class
            total_predictions += 1

            if predicted == actual:
                correct_predictions += 1

            # Running accuracy over the items finished so far.
            accuracy = correct_predictions / total_predictions
            print(f'Accuracy so far: {accuracy}')

    # Print final accuracy
    print(f'Final accuracy: {accuracy}')
	import gzip
	import numpy as np
	import concurrent.futures

	from sklearn.datasets import fetch_20newsgroups

	# dataset
	# NOTE(review): this tab-indented section appears to be a second copy of
	# the script earlier in the file - confirm whether it should be removed.
	newsgroups_train = fetch_20newsgroups(subset='train')
	newsgroups_test = fetch_20newsgroups(subset='test')

	# format for GZIP: one (text, label) row per document.
	# dtype=object keeps every cell as the object it was built from. Without it
	# NumPy coerces the mixed (str, int) pairs into a fixed-width unicode array,
	# which turns the labels into strings and pads every cell to the width of
	# the longest document.
	training_set = np.array(
	    list(zip(newsgroups_train.data, newsgroups_train.target)), dtype=object
	)
	test_set = np.array(
	    list(zip(newsgroups_test.data, newsgroups_test.target)), dtype=object
	)


	# model

	def get_gzip_preds(item, k=10, train_data=None):
	    """Predict the label of one sample with gzip-based k-nearest neighbours.

	    The distance between two texts is the normalized compression distance
	    (NCD) computed from gzip-compressed lengths. The prediction is the most
	    common label among the ``k`` training samples with the smallest NCD.

	    NOTE(review): this tab-indented definition appears to duplicate the one
	    earlier in the file; its nesting was restored from the statement
	    syntax. Confirm whether this copy should be removed.

	    Args:
	        item: A ``(text, target)`` pair. Only ``text`` is used; ``target``
	            is accepted so that test-set rows can be passed unchanged.
	        k: Number of nearest neighbours that vote.
	        train_data: Iterable of ``(text, label)`` pairs to compare against.
	            Defaults to the module-level ``training_set``.

	    Returns:
	        The winning label. When several labels have the same number of
	        votes, the one whose first supporter is closest to ``text`` wins.

	    Raises:
	        ValueError: If ``k`` is smaller than 1 or there is no training data.
	    """
	    # Local import so the block does not rely on a new file-level import.
	    from collections import Counter

	    if k < 1:
	        raise ValueError("k must be at least 1")
	    if train_data is None:
	        train_data = training_set

	    text, _ = item
	    # Encode once; the bytes are reused for every pairwise compression.
	    text_bytes = text.encode()
	    c_text = len(gzip.compress(text_bytes))

	    labels = []
	    distances = []
	    for other, label in train_data:
	        other_bytes = other.encode()
	        c_other = len(gzip.compress(other_bytes))
	        # Joining the encoded parts gives the same bytes as encoding
	        # " ".join([text, other]).
	        c_both = len(gzip.compress(b" ".join((text_bytes, other_bytes))))
	        distances.append((c_both - min(c_text, c_other)) / max(c_text, c_other))
	        labels.append(label)

	    if not labels:
	        raise ValueError("train_data must contain at least one sample")

	    # A stable sort keeps equally distant samples in training order, so the
	    # neighbour list is the same on every run.
	    nearest = np.argsort(np.array(distances), kind="stable")[:k]

	    # Counter preserves first-seen order among equal counts, which makes the
	    # tie-break deterministic (the previous set-based vote depended on set
	    # iteration order).
	    votes = Counter(labels[idx] for idx in nearest)
	    return votes.most_common(1)[0][0]


	# parallel
	# NOTE(review): this tab-indented section appears to be a second copy of
	# the evaluation loop earlier in the file; its nesting was restored from
	# the statement syntax. Confirm whether this copy should be removed.
	#
	# The entry-point guard is required for process pools: under the "spawn"
	# start method each worker re-imports this module, and an unguarded pool
	# would be created again inside every worker.
	if __name__ == "__main__":
	    correct_predictions = 0
	    total_predictions = 0
	    # Defined up front so the final report also works for an empty test set.
	    accuracy = 0.0

	    with concurrent.futures.ProcessPoolExecutor() as executor:
	        # Start the function for each item, remembering which item each
	        # future belongs to so the prediction can be checked on completion.
	        futures = {executor.submit(get_gzip_preds, item): item for item in test_set}

	        for future in concurrent.futures.as_completed(futures):
	            predicted = future.result()
	            actual = futures[future][1]  # second column is the true class
	            total_predictions += 1

	            if predicted == actual:
	                correct_predictions += 1

	            # Running accuracy over the items finished so far.
	            accuracy = correct_predictions / total_predictions
	            print(f'Accuracy so far: {accuracy}')

	    # Print final accuracy
	    print(f'Final accuracy: {accuracy}')