# rich-hart/collate_feud_csv.py

## collate_feud_csv.py
from networkx.algorithms.components import connected_components
import networkx as nx
import random
from fuzzywuzzy import fuzz
import numpy as np
import csv

# Currently referenced only from commented-out code.
RANDOMIZE = False

# Load responces.csv into `responces`: a dict mapping each column header to the
# list of non-empty cells found in that column, stripped and lower-cased.
# newline='' is what the csv module documents for files handed to csv.reader.
with open('responces.csv', newline='') as csvfile:
    responce_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # A completely empty file yields no header row; treat it as "no questions".
    headers = next(responce_reader, [])
    responces = {h: [] for h in headers}
    for row in responce_reader:
        # zip stops at the shorter sequence, so a row with fewer cells than the
        # header is read as far as it goes instead of raising IndexError, and
        # cells beyond the last header are ignored.
        for header, item in zip(headers, row):
            item = item.strip().lower()
            if item:
                responces[header].append(item)


#     answers = [ a.lower().strip() for r in answer_reader for a in r if a]
# Group near-duplicate answers for each question and count every group.
#
# All answer pairs of a question are scored with fuzz.ratio. Pairs scoring
# above SIMILARITY_THRESHOLD become edges of an undirected graph, and each
# connected component of that graph is one cluster. The result,
# `collated_answers`, maps question -> list of (answer, cluster size) sorted by
# size, largest first.
SIMILARITY_THRESHOLD = 70

collated_answers = {}
for question, answers in responces.items():
    # Every pair has to be scored: cells left at zero would hide links, which
    # can put identical answers into different clusters.
    ratio_matrix = np.zeros((len(answers), len(answers)))
    for i, ans_a in enumerate(answers):
        for j, ans_b in enumerate(answers):
            ratio_matrix[i, j] = fuzz.ratio(ans_a, ans_b)

    # 0/1 adjacency matrix; nx.Graph makes the edges undirected.
    graph_matrix = (ratio_matrix > SIMILARITY_THRESHOLD) * 1
    G = nx.to_networkx_graph(graph_matrix, create_using=nx.Graph)

    # Label each cluster with its earliest answer (smallest index) so the
    # label does not depend on set iteration order.
    clusters = [
        (answers[min(comp)], len(comp)) for comp in connected_components(G)
    ]
    clusters.sort(key=lambda x: x[1], reverse=True)
    collated_answers[question] = clusters

    # Console report: the question, a blank line, then "answer<TAB><TAB>count".
    clusters_str = '\n' + ''.join(f"{k}\t\t{v}\n" for k, v in clusters)
    print(question)
    print(clusters_str)

# Write the collated clusters to collated_answers.csv. Each question produces
# a '*****' separator row, a row holding the question text, and then one
# "answer,count" row per cluster in the order stored in collated_answers.
# newline='' leaves line endings to the csv module, as its documentation asks.
with open('collated_answers.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for question, answers in collated_answers.items():
        writer.writerow(['*****'])
        writer.writerow([question])
        # Each entry is an (answer, count) pair and becomes one row.
        writer.writerows(answers)
#    answers_writer = csv.DictWriter(csvfile,fieldnames=collated_answers.keys())
#    answers_writer.writeheader()
#    for q,a in collated_answers:

#keys = final_tally.keys()
#final_tally = [ (k,v) for k,v in final_tally.items()]
##final_tally.sort(key = lambda x: x[1])
#final_tally.sort(key = lambda x: x[1],reverse=True)
##final_tally = final_tally[:8]
##distinct_ratios = ratio_matrix  - np.identity(len(answers))*100
#if RANDOMIZE:
#    counts = np.array([x[1] + random.randint(0,5) for x in final_tally])
#else:
#    counts = np.array([x[1] + random.randint(0,4) for x in final_tally])
##counts = np.array([x[1] for x in final_tally])
#total = sum(counts)
#percent = [c * 100 / total for c in counts]
#
#sums = sum(ratio_matrix > 60)
##answer_scores = list(zip(keys,percent))
##answer_scores.sort(key = lambda x: x[1],reverse=True)
##print(answer_scores)
#final_counts = [ a for a in zip(answers,sums) if a[1]]
#final_counts.sort(key=lambda x: x[1],reverse=True)
#final_counts = final_counts[:12]
#if RANDOMIZE:
#    final_counts = [(x[0],x[1] + random.randint(0,5)) for x in final_counts]
#else:
#    final_counts = [(x[0],x[1]) for x in final_counts]
#
#total = sum([x[1] for x in final_counts])
#
#final_counts = [(x[0],int(x[1]*100/total)) for x in final_counts]
#
#final_counts = dict(final_counts)
#
#print(final_counts)
	from networkx.algorithms.components import connected_components
	import networkx as nx
	import random
	from fuzzywuzzy import fuzz
	import numpy as np
	import csv

	# Currently referenced only from commented-out code.
	RANDOMIZE = False

	# Load responces.csv into `responces`: a dict mapping each column header to
	# the list of non-empty cells found in that column, stripped and lower-cased.
	# newline='' is what the csv module documents for files handed to csv.reader.
	with open('responces.csv', newline='') as csvfile:
	    responce_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
	    # A completely empty file yields no header row; treat it as "no questions".
	    headers = next(responce_reader, [])
	    responces = {h: [] for h in headers}
	    for row in responce_reader:
	        # zip stops at the shorter sequence, so a row with fewer cells than
	        # the header is read as far as it goes instead of raising
	        # IndexError, and cells beyond the last header are ignored.
	        for header, item in zip(headers, row):
	            item = item.strip().lower()
	            if item:
	                responces[header].append(item)



	# answers = [ a.lower().strip() for r in answer_reader for a in r if a]
	# Group near-duplicate answers for each question and count every group.
	#
	# All answer pairs of a question are scored with fuzz.ratio. Pairs scoring
	# above SIMILARITY_THRESHOLD become edges of an undirected graph, and each
	# connected component of that graph is one cluster. The result,
	# `collated_answers`, maps question -> list of (answer, cluster size) sorted
	# by size, largest first.
	SIMILARITY_THRESHOLD = 70

	collated_answers = {}
	for question, answers in responces.items():
	    # Every pair has to be scored: cells left at zero would hide links,
	    # which can put identical answers into different clusters.
	    ratio_matrix = np.zeros((len(answers), len(answers)))
	    for i, ans_a in enumerate(answers):
	        for j, ans_b in enumerate(answers):
	            ratio_matrix[i, j] = fuzz.ratio(ans_a, ans_b)

	    # 0/1 adjacency matrix; nx.Graph makes the edges undirected.
	    graph_matrix = (ratio_matrix > SIMILARITY_THRESHOLD) * 1
	    G = nx.to_networkx_graph(graph_matrix, create_using=nx.Graph)

	    # Label each cluster with its earliest answer (smallest index) so the
	    # label does not depend on set iteration order.
	    clusters = [
	        (answers[min(comp)], len(comp)) for comp in connected_components(G)
	    ]
	    clusters.sort(key=lambda x: x[1], reverse=True)
	    collated_answers[question] = clusters

	    # Console report: the question, a blank line, then "answer<TAB><TAB>count".
	    clusters_str = '\n' + ''.join(f"{k}\t\t{v}\n" for k, v in clusters)
	    print(question)
	    print(clusters_str)

	# Write the collated clusters to collated_answers.csv. Each question produces
	# a '*****' separator row, a row holding the question text, and then one
	# "answer,count" row per cluster in the order stored in collated_answers.
	# newline='' leaves line endings to the csv module, as its documentation asks.
	with open('collated_answers.csv', 'w', newline='') as csvfile:
	    writer = csv.writer(csvfile, delimiter=',')
	    for question, answers in collated_answers.items():
	        writer.writerow(['*****'])
	        writer.writerow([question])
	        # Each entry is an (answer, count) pair and becomes one row.
	        writer.writerows(answers)
	# answers_writer = csv.DictWriter(csvfile,fieldnames=collated_answers.keys())
	# answers_writer.writeheader()
	# for q,a in collated_answers:

	#keys = final_tally.keys()
	#final_tally = [ (k,v) for k,v in final_tally.items()]
	##final_tally.sort(key = lambda x: x[1])
	#final_tally.sort(key = lambda x: x[1],reverse=True)
	##final_tally = final_tally[:8]
	##distinct_ratios = ratio_matrix - np.identity(len(answers))*100
	#if RANDOMIZE:
	# counts = np.array([x[1] + random.randint(0,5) for x in final_tally])
	#else:
	# counts = np.array([x[1] + random.randint(0,4) for x in final_tally])
	##counts = np.array([x[1] for x in final_tally])
	#total = sum(counts)
	#percent = [c * 100 / total for c in counts]
	#
	#sums = sum(ratio_matrix > 60)
	##answer_scores = list(zip(keys,percent))
	##answer_scores.sort(key = lambda x: x[1],reverse=True)
	##print(answer_scores)
	#final_counts = [ a for a in zip(answers,sums) if a[1]]
	#final_counts.sort(key=lambda x: x[1],reverse=True)
	#final_counts = final_counts[:12]
	#if RANDOMIZE:
	# final_counts = [(x[0],x[1] + random.randint(0,5)) for x in final_counts]
	#else:
	# final_counts = [(x[0],x[1]) for x in final_counts]
	#
	#total = sum([x[1] for x in final_counts])
	#
	#final_counts = [(x[0],int(x[1]*100/total)) for x in final_counts]
	#
	#final_counts = dict(final_counts)
	#
	#print(final_counts)