Skip to content

Instantly share code, notes, and snippets.

@rich-hart
Created January 17, 2020 06:33
Show Gist options
  • Save rich-hart/ea4b56b4dea453f907abb47c98196473 to your computer and use it in GitHub Desktop.
Save rich-hart/ea4b56b4dea453f907abb47c98196473 to your computer and use it in GitHub Desktop.
from networkx.algorithms.components import connected_components
import networkx as nx
import random
from fuzzywuzzy import fuzz
import numpy as np
import csv
# Flag kept so the module-level name stays available; the active code does
# not read it.
RANDOMIZE = False

# Load the survey export. The first row supplies the column headers (one per
# question); each later row contributes its cells to the matching column.
# newline='' is what the csv module asks for so that it can interpret line
# endings itself.
with open('responces.csv', newline='') as csvfile:
    responce_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # An empty file has no header row; fall back to "no questions" instead
    # of letting StopIteration escape.
    headers = next(responce_reader, [])
    responces = {h: [] for h in headers}
    for row in responce_reader:
        # zip() stops at the shorter side, so blank lines (parsed as []) and
        # rows with missing trailing cells are tolerated rather than raising
        # IndexError. Cells beyond the last header are ignored.
        for header, item in zip(headers, row):
            # Normalise case and surrounding whitespace before storing.
            item = item.strip().lower()
            # Blank cells are not recorded as answers.
            if item:
                responces[header].append(item)
# Two answers are linked when their fuzz.ratio score is strictly above this.
SIMILARITY_THRESHOLD = 70

# Maps each question to a list of (representative answer, cluster size)
# pairs, sorted with the largest cluster first.
collated_answers = {}
for question, answers in responces.items():
    answer_count = len(answers)

    # Score every unordered pair of answers once and mirror the value, so the
    # matrix is a complete, symmetric similarity table. Stopping at the first
    # loose match would leave most cells at zero and make the clusters depend
    # on the order of the answers.
    ratio_matrix = np.zeros((answer_count, answer_count))
    for i, ans_a in enumerate(answers):
        for j in range(i + 1, answer_count):
            ratio = fuzz.ratio(ans_a, answers[j])
            ratio_matrix[i, j] = ratio
            ratio_matrix[j, i] = ratio

    # Pairs above the threshold become edges of an undirected graph whose
    # nodes are answer indices; each connected component is one cluster.
    graph_matrix = (ratio_matrix > SIMILARITY_THRESHOLD) * 1
    G = nx.to_networkx_graph(graph_matrix, create_using=nx.Graph)

    clusters = {}
    for comp in connected_components(G):
        # Label the cluster with its earliest answer; min() makes the choice
        # independent of set iteration order.
        clusters[answers[min(comp)]] = len(comp)

    # sorted() is stable, so equally sized clusters keep their relative order.
    clusters = sorted(clusters.items(), key=lambda x: x[1], reverse=True)
    collated_answers[question] = clusters

    # Console report: the question, a blank line, then one
    # "answer<TAB><TAB>count" line per cluster.
    clusters_str = '\n' + ''.join(f"{k}\t\t{v}\n" for k, v in clusters)
    print(question)
    print(clusters_str)
# Write the collated results. Each question gets its own section: a '*****'
# separator row, a row holding the question text, then one "answer,count" row
# per entry in the stored order.
# The file is only written, so plain 'w' is enough; newline='' lets the csv
# writer control the line terminator itself.
with open('collated_answers.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for question, answers in collated_answers.items():
        writer.writerow(['*****'])
        writer.writerow([question])
        # Every entry is an (answer, count) pair and becomes one CSV row.
        writer.writerows(answers)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment