"""Visualizing C programs from the Codechef database."""
import itertools
import multiprocessing as mp
import os
import pickle
from collections import deque

import editdistance
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from tqdm import tqdm
# Number of programs sampled for the pairwise distance matrix and the plots;
# three times as many solution rows are drawn before merging with the code files.
samplesize = 1000
def wrapper(args):
    """Return the edit distance between the two sequences packed in ``args``.

    The pair arrives as a single ``(s1, s2)`` tuple because the function is
    mapped over a ``multiprocessing.Pool``, which passes one argument per task.
    """
    s1, s2 = args
    return editdistance.eval(s1, s2)
if not os.path.exists('distmatrix.csv'):
    # ----------------------------------------------------------------------------------------
    # Read the dataset
    # ----------------------------------------------------------------------------------------
    sols = pd.read_csv('solutions.csv', usecols=['QCode', 'SolutionID', 'Status', 'Language'])
    sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna()  # Only C
    sols = sols.loc[sols.Status.isin(['accepted', 'wrong answer'])]
    sols = sols.dropna().sample(samplesize * 3)  # Thrice the sample size we need

    # The source code is split across three CSV files under code/.
    f, s, t = [pd.read_csv('code/' + part + '.csv') for part in ['first', 'second', 'third']]
    print('Concatenating solutions')
    c = pd.concat([f, s, t])
    del f, s, t  # free the per-file frames once they are concatenated

    print('Merging with dataset')
    # Left join: sampled solutions without a matching code row are kept here
    # (with missing values) and removed by the later dropna().
    df = sols.merge(c, how='left', on='SolutionID')
    del c, sols
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly rather than wrapped in print().
    df.info()
# ----------------------------------------------------------------------------------------
# Create keyword structure
# ----------------------------------------------------------------------------------------
# The C keywords, listed explicitly so no stray whitespace or newline
# can end up inside a keyword.
_C_KEYWORDS = (
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
    'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int',
    'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static',
    'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile',
    'while',
)
# Every character here is treated as a one-character token (digits included).
# Raw string so the backslash is a literal character, not an escape.
_SPECIAL_CHARS = r'`1234567890-=+_)(*&^%$#@!~[]{}\|";:/?.>,<' + "'"
# Built once: membership tests against a frozenset are O(1) per token.
_ALLOWED_TOKENS = frozenset(_C_KEYWORDS) | frozenset(_SPECIAL_CHARS)
# Surround each special character with spaces in a single pass over the text.
_PAD_SPECIALS = str.maketrans({ch: ' {} '.format(ch) for ch in _SPECIAL_CHARS})


def to_keywords_only(code):
    """Reduce C source text to its keywords, digits and punctuation.

    Identifiers and any other words are dropped; what remains is a
    space-separated token string.

    Args:
        code: Source text. Any non-string value (e.g. NaN from a failed
            merge) is treated as missing.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when ``code`` is
        not a string.
    """
    if not isinstance(code, str):
        return ''
    padded = code.translate(_PAD_SPECIALS)
    # split() with no argument splits on any whitespace run, so tokens that
    # follow a tab or newline are recognised as well.
    return ' '.join(token for token in padded.split() if token in _ALLOWED_TOKENS)
# Same cache guard as the dataset-loading step: distmatrix.csv is only written
# at the very end of this branch, so the check cannot change in between.
if not os.path.exists('distmatrix.csv'):
    tqdm.pandas(desc='applying_structure', ncols=100)
    df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
    df.info()  # info() prints by itself and returns None

    # ----------------------------------------------------------------------------------------
    # Take a sample from the dataset and calculate distance matrix
    # ----------------------------------------------------------------------------------------
    print('Taking {} programs as sample'.format(samplesize))
    sample = df.dropna().sample(samplesize).copy()
    sample.to_csv('sample.csv', index=False)
    del df  # We no longer need df

    print('Calculating Distance Matrix')
    structures = sample.Structure.values
    solution_ids = sample.SolutionID.values
    # NOTE(review): the pool is created at module level with no
    # `if __name__ == '__main__':` guard, which the multiprocessing docs
    # require under the 'spawn' start method -- confirm the target platform.
    with mp.Pool() as pool:
        # imap (not imap_unordered) yields results in submission order, which
        # is required because the distances are matched to the id pairs below
        # purely by position.
        work = pool.imap(wrapper, itertools.product(structures, repeat=2), chunksize=256)
        distances = list(tqdm(work, ncols=100, total=len(sample) ** 2))

    print('Saving distmatrix to disk')
    # Same product order as the work items above: first id varies slowest.
    id_pairs = list(itertools.product(solution_ids, repeat=2))
    distmatrix = pd.DataFrame(id_pairs, columns=['s1', 's2'])
    distmatrix['distance'] = distances
    # Numeric ids: the SolutionID with its first character stripped.
    distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
    distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
    distmatrix.to_csv('distmatrix.csv', index=False)

# Always continue from the files on disk, whether they were just written or
# come from an earlier run.
sample = pd.read_csv('sample.csv')
distmatrix = pd.read_csv('distmatrix.csv')
print('Done')
# ----------------------------------------------------------------------------------------
# Calculate graph positions on x, y plane
# ----------------------------------------------------------------------------------------
distmatrix.info()  # info() prints by itself and returns None
# Now we locate x, y locations for the points on the graph.
print('Making graph')
G = nx.Graph()
# One node per distinct numeric solution id.
G.add_nodes_from(set(distmatrix['s1n']))
def gen_edges(distmatrix):
    """Yield one weighted edge per row of ``distmatrix``.

    Args:
        distmatrix: DataFrame containing at least the columns ``s1n``,
            ``s2n`` and ``distance``; other columns are ignored.

    Yields:
        ``(s1n, s2n, {'weight': distance})`` tuples, one per row, in row order.
    """
    columns = distmatrix[['s1n', 's2n', 'distance']]
    # Unpack in the same order the columns are selected. Indexing the row by
    # position previously took s1n as the weight and s2n/distance as endpoints.
    # itertuples also keeps each column's own dtype instead of upcasting the
    # whole row to a common type the way iterrows does.
    for s1, s2, weight in columns.itertuples(index=False, name=None):
        yield (s1, s2, {'weight': weight})
G.add_edges_from(gen_edges(distmatrix[['s1n', 's2n', 'distance']]))
print('Calculating X, Y positions')
# NOTE(review): networkx documents that a larger edge weight means a stronger
# attractive force in spring_layout, so feeding raw edit distances as weights
# pulls dissimilar programs together -- confirm whether a similarity (or a
# distance-based layout) was intended.
pos = nx.spring_layout(G)  # maps node id -> (x, y) position
sample['solidno'] = sample.SolutionID.str[1:].astype(int)
sample['xy'] = sample.solidno.map(pos)
# .str[i] indexes into each stored position, giving its i-th coordinate.
sample['x'] = sample.xy.str[0].astype(float)
sample['y'] = sample.xy.str[1].astype(float)
sample = sample.drop('xy', axis=1)
sample.info()  # info() prints by itself and returns None
# Persist the coordinates, then continue from the file on disk.
sample.to_csv('sample.csv', index=False)
sample = pd.read_csv('sample.csv')
# ----------------------------------------------------------------------------------------
# Plot the figure
# ----------------------------------------------------------------------------------------
print('Plotting figure')
size = 7
# x/y are passed by keyword and the facet size via `height`: current seaborn
# releases no longer accept positional x/y or the old `size` argument.
# One panel per Status value, points coloured by question code.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size,
           legend=False, col='Status')
plt.savefig("split.png")  # save as png
# All programs in a single panel.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size,
           legend=False)
plt.savefig("single.png")  # save as png
print('Done')
# Page footer captured along with the gist (not part of the script):
# "Sign up for free to join this conversation on GitHub.
#  Already have an account? Sign in to comment"