Visualizing C programs from the Codechef database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import networkx as nx | |
import pickle | |
from collections import deque | |
import multiprocessing as mp | |
import editdistance | |
import os | |
from sklearn.decomposition import PCA, KernelPCA | |
from sklearn.manifold import TSNE | |
from tqdm import tqdm | |
# Number of programs kept for the distance matrix and the plots.
samplesize = 1000


def wrapper(args):
    """Return the edit distance between the two sequences packed in ``args``.

    The pair arrives as a single tuple so this function can be handed
    directly to a ``multiprocessing`` pool, which passes one item per call.
    """
    first, second = args
    return editdistance.eval(first, second)
if not os.path.exists('distmatrix.csv'):
    # Everything in this branch is skipped when a cached distance matrix
    # already exists on disk.
    # ----------------------------------------------------------------------------------------
    # Read the dataset
    # ----------------------------------------------------------------------------------------
    sols = pd.read_csv('solutions.csv', usecols=['QCode', 'SolutionID', 'Status', 'Language'])
    sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna()  # Only C
    # Keep only the two verdicts that are compared in the plots.
    sols = sols.loc[sols.Status.isin(['accepted', 'wrong answer'])]
    # Thrice the sample size we need: rows with missing values are dropped
    # again before the final sample is drawn.
    sols = sols.dropna().sample(samplesize * 3)

    # The source code of the solutions is spread over three CSV files.
    code_parts = [pd.read_csv('code/' + part + '.csv') for part in ['first', 'second', 'third']]
    print('Concatenating solutions')
    c = pd.concat(code_parts)
    del code_parts  # free the per-file frames before the merge
    print('Merging with dataset')
    df = sols.merge(c, how='left', on='SolutionID')
    del c, sols
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    df.info()
# The 32 C keywords that survive the reduction.
_C_KEYWORDS = (
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
    'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int',
    'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static',
    'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile',
    'while',
)
# Digits and punctuation that are kept as single-character tokens.
_SPECIAL_CHARS = '`1234567890-=+_)(*&^%$#@!~[]{}\\|";:/?.>,<' + "'"
# Built once: set membership is O(1) and the table pads in a single pass.
_KEPT_TOKENS = frozenset(_C_KEYWORDS) | frozenset(_SPECIAL_CHARS)
_PAD_TABLE = str.maketrans({ch: ' {} '.format(ch) for ch in _SPECIAL_CHARS})


def to_keywords_only(code):
    """Reduce C source text to its keyword / punctuation skeleton.

    Every digit and punctuation character is isolated as its own token, the
    text is split on whitespace, and only C keywords and those single
    characters are kept, joined by single spaces. Identifiers and other
    words are dropped.

    Parameters
    ----------
    code : object
        Source text. Anything that is not a ``str`` (e.g. a missing value)
        yields an empty string.

    Returns
    -------
    str
        Space-separated kept tokens, in their original order.
    """
    if not isinstance(code, str):
        return ''
    padded = code.translate(_PAD_TABLE)
    # split() with no argument splits on any whitespace run, so keywords next
    # to tabs, newlines or carriage returns are recognised as well.
    return ' '.join(word for word in padded.split() if word in _KEPT_TOKENS)
# Same cache check as the dataset-loading step: this section only runs when
# distmatrix.csv has not been written yet.
if not os.path.exists('distmatrix.csv'):
    tqdm.pandas(desc='applying_structure', ncols=100)
    df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
    df.info()  # info() prints by itself; print(df.info()) would add "None"
    # ----------------------------------------------------------------------------------------
    # Take a sample from the dataset and calculate distance matrix
    # ----------------------------------------------------------------------------------------
    print('Taking {} programs as sample'.format(samplesize))
    sample = df.dropna().sample(samplesize).copy()
    sample.to_csv('sample.csv', index=False)
    del df  # We no longer need df

    print('Calculating Distance Matrix')
    structures = sample.Structure.values
    solution_ids = sample.SolutionID.values
    # NOTE(review): the pool is created at module level with no
    # `if __name__ == '__main__':` guard visible in this script; confirm that
    # is acceptable for the start method used on the target platform.
    with mp.Pool() as pool:
        args = ((s1, s2) for s1 in structures for s2 in structures)
        # imap (not imap_unordered) keeps results in submission order, which
        # is required because the distances are paired with the ID columns
        # below purely by position. A chunksize avoids one IPC round trip
        # per pair.
        work = pool.imap(wrapper, args, chunksize=1000)
        distances = list(tqdm(work, ncols=100, total=len(sample) ** 2))

    print('Saving distmatrix to disk')
    # Both ID columns enumerate pairs in the same row-major order as `args`.
    distmatrix = pd.DataFrame({
        's1': [x for x in solution_ids for _ in solution_ids],
        's2': [y for _ in solution_ids for y in solution_ids],
        'distance': distances,
    })
    # Numeric IDs: drop the first character of SolutionID and parse the rest.
    # Presumably the IDs carry a one-letter prefix; verify against the data.
    distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
    distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
    distmatrix.to_csv('distmatrix.csv', index=False)
# Reload both artefacts from disk so the remaining steps behave the same
# whether the files were just written or come from an earlier run.
sample = pd.read_csv('sample.csv')
distmatrix = pd.read_csv('distmatrix.csv')
print('Done')
# ----------------------------------------------------------------------------------------
# Calculate graph positions on x, y plane
# ----------------------------------------------------------------------------------------
distmatrix.info()  # info() prints by itself; wrapping it in print() adds "None"
# Now we locate x, y locations for the points on the graph.
print('Making graph')
G = nx.Graph()
# One node per distinct numeric solution id.
G.add_nodes_from(set(distmatrix['s1n']))
def gen_edges(distmatrix):
    """Yield one weighted edge per row of the distance table.

    Parameters
    ----------
    distmatrix : pandas.DataFrame
        Must contain the columns ``s1n``, ``s2n`` and ``distance``; any other
        columns are ignored.

    Yields
    ------
    tuple
        ``(s1n, s2n, {'weight': distance})`` for each row, in row order, which
        is the form accepted by ``Graph.add_edges_from``.
    """
    # Select by name so the unpacking order below is fixed regardless of the
    # column order of the incoming frame.
    columns = distmatrix[['s1n', 's2n', 'distance']]
    for s1, s2, weight in columns.itertuples(index=False, name=None):
        yield (s1, s2, {'weight': weight})
G.add_edges_from(gen_edges(distmatrix[['s1n', 's2n', 'distance']]))
print('Calculating X, Y positions')
# NOTE(review): spring_layout reads the 'weight' edge attribute as attraction
# strength, so a larger edit distance pulls two programs closer together.
# Confirm this is the intended mapping.
pos = nx.spring_layout(G)  # dict: node id -> (x, y) array
sample['solidno'] = sample.SolutionID.str[1:].astype(int)
sample['xy'] = sample.solidno.map(pos)
# .str[i] indexes into each stored coordinate pair.
sample['x'] = sample.xy.str[0].astype(float)
sample['y'] = sample.xy.str[1].astype(float)
sample = sample.drop('xy', axis=1)
sample.info()  # info() prints by itself; wrapping it in print() adds "None"
sample.to_csv('sample.csv', index=False)
sample = pd.read_csv('sample.csv')
# ----------------------------------------------------------------------------------------
# Plot the figure
# ----------------------------------------------------------------------------------------
print('Plotting figure')
size = 7
# x / y are passed by keyword and the facet size as `height`: current seaborn
# releases no longer accept positional x, y or the old `size` keyword.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size,
           legend=False, col='Status')
plt.savefig("split.png")  # save as png
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size,
           legend=False)
plt.savefig("single.png")  # save as png
print('Done')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment