Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Visualizing C programs from the Codechef database.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import pickle
from collections import deque
import multiprocessing as mp
import editdistance
import os
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from tqdm import tqdm
# Number of programs kept in the final sample. The loader first draws three times
# this many rows from solutions.csv, then samples down to this size after merging.
samplesize = 1000
def wrapper(args):
    """Return the edit distance between the two sequences packed in *args*.

    Takes a single 2-tuple so it can be handed directly to the
    multiprocessing pool's map-style methods, which pass one argument per task.
    """
    first, second = args
    distance = editdistance.eval(first, second)
    return distance
# Rebuild the sample and the distance matrix only when no cached 'distmatrix.csv' exists.
# NOTE(review): leading indentation is missing in this copy of the file, so the extent of
# this `if` body cannot be read from the source. Presumably it runs through the
# `distmatrix.to_csv('distmatrix.csv', ...)` call -- confirm before re-indenting.
if not os.path.exists('distmatrix.csv'):
# ----------------------------------------------------------------------------------------
# Read the dataset
# ----------------------------------------------------------------------------------------
# Load only the four metadata columns that are used below.
sols = pd.read_csv('solutions.csv', usecols=['QCode', 'SolutionID', 'Status', 'Language'])
sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna() # Only C
# Keep only submissions whose status is 'accepted' or 'wrong answer'.
sols = sols.loc[(sols.Status == 'accepted') | (sols.Status == 'wrong answer')]
# Oversample here; the final sample of `samplesize` rows is drawn after the merge.
sols = sols.dropna().sample(samplesize*3) # Thrice the sample size we need
#sols.Status = (sols.Status == 'accepted').astype(int)
# Three CSV files under code/ are read and stacked; presumably they hold the program
# text (the `Solutions` column used later) keyed by SolutionID -- verify against the data.
f, s, t = [pd.read_csv('code/' + i + '.csv') for i in ['first', 'second', 'third']]
print('Concatenating solutions')
# The individual frames are deleted right after concatenation to free memory.
c = pd.concat([f, s, t]); del(f); del(s); del(t)
print('Merging with dataset')
# Left merge: every sampled metadata row is kept, even when no code row matches it.
df = sols.merge(c, how='left', on='SolutionID'); del(c); del(sols)
print(df.info())
# ----------------------------------------------------------------------------------------
# Create keyword structure
# ----------------------------------------------------------------------------------------
def to_keywords_only(code):
    """Reduce C source text to a skeleton of keywords, digits and punctuation.

    Every digit and punctuation character becomes its own token; identifiers
    and any other words are dropped. The surviving tokens are joined with
    single spaces, in their original order.

    Args:
        code: Program text. Any non-string value (e.g. NaN for a missing
            solution) is treated as "no code".

    Returns:
        The space-separated token string, or '' when *code* is not a string.
    """
    # Listed explicitly rather than parsed out of a multi-line string, so that
    # line breaks in the literal cannot end up glued to a keyword (previously
    # 'for' and 'union' were never matched for that reason).
    c_keys = frozenset((
        'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
        'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if',
        'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof',
        'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void',
        'volatile', 'while',
    ))
    # Same character set as before; the backslash is now escaped explicitly.
    special = '`1234567890-=+_)(*&^%$#@!~[]{}\\|";:/?.>,<' + "'"
    if not isinstance(code, str):
        return ''
    # Surround every special character with spaces in a single pass so it
    # splits off as a separate token.
    pad_table = str.maketrans({ch: ' {} '.format(ch) for ch in special})
    padded = code.translate(pad_table)
    # Built once per call instead of once per word.
    allowed = c_keys.union(special)
    # split() with no argument breaks on any whitespace run, so tabs and
    # carriage returns no longer stick to the neighbouring keyword.
    return ' '.join(word for word in padded.split() if word in allowed)
# Reduce every program to its keyword/punctuation skeleton, showing a progress bar.
tqdm.pandas(desc='applying_structure', ncols=100)
df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
print(df.info())
# ----------------------------------------------------------------------------------------
# Take a sample from the dataset and calculate distance matrix
# ----------------------------------------------------------------------------------------
print('Taking {} programs as sample'.format(samplesize))
sample = df.dropna().sample(samplesize).dropna().copy()
sample.to_csv('sample.csv', index=False)
del(df) # We no longer need df
print('Calculating Distance Matrix')
structures = sample.Structure.values
# Every ordered pair (s1, s2), with s1 varying slowest.
args = ((s1, s2) for s1 in structures for s2 in structures)
# NOTE(review): creating a Pool at module level needs an `if __name__ == '__main__':`
# guard on platforms that use the 'spawn' start method -- confirm the target platform.
with mp.Pool() as pool:
    # Ordered imap, not imap_unordered: the distances are matched to the (s1, s2)
    # labels below purely by position, so they must come back in submission order.
    # chunksize batches the many tiny tasks to cut inter-process overhead.
    work = pool.imap(wrapper, args, chunksize=1000)
    distances = list(tqdm(work, ncols=100, total=len(sample)**2))
print('Saving distmatrix to disk')
solution_ids = sample.SolutionID.values
pair_count = len(solution_ids)
# repeat/tile reproduce the same pair order as the nested generator above:
# s1 is the outer (slow) index, s2 the inner (fast) index.
distmatrix = pd.DataFrame({'s1': np.repeat(solution_ids, pair_count),
                           's2': np.tile(solution_ids, pair_count),
                           'distance': distances})
# Numeric form of the ids: drop the first character and parse the rest as an integer.
distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
distmatrix.to_csv('distmatrix.csv', index=False)
# Reload both tables from disk so the following steps work on the persisted CSV contents.
sample, distmatrix = (pd.read_csv(path) for path in ('sample.csv', 'distmatrix.csv'))
print('Done')
# ----------------------------------------------------------------------------------------
# Calculate graph positions on x, y plane
# ----------------------------------------------------------------------------------------
print(distmatrix.info())
# The graph gets one node per distinct numeric solution id found in the s1n column;
# its layout later supplies the x, y position of each program.
print('Making graph')
G = nx.Graph()
G.add_nodes_from(set(distmatrix['s1n']))
def gen_edges(distmatrix):
    """Yield one weighted edge per row of the pairwise-distance table.

    Args:
        distmatrix: DataFrame with at least the columns 's1n', 's2n'
            (numeric ids of the two programs) and 'distance'.

    Yields:
        ``(s1, s2, {'weight': distance})`` tuples in row order, in the form
        accepted by ``Graph.add_edges_from``.
    """
    # Select the columns by name and unpack them in that same order. The
    # previous positional indexing read them as (weight, s1, s2), which linked
    # s2n to the distance value and used s1n as the weight.
    columns = distmatrix[['s1n', 's2n', 'distance']]
    for s1, s2, weight in columns.itertuples(index=False, name=None):
        yield (s1, s2, {'weight': weight})
G.add_edges_from(gen_edges(distmatrix[['s1n', 's2n', 'distance']]))
print('Calculating X, Y positions')
# NOTE(review): networkx documents the spring_layout edge weight as attraction strength
# (larger pulls nodes closer); confirm that feeding it a distance is what is intended.
pos = dict(nx.spring_layout(G))
# Match each sampled program to its layout position through the numeric part of its id.
sample['solidno'] = sample.SolutionID.str[1:].astype(int)
sample['xy'] = sample.solidno.map(pos)
# Split the temporary position pair into separate float columns, then discard it.
for axis, column in enumerate(('x', 'y')):
    sample[column] = sample.xy.str[axis].astype(float)
sample = sample.drop('xy', axis=1)
print(sample.info())
# Persist the coordinates and continue from the round-tripped CSV.
sample.to_csv('sample.csv', index=False)
sample = pd.read_csv('sample.csv')
# ----------------------------------------------------------------------------------------
# Plot the figure
# ----------------------------------------------------------------------------------------
print('Plotting figure')
size = 7
# Both scatter plots share these options; only the per-Status column split differs.
# NOTE(review): positional x/y and the `size` keyword depend on the installed seaborn
# version -- verify against the version this script is run with.
shared = dict(data=sample, fit_reg=False, hue='QCode', size=size, legend=False)
sns.lmplot('x', 'y', col='Status', **shared)
plt.savefig("split.png")  # one panel per Status value
sns.lmplot('x', 'y', **shared)
plt.savefig("single.png")  # all programs in a single panel
print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.