Skip to content

Instantly share code, notes, and snippets.

@theSage21
Created December 9, 2016 01:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theSage21/580511162116e1dfaefc2db4bc047e69 to your computer and use it in GitHub Desktop.
Save theSage21/580511162116e1dfaefc2db4bc047e69 to your computer and use it in GitHub Desktop.
Visualizing C programs from the CodeChef database.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import pickle
from collections import deque
import multiprocessing as mp
import editdistance
import os
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from tqdm import tqdm
# Number of programs drawn for the final sample whose pairwise distances are
# computed; three times as many solution rows are drawn up front before the
# merge with the code tables.
samplesize = 1000
def wrapper(args):
    """Return the edit distance between the two items packed in ``args``.

    ``args`` is a single ``(s1, s2)`` pair so the function can be handed
    directly to a pool's map-style methods, which pass one item per call.
    """
    first, second = args
    return editdistance.eval(first, second)
if not os.path.exists('distmatrix.csv'):
    # ------------------------------------------------------------------------------------
    # Read the dataset
    # ------------------------------------------------------------------------------------
    sols = pd.read_csv(
        'solutions.csv',
        usecols=['QCode', 'SolutionID', 'Status', 'Language'],
    )
    # Only C submissions; the Language column is constant after this filter.
    sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna()
    # Keep the two verdicts of interest.
    sols = sols.loc[sols.Status.isin(['accepted', 'wrong answer'])]
    # Thrice the sample size we need
    sols = sols.dropna().sample(samplesize * 3)

    code_parts = [
        pd.read_csv('code/' + part + '.csv')
        for part in ['first', 'second', 'third']
    ]
    print('Concatenating solutions')
    c = pd.concat(code_parts)
    del code_parts  # release the three partial tables

    print('Merging with dataset')
    df = sols.merge(c, how='left', on='SolutionID')
    del c, sols
    print(df.info())
# C keywords kept by to_keywords_only. The list is written as adjacent string
# literals so that no line break can end up inside a keyword (a multi-line
# literal previously left '\nfor' and '\nunion' in the list, which never
# matched anything).
_C_KEYWORDS = frozenset((
    'auto,break,case,char,const,continue,default,do,double,else,enum,extern,float,'
    'for,goto,if,int,long,register,return,short,signed,sizeof,static,struct,switch,typedef,'
    'union,unsigned,void,volatile,while'
).split(','))

# Digits and punctuation that are kept as single-character tokens.
_SPECIAL_CHARS = '`1234567890-=+_)(*&^%$#@!~[]{}\\|";:/?.>,<' + "'"

# Translation table that surrounds every special character with spaces so it
# becomes a token of its own; one pass instead of one replace() per character.
_PAD_SPECIALS = str.maketrans({ch: ' {} '.format(ch) for ch in _SPECIAL_CHARS})

# Set of every token that survives the filter (O(1) membership test).
_KEPT_TOKENS = _C_KEYWORDS | frozenset(_SPECIAL_CHARS)


def to_keywords_only(code):
    """Reduce C source text to its keywords, digits and punctuation.

    Every special character is isolated as its own token, the text is split
    on whitespace, and only C keywords and special characters are kept, in
    their original order, joined by single spaces.

    Tokens are split on any whitespace (spaces, tabs, newlines, carriage
    returns), so keywords on tab-indented lines are recognised too.

    Parameters
    ----------
    code : object
        The program text. Anything that is not a ``str`` (for example the
        NaN pandas uses for a missing value) yields an empty string.

    Returns
    -------
    str
        The space-separated structural tokens, or ``''`` for non-string input.
    """
    if not isinstance(code, str):
        return ''
    tokens = code.translate(_PAD_SPECIALS).split()
    return ' '.join(tok for tok in tokens if tok in _KEPT_TOKENS)
# Same cache check as the dataset-loading step: the expensive work below only
# runs when no distance matrix has been written to disk yet.
if not os.path.exists('distmatrix.csv'):
    tqdm.pandas(desc='applying_structure', ncols=100)
    df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
    print(df.info())
    # ------------------------------------------------------------------------------------
    # Take a sample from the dataset and calculate distance matrix
    # ------------------------------------------------------------------------------------
    complete = df.dropna()
    # sample() raises when asked for more rows than exist, so cap the request
    # at the number of rows that survived the dropna.
    n_take = min(samplesize, len(complete))
    print('Taking {} programs as sample'.format(n_take))
    sample = complete.sample(n_take).copy()
    sample.to_csv('sample.csv', index=False)
    del df, complete  # We no longer need df

    print('Calculating Distance Matrix')
    structures = sample.Structure.values
    pairs = ((s1, s2) for s1 in structures for s2 in structures)
    # NOTE(review): this pool is created at module level. Where multiprocessing
    # uses the 'spawn' start method the workers re-import this script, so the
    # script body should eventually move behind `if __name__ == '__main__':`.
    with mp.Pool() as pool:
        # imap (not imap_unordered) returns results in submission order, which
        # is required because the distances are matched to the (s1, s2) pairs
        # purely by position below. chunksize cuts the per-task IPC overhead.
        work = pool.imap(wrapper, pairs, chunksize=1024)
        distances = list(tqdm(work, ncols=100, total=len(structures) ** 2))

    print('Saving distmatrix to disk')
    ids = np.asarray(sample.SolutionID)
    # repeat/tile reproduce the row-major order of the nested pair generator:
    # s1 changes slowest, s2 fastest.
    distmatrix = pd.DataFrame({'s1': np.repeat(ids, len(ids)),
                               's2': np.tile(ids, len(ids)),
                               'distance': distances})
    # Numeric ids: the SolutionID with its first character removed.
    distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
    distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
    distmatrix.to_csv('distmatrix.csv', index=False)

# Always continue from the on-disk copies, whether freshly written or cached.
sample = pd.read_csv('sample.csv')
distmatrix = pd.read_csv('distmatrix.csv')
print('Done')
# ----------------------------------------------------------------------------------------
# Calculate graph positions on x, y plane
# ----------------------------------------------------------------------------------------
print(distmatrix.info())
# Now we locate x, y locations for the points on the graph.
print('Making graph')
G = nx.Graph()
# One node per distinct numeric solution id found in the first pair column.
node_ids = list(set(distmatrix['s1n']))
G.add_nodes_from(node_ids)
def gen_edges(distmatrix):
    """Yield one weighted edge per row of the pairwise distance table.

    Parameters
    ----------
    distmatrix : pandas.DataFrame
        Must contain the columns ``s1n``, ``s2n`` and ``distance``; any other
        columns are ignored.

    Yields
    ------
    tuple
        ``(s1n, s2n, {'weight': distance})`` for each row, in row order.
    """
    columns = distmatrix[['s1n', 's2n', 'distance']]
    # Unpack in the same order the columns were selected. Reading them as
    # (weight, s1, s2) used to connect s2n to the distance value instead of
    # connecting the two solutions.
    for s1, s2, weight in columns.itertuples(index=False, name=None):
        yield (s1, s2, {'weight': weight})
edge_columns = distmatrix[['s1n','s2n','distance']]
G.add_edges_from(gen_edges(edge_columns))
print('Calculating X, Y positions')
# NOTE(review): spring_layout treats a larger edge weight as a stronger pull,
# while the weights here are edit distances - confirm this is the intended
# direction (a similarity weight may be what is wanted).
pos = dict(nx.spring_layout(G))
# Numeric id (SolutionID minus its first character) to look up each position.
sample['solidno'] = sample.SolutionID.str[1:].astype(int)
coords = sample.solidno.map(pos)
sample['x'] = coords.str[0].astype(float)
sample['y'] = coords.str[1].astype(float)
print(sample.info())
# Persist the coordinates and continue from the on-disk copy.
sample.to_csv('sample.csv', index=False)
sample = pd.read_csv('sample.csv')
# ----------------------------------------------------------------------------------------
# Plot the figure
# ----------------------------------------------------------------------------------------
print('Plotting figure')
size = 7
# x/y are passed by keyword and the facet size as `height`: current seaborn
# rejects positional x/y and the old `size` argument (needs seaborn >= 0.9).
# First figure: one panel per Status value, points coloured by QCode.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size, legend=False, col='Status')
plt.savefig("split.png") # save as png
# Second figure: all points in a single panel.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', height=size, legend=False)
plt.savefig("single.png") # save as png
print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment