Skip to content

Instantly share code, notes, and snippets.

@jponttuset
Created September 7, 2018 08:38
Show Gist options
  • Save jponttuset/133c6450f2427ed45bf47591ab03bc72 to your computer and use it in GitHub Desktop.
Save jponttuset/133c6450f2427ed45bf47591ab03bc72 to your computer and use it in GitHub Desktop.
Scrape Paper Titles from CVF Open Access and Plot Evolution of GANs vs Deep in XKCD Style
import matplotlib.pyplot as plt
def get_percent_from_files(confs, keywords):
    """Return, per conference, the percentage of titles mentioning a keyword.

    For each name in ``confs`` the file ``titles/<name>.txt`` is read, one
    title per line. A title is counted once if its lower-cased text contains
    any keyword followed by a space, colon or comma, or preceded by a space.

    Args:
        confs: Conference names; each is the stem of a file under ``titles/``.
        keywords: Lower-case keywords to search for in every title.

    Returns:
        A list of floats in [0, 100], one per entry of ``confs``, in order.
        A conference whose file contains no lines yields 0.0.

    Raises:
        IOError/OSError: if a title file cannot be opened.
    """
    percents = []
    for conf in confs:
        # Context manager closes the file even if reading fails.
        with open('titles/' + conf + '.txt', 'r') as title_file:
            all_titles = [line.rstrip('\n') for line in title_file]

        # An empty file would otherwise divide by zero below.
        if not all_titles:
            percents.append(0.0)
            continue

        count = 0
        for title in all_titles:
            lowered = title.lower()
            # NOTE: this is plain substring matching, so e.g. 'gan ' also
            # matches inside 'organ '. Kept as-is to preserve the results.
            if any(
                kword + ' ' in lowered
                or ' ' + kword in lowered
                or kword + ':' in lowered
                or kword + ',' in lowered
                for kword in keywords
            ):
                count += 1
        percents.append(count / float(len(all_titles)) * 100)
    return percents
# Conferences to plot, in chronological order; each name is the stem of a
# title file read by get_percent_from_files.
conferences = [
    "CVPR2013", "ICCV2013", "CVPR2014", "ECCV2014",
    "CVPR2015", "ICCV2015", "CVPR2016", "ECCV2016",
    "CVPR2017", "ICCV2017", "CVPR2018", "ECCV2018",
]
xval = range(0, len(conferences))

## GAN vs deep
# Set the style to XKCD
plt.xkcd()
plt.figure(figsize=(6, 4))

# Plot the percents: one line per topic, one point per conference
plt.plot(
    xval,
    get_percent_from_files(
        conferences,
        ['deep', 'cnn', 'cnns', 'convolutional', 'neural network', 'neural networks'],
    ),
    marker='o',
    label="Deep",
)
plt.plot(
    xval,
    get_percent_from_files(conferences, ['adversarial', 'adversarially', 'gans', 'gan']),
    marker='o',
    label="GAN",
)
plt.plot(
    xval,
    get_percent_from_files(
        conferences,
        ['lstm', 'lstms', 'rnn', 'rnns', 'polygon-rnn', 'recurrent'],
    ),
    marker='o',
    label="LSTM",
)

# Annotate and fine-tune
plt.title("Deep vs GAN", fontsize=13)
plt.legend(loc='upper left', fontsize=12)
# Tick labels drop the first "20" of the year, e.g. "CVPR2013" -> "CVPR13"
plt.xticks(xval, [conf.replace("20", "", 1) for conf in conferences])

# Fine-tune the axis
ax = plt.gca()
ax.set_axisbelow(True)
ax.set_xlim([-0.1, len(conferences) - 0.9])
ax.set_ylim([-0.5, 25])
# Use Tick.label1 rather than the deprecated Tick.label alias, which recent
# Matplotlib releases no longer provide; label1 works on old versions too.
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(12)
    tick.label1.set_rotation(20)
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(12)
plt.subplots_adjust(bottom=0.12)
plt.ylabel("Percentage of papers (%)", fontsize=12)

# Save
plt.savefig('deep_vs_gan_evolution.png', dpi=200)
plt.close()
import requests
from lxml import html
# Conference whose paper titles are scraped from CVF Open Access.
conference = "ECCV2018"

# Get the HTML text and find the classes of type 'ptitle'
response = requests.get(
    "http://openaccess.thecvf.com/" + conference + ".py",
    timeout=30,  # do not hang forever on an unresponsive server
)
# Fail loudly on HTTP errors instead of writing an empty title file.
response.raise_for_status()
tree = html.fromstring(response.text)
papers = tree.find_class('ptitle')

# Get all titles in a list
all_titles = []
for paper in papers:
    title = paper.xpath('a/text()')
    # Skip 'ptitle' elements without anchor text rather than raising IndexError.
    if title:
        all_titles.append(title[0])

# Print to file, one title per line.
# NOTE: get_percent_from_files reads 'titles/<conference>.txt', so the file
# written here has to be moved into a 'titles/' directory before plotting.
with open(conference + '.txt', 'w') as f:
    for title in all_titles:
        # Escape non-ASCII characters, then decode back to text so the
        # concatenation with '\n' is valid on Python 3 (bytes + str fails).
        ascii_title = title.encode('ascii', errors='backslashreplace').decode('ascii')
        f.write(ascii_title + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment