Skip to content

Instantly share code, notes, and snippets.

@lolipopshock
Last active July 12, 2020 17:18
Show Gist options
  • Save lolipopshock/af06c2af05acf773ff90811b62bc7fe1 to your computer and use it in GitHub Desktop.
Save lolipopshock/af06c2af05acf773ff90811b62bc7fe1 to your computer and use it in GitHub Desktop.
[CV Paper Info Table] Download Paper Information from thecvf Website
"""Download the paper information from thecvf website and save them into csv files
Examples:
>> python download_cvf_paper.py --save_path 'save'
Saved Files:
* cvf_all.csv
both main-conference papers and workshop papers listed on the thecvf website
* cvf_papers.csv
paper information listed on the thecvf website
* cvf_workshops.csv
workshop information listed on the thecvf website
* table features
* name
name of the paper
* authors
authors string separated with ', '
* pdf_link
the URL of the pdf file of the associated paper
* supp_link
the URL of the pdf file of the associated paper supplementary materials
* bibref
the bibref of the given paper
* conf_name
conference name of the paper. For main conference papers, it looks like 'CVPR2020'.
For workshop papers, it looks like 'CVPR2020_workshops-Skin Image Analysis', where
the conference name and the workshop name are concatenated with '-'.
"""
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re, os
import pandas as pd
import argparse
cvf_base = 'https://openaccess.thecvf.com'
def fetch_meta_table():
    """Scrape the menu page into a table of conference page links.

    Requests ``{cvf_base}/menu``, locates the first ``<dl>`` element and, for
    every ``<dd>`` found inside it, gathers the ``href`` of each ``<a>`` tag
    that entry contains.

    Returns:
        pandas.DataFrame: one row per ``<dd>`` entry, with the collected
        links placed under the columns ``main`` and ``workshop``.
    """
    response = requests.get(f'{cvf_base}/menu')
    menu_soup = BeautifulSoup(response.content, "lxml")
    link_rows = [
        [anchor['href'] for anchor in entry.find_all('a')]
        for entry in menu_soup.find('dl').find_all('dd')
    ]
    return pd.DataFrame(link_rows, columns=['main', 'workshop'])
def fetch_conf(df_conf):
    """Download one main-conference page and tabulate its papers.

    Args:
        df_conf: A row of the meta table. Its ``'main'`` entry is used as the
            page path below ``cvf_base``.

    Returns:
        pandas.DataFrame: the records produced by
        ``fetech_paper_from_conf_page`` for that page, plus a ``conf_name``
        column holding the ``'main'`` entry with ``'.py'`` removed.
    """
    page_path = df_conf['main']
    label = page_path.replace('.py', '')

    response = requests.get(f'{cvf_base}/{page_path}')
    soup = BeautifulSoup(response.content, "lxml")

    papers = pd.DataFrame(fetech_paper_from_conf_page(soup))
    papers['conf_name'] = label
    return papers
def fetech_paper_from_conf_page(conf_page):
    """Extract one record per paper from a parsed conference or workshop page.

    Every element with class ``ptitle`` is treated as a paper title. Its next
    sibling is read as the author list, and the sibling after that as the
    metadata element that holds the links and the ``bibref`` entry.

    Args:
        conf_page: Parsed page object (as returned by ``BeautifulSoup``) to
            search for paper entries.

    Returns:
        list[dict]: one dict per paper with the keys ``name``, ``authors``
        (anchor texts joined with ``', '``), ``pdf_link``, ``supp_link`` and
        ``bibref``. A link or bibref that is absent from the page is stored
        as an empty string instead of aborting the whole scrape.
    """
    all_paper_info = []
    for paper in conf_page.find_all(class_='ptitle'):
        paper_author = paper.find_next_sibling()
        paper_meta = paper_author.find_next_sibling()

        # Only the first two anchors matter: the first is stored as the pdf
        # link and the second as the supplementary link.
        hrefs = [link.get('href', '') for link in paper_meta.find_all('a')[:2]]
        # Pad to two entries so a paper with fewer anchors yields '' rather
        # than raising IndexError.
        pdf_link, supp_link = (hrefs + ['', ''])[:2]

        bibref = paper_meta.find(class_='bibref')

        all_paper_info.append(dict(
            name=paper.text,
            authors=', '.join(
                author.text.strip() for author in paper_author.find_all('a')
            ),
            pdf_link=pdf_link,
            supp_link=supp_link,
            # find() returns None when the entry carries no bibref element.
            bibref=bibref.text.strip() if bibref is not None else '',
        ))
    return all_paper_info
def fetech_workshop_table(df_conf):
    """List the workshops linked from a conference's workshop menu page.

    Args:
        df_conf: A row of the meta table. Its ``'workshop'`` entry is used as
            the page path below ``cvf_base``.

    Returns:
        pandas.DataFrame: columns ``workshop_name`` and ``url`` (text and
        ``href`` of the first anchor in each ``<dd>``, excluding the entry
        whose text is ``back``), plus ``conf_name``, which is the workshop
        path with ``'/menu.py'`` and any remaining ``'/'`` removed.
    """
    conf_suffix = df_conf['workshop']
    conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml")

    workshop_table = []
    for ele in conf_page.find_all('dd'):
        workshop_info = ele.find('a')
        if workshop_info is None:
            # find() returns None for a <dd> without any link; there is no
            # workshop page to follow for such an entry.
            continue
        title = workshop_info.text.strip()
        # The navigation link back to the parent menu is not a workshop.
        if title.lower() != 'back':
            workshop_table.append([title, workshop_info.get('href', '')])

    df = pd.DataFrame(workshop_table, columns=['workshop_name', 'url'])
    df['conf_name'] = conf_suffix.replace('/menu.py', '').replace('/', '')
    return df
def fetch_workshop(df_workshop):
    """Download a single workshop page and tabulate its papers.

    Args:
        df_workshop: A row of the workshop table, providing the entries
            ``conf_name``, ``workshop_name`` and ``url``.

    Returns:
        pandas.DataFrame: the records produced by
        ``fetech_paper_from_conf_page`` for the workshop page, plus a
        ``conf_name`` column set to ``'<conf_name>-<workshop_name>'``.
    """
    parent_conf = df_workshop['conf_name']
    label = parent_conf + '-' + df_workshop['workshop_name']
    # The workshop URL is appended to the conference folder name.
    page_path = parent_conf + '/' + df_workshop['url']

    response = requests.get(f'{cvf_base}/{page_path}')
    soup = BeautifulSoup(response.content, "lxml")

    papers = pd.DataFrame(fetech_paper_from_conf_page(soup))
    papers['conf_name'] = label
    return papers
def fetch_workshops_for_conf(workshop_table):
    """Fetch the papers of every workshop listed in ``workshop_table``.

    Args:
        workshop_table: DataFrame whose rows are each passed to
            ``fetch_workshop``.

    Returns:
        pandas.DataFrame: all per-workshop paper tables stacked row-wise with
        a fresh 0..n-1 index. When ``workshop_table`` has no rows, an empty
        DataFrame with the paper columns is returned.
    """
    all_workshops = [
        fetch_workshop(df_workshop) for _, df_workshop in workshop_table.iterrows()
    ]
    if not all_workshops:
        # pd.concat raises ValueError on an empty list, which would abort the
        # whole run for a conference that lists no workshops.
        return pd.DataFrame(
            columns=['name', 'authors', 'pdf_link', 'supp_link', 'bibref', 'conf_name']
        )
    return pd.concat(all_workshops, axis=0).reset_index(drop=True)
# Command-line interface: the only option is the output folder.
parser = argparse.ArgumentParser()
parser.add_argument('--save_path', type=str, default='.', help='The folder to save all the downloaded files, default to the current directory')

if __name__ == "__main__":
    args = parser.parse_args()
    meta_table = fetch_meta_table()

    paper_dfs = []
    workshop_dfs = []
    # iterrows() is a generator without a length, so the total is passed
    # explicitly to let tqdm display real progress.
    for _, df_conf in tqdm(meta_table.iterrows(), total=len(meta_table)):
        paper_df = fetch_conf(df_conf)
        workshop_table = fetech_workshop_table(df_conf)
        workshop_df = fetch_workshops_for_conf(workshop_table)
        paper_dfs.append(paper_df)
        workshop_dfs.append(workshop_df)

    # exist_ok=True already covers the "folder exists" case.
    os.makedirs(args.save_path, exist_ok=True)

    workshop_df = pd.concat(workshop_dfs)
    # Strip the '../' segments from workshop links. Both link columns come
    # from the same pages, so they are normalised the same way.
    for link_column in ('pdf_link', 'supp_link'):
        workshop_df[link_column] = workshop_df[link_column].str.replace('../', '', regex=False)
    workshop_df.to_csv(os.path.join(args.save_path, 'cvf_workshops.csv'), index=False)

    paper_df = pd.concat(paper_dfs)
    paper_df.to_csv(os.path.join(args.save_path, 'cvf_papers.csv'), index=False)

    # Combined table: main-conference papers followed by workshop papers.
    pd.concat([paper_df, workshop_df]).to_csv(os.path.join(args.save_path, 'cvf_all.csv'), index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment