Last active
July 12, 2020 17:18
-
-
Save lolipopshock/af06c2af05acf773ff90811b62bc7fe1 to your computer and use it in GitHub Desktop.
[CV Paper Info Table] Download Paper Information from thecvf Website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download the paper information from thecvf website and save them into csv files | |
Examples: | |
>> python download_cvf_paper.py --save_path 'save' | |
Saved Files: | |
* cvf_all.csv | |
both paper and workshop papers listed on the thecvf website | |
* cvf_papers.csv | |
paper information listed on the thecvf website | |
* cvf_workshops.csv | |
workshop information listed on the thecvf website | |
* table features | |
* name | |
name of the paper | |
* authors | |
authors string separated with ', ' | |
* pdf_link | |
the URL of the pdf file of the associated paper | |
* supp_link | |
the URL of the pdf file of the associated paper supplementary materials | |
* bibref | |
the bibref of the given paper | |
* conf_name | |
conference name of the paper. For main conf papers, it's like 'CVPR2020'. For | |
workshop papers, it's like 'CVPR2020_workshops-Skin Image Analysis', where the | |
conference name and workshop name is concatenated with '-'. | |
""" | |
from bs4 import BeautifulSoup | |
import requests | |
from tqdm import tqdm | |
import re, os | |
import pandas as pd | |
import argparse | |
cvf_base = 'https://openaccess.thecvf.com' | |
def fetch_meta_table(): | |
cvf_page = BeautifulSoup(requests.get(f'{cvf_base}/menu').content, "lxml") | |
table = [] | |
for ele in cvf_page.find('dl').findChildren('dd'): | |
table.append(list(map(lambda x: x['href'], ele.find_all('a')))) | |
return pd.DataFrame(table, columns=['main', 'workshop']) | |
def fetch_conf(df_conf): | |
conf_name = df_conf['main'].replace('.py', '') | |
conf_suffix = df_conf['main'] | |
conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml") | |
paper_infos = fetech_paper_from_conf_page(conf_page) | |
df = pd.DataFrame(paper_infos) | |
df['conf_name'] = conf_name | |
return df | |
def fetech_paper_from_conf_page(conf_page): | |
papers = conf_page.find_all(class_='ptitle') | |
all_paper_info = [] | |
for paper in papers: | |
paper_author = paper.find_next_sibling() | |
paper_meta = paper_author.find_next_sibling() | |
paper_links = paper_meta.find_all('a')[:2] | |
paper_info = dict( | |
name = paper.text, | |
authors = ', '.join(map(lambda x: x.text.strip(), paper_author.find_all('a'))), | |
pdf_link = paper_links[0]['href'], | |
supp_link = paper_links[1].get('href', ''), | |
bibref = paper_meta.find(class_='bibref').text.strip() | |
) | |
all_paper_info.append(paper_info) | |
return all_paper_info | |
def fetech_workshop_table(df_conf): | |
conf_name = df_conf['main'].replace('.py', '') | |
conf_suffix = df_conf['workshop'] | |
conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml") | |
workshop_table = [] | |
for ele in conf_page.find_all('dd'): | |
workshop_info = ele.find('a') | |
if workshop_info.text.strip().lower() != 'back': | |
workshop_table.append([ | |
workshop_info.text.strip(), workshop_info.get('href', '')]) | |
df = pd.DataFrame(workshop_table, columns=['workshop_name', 'url']) | |
df['conf_name'] = conf_suffix.replace('/menu.py', '').replace('/', '') | |
return df | |
def fetch_workshop(df_workshop): | |
conf_name = df_workshop['conf_name'] | |
workshop_name = conf_name + '-' + df_workshop['workshop_name'] | |
workshop_suffix = conf_name + '/' + df_workshop['url'] | |
workshop_page = BeautifulSoup(requests.get(f'{cvf_base}/{workshop_suffix}').content, "lxml") | |
paper_infos = fetech_paper_from_conf_page(workshop_page) | |
df = pd.DataFrame(paper_infos) | |
df['conf_name'] = workshop_name | |
return df | |
def fetch_workshops_for_conf(workshop_table): | |
all_workshops = [] | |
for _, df_workshop in workshop_table.iterrows(): | |
all_workshops.append(fetch_workshop(df_workshop)) | |
return pd.concat(all_workshops, axis=0).reset_index(drop=True) | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--save_path', type=str, default='.', help='The folder to save all the downloaded files, default to the current directory') | |
if __name__ == "__main__": | |
args = parser.parse_args() | |
meta_table = fetch_meta_table() | |
paper_dfs = [] | |
workshop_dfs = [] | |
for _, df_conf in tqdm(meta_table.iterrows()): | |
paper_df = fetch_conf(df_conf) | |
workshop_table = fetech_workshop_table(df_conf) | |
workshop_df = fetch_workshops_for_conf(workshop_table) | |
paper_dfs.append(paper_df) | |
workshop_dfs.append(workshop_df) | |
if not os.path.exists(args.save_path): | |
os.makedirs(args.save_path, exist_ok=True) | |
workshop_df = pd.concat(workshop_dfs) | |
workshop_df['pdf_link'] = workshop_df['pdf_link'].str.replace('../', '', regex=False) | |
workshop_df.to_csv(os.path.join(args.save_path, 'cvf_workshops.csv'), index=False) | |
paper_df = pd.concat(paper_dfs) | |
paper_df.to_csv(os.path.join(args.save_path, 'cvf_papers.csv'), index=False) | |
pd.concat([paper_df, workshop_df]).to_csv(os.path.join(args.save_path, 'cvf_all.csv'), index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment