lolipopshock/download_cvf_paper_info.py

## download_cvf_paper_info.py
"""Download the paper information from thecvf website and save them into csv files

Examples:
    >> python download_cvf_paper.py --save_path 'save'

Saved Files:

    * cvf_all.csv
        both paper and workshop papers listed on the thecvf website
    * cvf_papers.csv
        paper information listed on the thecvf website
    * cvf_workshops.csv
        workshop information listed on the thecvf website
    * table features
        * name
            name of the paper
        * authors
            authors string separated with ', '
        * pdf_link
            the URL of the pdf file of the associated paper
        * supp_link
            the URL of the pdf file of the associated paper supplementary materials
        * bibref
            the bibref of the given paper
        * conf_name
            conference name of the paper. For main conf papers, it's like 'CVPR2020'. For
            workshop papers, it's like 'CVPR2020_workshops-Skin Image Analysis', where the
            conference name and workshop name is concatenated with '-'.
"""

from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re, os
import pandas as pd
import argparse

cvf_base = 'https://openaccess.thecvf.com'

def fetch_meta_table():
    cvf_page = BeautifulSoup(requests.get(f'{cvf_base}/menu').content, "lxml")
    table = []
    for ele in cvf_page.find('dl').findChildren('dd'):
        table.append(list(map(lambda x: x['href'], ele.find_all('a'))))

    return pd.DataFrame(table, columns=['main', 'workshop'])

def fetch_conf(df_conf):
    conf_name = df_conf['main'].replace('.py', '')
    conf_suffix = df_conf['main']
    conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml")

    paper_infos = fetech_paper_from_conf_page(conf_page)
    df = pd.DataFrame(paper_infos)
    df['conf_name'] = conf_name
    return df

def fetech_paper_from_conf_page(conf_page):
    papers = conf_page.find_all(class_='ptitle')

    all_paper_info = []
    for paper in papers:
        paper_author = paper.find_next_sibling()
        paper_meta = paper_author.find_next_sibling()
        paper_links = paper_meta.find_all('a')[:2]
        paper_info = dict(
            name = paper.text,
            authors = ', '.join(map(lambda x: x.text.strip(), paper_author.find_all('a'))),
            pdf_link = paper_links[0]['href'],
            supp_link = paper_links[1].get('href', ''),
            bibref = paper_meta.find(class_='bibref').text.strip()
        )
        all_paper_info.append(paper_info)

    return all_paper_info

def fetech_workshop_table(df_conf):
    conf_name = df_conf['main'].replace('.py', '')
    conf_suffix = df_conf['workshop']
    conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml")

    workshop_table = []
    for ele in conf_page.find_all('dd'):
        workshop_info = ele.find('a')
        if workshop_info.text.strip().lower() != 'back':
            workshop_table.append([
                workshop_info.text.strip(), workshop_info.get('href', '')])

    df = pd.DataFrame(workshop_table, columns=['workshop_name', 'url'])
    df['conf_name'] = conf_suffix.replace('/menu.py', '').replace('/', '')
    return df

def fetch_workshop(df_workshop):
    conf_name = df_workshop['conf_name']
    workshop_name = conf_name + '-' + df_workshop['workshop_name']
    workshop_suffix = conf_name + '/' + df_workshop['url']
    workshop_page = BeautifulSoup(requests.get(f'{cvf_base}/{workshop_suffix}').content, "lxml")

    paper_infos = fetech_paper_from_conf_page(workshop_page)
    df = pd.DataFrame(paper_infos)
    df['conf_name'] = workshop_name
    return df

def fetch_workshops_for_conf(workshop_table):

    all_workshops = []
    for _, df_workshop in workshop_table.iterrows():
        all_workshops.append(fetch_workshop(df_workshop))

    return pd.concat(all_workshops, axis=0).reset_index(drop=True)


parser = argparse.ArgumentParser()
parser.add_argument('--save_path', type=str, default='.', help='The folder to save all the downloaded files, default to the current directory')


if __name__ == "__main__":
    args = parser.parse_args()

    meta_table = fetch_meta_table()

    paper_dfs = []
    workshop_dfs = []

    for _, df_conf in tqdm(meta_table.iterrows()):

        paper_df = fetch_conf(df_conf)
        workshop_table = fetech_workshop_table(df_conf)
        workshop_df = fetch_workshops_for_conf(workshop_table)
        paper_dfs.append(paper_df)
        workshop_dfs.append(workshop_df)

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path, exist_ok=True)

    workshop_df = pd.concat(workshop_dfs)
    workshop_df['pdf_link'] = workshop_df['pdf_link'].str.replace('../', '', regex=False)
    workshop_df.to_csv(os.path.join(args.save_path, 'cvf_workshops.csv'), index=False)

    paper_df = pd.concat(paper_dfs)
    paper_df.to_csv(os.path.join(args.save_path, 'cvf_papers.csv'), index=False)

    pd.concat([paper_df, workshop_df]).to_csv(os.path.join(args.save_path, 'cvf_all.csv'), index=False)
	"""Download the paper information from thecvf website and save them into csv files

	Examples:
	>> python download_cvf_paper.py --save_path 'save'

	Saved Files:

	* cvf_all.csv
	both paper and workshop papers listed on the thecvf website
	* cvf_papers.csv
	paper information listed on the thecvf website
	* cvf_workshops.csv
	workshop information listed on the thecvf website
	* table features
	* name
	name of the paper
	* authors
	authors string separated with ', '
	* pdf_link
	the URL of the pdf file of the associated paper
	* supp_link
	the URL of the pdf file of the associated paper supplementary materials
	* bibref
	the bibref of the given paper
	* conf_name
	conference name of the paper. For main conf papers, it's like 'CVPR2020'. For
	workshop papers, it's like 'CVPR2020_workshops-Skin Image Analysis', where the
	conference name and workshop name is concatenated with '-'.
	"""

	from bs4 import BeautifulSoup
	import requests
	from tqdm import tqdm
	import re, os
	import pandas as pd
	import argparse

	cvf_base = 'https://openaccess.thecvf.com'

	def fetch_meta_table():
	cvf_page = BeautifulSoup(requests.get(f'{cvf_base}/menu').content, "lxml")
	table = []
	for ele in cvf_page.find('dl').findChildren('dd'):
	table.append(list(map(lambda x: x['href'], ele.find_all('a'))))

	return pd.DataFrame(table, columns=['main', 'workshop'])

	def fetch_conf(df_conf):
	conf_name = df_conf['main'].replace('.py', '')
	conf_suffix = df_conf['main']
	conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml")

	paper_infos = fetech_paper_from_conf_page(conf_page)
	df = pd.DataFrame(paper_infos)
	df['conf_name'] = conf_name
	return df

	def fetech_paper_from_conf_page(conf_page):
	papers = conf_page.find_all(class_='ptitle')

	all_paper_info = []
	for paper in papers:
	paper_author = paper.find_next_sibling()
	paper_meta = paper_author.find_next_sibling()
	paper_links = paper_meta.find_all('a')[:2]
	paper_info = dict(
	name = paper.text,
	authors = ', '.join(map(lambda x: x.text.strip(), paper_author.find_all('a'))),
	pdf_link = paper_links[0]['href'],
	supp_link = paper_links[1].get('href', ''),
	bibref = paper_meta.find(class_='bibref').text.strip()
	)
	all_paper_info.append(paper_info)

	return all_paper_info

	def fetech_workshop_table(df_conf):
	conf_name = df_conf['main'].replace('.py', '')
	conf_suffix = df_conf['workshop']
	conf_page = BeautifulSoup(requests.get(f'{cvf_base}/{conf_suffix}').content, "lxml")

	workshop_table = []
	for ele in conf_page.find_all('dd'):
	workshop_info = ele.find('a')
	if workshop_info.text.strip().lower() != 'back':
	workshop_table.append([
	workshop_info.text.strip(), workshop_info.get('href', '')])

	df = pd.DataFrame(workshop_table, columns=['workshop_name', 'url'])
	df['conf_name'] = conf_suffix.replace('/menu.py', '').replace('/', '')
	return df

	def fetch_workshop(df_workshop):
	conf_name = df_workshop['conf_name']
	workshop_name = conf_name + '-' + df_workshop['workshop_name']
	workshop_suffix = conf_name + '/' + df_workshop['url']
	workshop_page = BeautifulSoup(requests.get(f'{cvf_base}/{workshop_suffix}').content, "lxml")

	paper_infos = fetech_paper_from_conf_page(workshop_page)
	df = pd.DataFrame(paper_infos)
	df['conf_name'] = workshop_name
	return df

	def fetch_workshops_for_conf(workshop_table):

	all_workshops = []
	for _, df_workshop in workshop_table.iterrows():
	all_workshops.append(fetch_workshop(df_workshop))

	return pd.concat(all_workshops, axis=0).reset_index(drop=True)


	parser = argparse.ArgumentParser()
	parser.add_argument('--save_path', type=str, default='.', help='The folder to save all the downloaded files, default to the current directory')


	if __name__ == "__main__":
	args = parser.parse_args()

	meta_table = fetch_meta_table()

	paper_dfs = []
	workshop_dfs = []

	for _, df_conf in tqdm(meta_table.iterrows()):

	paper_df = fetch_conf(df_conf)
	workshop_table = fetech_workshop_table(df_conf)
	workshop_df = fetch_workshops_for_conf(workshop_table)
	paper_dfs.append(paper_df)
	workshop_dfs.append(workshop_df)

	if not os.path.exists(args.save_path):
	os.makedirs(args.save_path, exist_ok=True)

	workshop_df = pd.concat(workshop_dfs)
	workshop_df['pdf_link'] = workshop_df['pdf_link'].str.replace('../', '', regex=False)
	workshop_df.to_csv(os.path.join(args.save_path, 'cvf_workshops.csv'), index=False)

	paper_df = pd.concat(paper_dfs)
	paper_df.to_csv(os.path.join(args.save_path, 'cvf_papers.csv'), index=False)

	pd.concat([paper_df, workshop_df]).to_csv(os.path.join(args.save_path, 'cvf_all.csv'), index=False)