Parse main CVF conference titles and abstracts (e.g., CVPR, ICCV)

Two scripts: the first builds a word cloud from a text file of titles or abstracts; the second scrapes paper titles (and optionally abstracts) from openaccess.thecvf.com.
# Script 1: build a word cloud from a text file of paper titles or abstracts.
# Run with the text file as the first command-line argument, e.g. cvf_titles.txt.

import sys

import matplotlib.pyplot as plt
from wordcloud import WordCloud

filename = sys.argv[1]
print(filename)

with open(filename, "r") as f:
    text = f.read()

# Stopwords: common English function words plus boilerplate terms
# that appear frequently in paper titles and abstracts.
blacklist = [
    "et", "al", "al.", "fig", "figure", "section", "using", "used", "show", "results", "method", "proposed", "paper", "for", "and", "by", "in", "the",
    "we", "our", "this", "that", "to", "of", "a", "an", "is", "are", "on", "with", "from", "as", "at", "it", "be", "can", "which", "has", "have", "been",
    "learning", "via", "based", "toward", "towards", "over", "under", "above", "below", "between", "among", "within", "without", "across", "along",
]

# Generate the cloud and display it with matplotlib.
wordcloud = WordCloud(max_font_size=120, width=1280, height=920, stopwords=blacklist).generate_from_text(text)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.show()
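If you want the cloud written to disk instead of (or in addition to) an interactive window, for example on a headless machine, the wordcloud library can export the rendered image directly. A minimal sketch; the output filename here is an arbitrary choice:

# Save the rendered cloud as a PNG (output path is an assumption, not from the gist).
wordcloud.to_file("cvf_wordcloud.png")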
# Script 2: scrape paper titles (and optionally abstracts) from openaccess.thecvf.com.
# Optional flags: --conference_names CVPR ICCV ...  and  --get_abstracts

import argparse

import requests
from bs4 import BeautifulSoup
from tika import parser as tika_parser
from tqdm import tqdm
import numpy as np


def get_parsed_html(url):
    """Download a page and return its parsed DOM."""
    res = requests.get(url)
    html = res.content
    parsed_html = BeautifulSoup(html, features="lxml")
    return parsed_html


def extract_abstract(url):
    """Extract the abstract from a CVF paper page; return "" if missing."""
    html = get_parsed_html(url)
    content = html.body.find("div", attrs={"id": "abstract"})
    if content is None:
        return ""
    # Trim the leading and trailing newline surrounding the abstract text.
    return content.text[1:-1]


def extract_abstract_from_pdf(filename):
    """Fallback: extract the abstract from a local PDF via Apache Tika."""
    raw = tika_parser.from_file(filename)
    text = raw["content"]
    # Take the text between the "Abstract" heading (8 = len("Abstract"))
    # and the "1. Introduction" heading.
    abstract = text[text.find("Abstract") + 8:text.find("1. Introduction")]
    return abstract


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--get_abstracts", action="store_true")
    argparser.add_argument("--conference_names", nargs="+", type=str, default=[])
    args = argparser.parse_args()

    titles = []
    abstracts = []

    # The CVF open access menu lists every conference edition as a <dd> entry.
    root = "http://openaccess.thecvf.com/"
    html = get_parsed_html(root + "menu.py")
    data = html.body.findAll("dd")

    for x in data:
        # Entry text looks like " CVPR 2023 [...]"; slice out the name.
        conference_name = x.text[1:x.text.find("[") - 1]
        if len(args.conference_names) > 0 and conference_name.split(" ")[0] not in args.conference_names:
            print("Skipping:", conference_name)
            continue
        print("Processing:", conference_name)

        # Each paper on the "all days" listing is a <dt class="ptitle"> entry.
        conference_url = x.find("a")["href"]
        html = get_parsed_html(root + conference_url + "?day=all")
        content = html.body.findAll("dt", attrs={"class": "ptitle"})

        for item in tqdm(content):
            paper_title = item.text
            paper_url = item.find("a")["href"]
            if args.get_abstracts:
                paper_abstract = extract_abstract(root + paper_url)

            # save title and abstract
            titles.append(paper_title)
            if args.get_abstracts:
                abstracts.append(paper_abstract)

    # Write one title (and, optionally, one abstract) per line.
    np.savetxt("cvf_titles.txt", titles, "%s")
    if args.get_abstracts:
        np.savetxt("cvf_abstracts.txt", abstracts, "%s")
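Note that extract_abstract_from_pdf is defined but never called in the main block; it is meant as a fallback when a paper page has no abstract div and you only have the PDF. A minimal sketch of how one might use it, assuming a Tika server is reachable; the file path is hypothetical:

# Hypothetical fallback: pull the abstract out of a locally downloaded PDF
# when extract_abstract(url) came back empty. The path is an assumption.
abstract = extract_abstract_from_pdf("paper.pdf")
print(abstract)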