Parse main CVF conference titles and abstracts (e.g., CVPR, ICCV)

Two scripts: the first builds a word cloud from a text file of titles or abstracts; the second scrapes paper titles (and optionally abstracts) from openaccess.thecvf.com.
# Script 1: build a word cloud from a text file of paper titles or abstracts.
# Run with the text file as the first command-line argument, e.g. cvf_titles.txt.

import sys

import matplotlib.pyplot as plt
from wordcloud import WordCloud

filename = sys.argv[1]
print(filename)

with open(filename, "r") as f:
    text = f.read()

# Stopwords: common English function words plus boilerplate terms
# that appear frequently in paper titles and abstracts.
blacklist = [
    "et", "al", "al.", "fig", "figure", "section", "using", "used", "show", "results", "method", "proposed", "paper", "for", "and", "by", "in", "the",
    "we", "our", "this", "that", "to", "of", "a", "an", "is", "are", "on", "with", "from", "as", "at", "it", "be", "can", "which", "has", "have", "been",
    "learning", "via", "based", "toward", "towards", "over", "under", "above", "below", "between", "among", "within", "without", "across", "along",
]

# Generate the cloud and display it with matplotlib.
wordcloud = WordCloud(max_font_size=120, width=1280, height=920, stopwords=blacklist).generate_from_text(text)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.show()
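If you want the cloud written to disk instead of (or in addition to) an interactive window, for example on a headless machine, the wordcloud library can export the rendered image directly. A minimal sketch; the output filename here is an arbitrary choice:

# Save the rendered cloud as a PNG (output path is an assumption, not from the gist).
wordcloud.to_file("cvf_wordcloud.png")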
# Script 2: scrape paper titles (and optionally abstracts) from openaccess.thecvf.com.
# Optional flags: --conference_names CVPR ICCV ...  and  --get_abstracts

import argparse

import requests
from bs4 import BeautifulSoup
from tika import parser as tika_parser
from tqdm import tqdm
import numpy as np


def get_parsed_html(url):
    """Download a page and return its parsed DOM."""
    res = requests.get(url)
    html = res.content
    parsed_html = BeautifulSoup(html, features="lxml")
    return parsed_html


def extract_abstract(url):
    """Extract the abstract from a CVF paper page; return "" if missing."""
    html = get_parsed_html(url)
    content = html.body.find("div", attrs={"id": "abstract"})
    if content is None:
        return ""
    # Trim the leading and trailing newline surrounding the abstract text.
    return content.text[1:-1]


def extract_abstract_from_pdf(filename):
    """Fallback: extract the abstract from a local PDF via Apache Tika."""
    raw = tika_parser.from_file(filename)
    text = raw["content"]
    # Take the text between the "Abstract" heading (8 = len("Abstract"))
    # and the "1. Introduction" heading.
    abstract = text[text.find("Abstract") + 8:text.find("1. Introduction")]
    return abstract


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--get_abstracts", action="store_true")
    argparser.add_argument("--conference_names", nargs="+", type=str, default=[])
    args = argparser.parse_args()

    titles = []
    abstracts = []

    # The CVF open access menu lists every conference edition as a <dd> entry.
    root = "http://openaccess.thecvf.com/"
    html = get_parsed_html(root + "menu.py")
    data = html.body.findAll("dd")

    for x in data:
        # Entry text looks like " CVPR 2023 [...]"; slice out the name.
        conference_name = x.text[1:x.text.find("[") - 1]
        if len(args.conference_names) > 0 and conference_name.split(" ")[0] not in args.conference_names:
            print("Skipping:", conference_name)
            continue
        print("Processing:", conference_name)

        # Each paper on the "all days" listing is a <dt class="ptitle"> entry.
        conference_url = x.find("a")["href"]
        html = get_parsed_html(root + conference_url + "?day=all")
        content = html.body.findAll("dt", attrs={"class": "ptitle"})

        for item in tqdm(content):
            paper_title = item.text
            paper_url = item.find("a")["href"]
            if args.get_abstracts:
                paper_abstract = extract_abstract(root + paper_url)

            # save title and abstract
            titles.append(paper_title)
            if args.get_abstracts:
                abstracts.append(paper_abstract)

    # Write one title (and, optionally, one abstract) per line.
    np.savetxt("cvf_titles.txt", titles, "%s")
    if args.get_abstracts:
        np.savetxt("cvf_abstracts.txt", abstracts, "%s")
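Note that extract_abstract_from_pdf is defined but never called in the main block; it is meant as a fallback when a paper page has no abstract div and you only have the PDF. A minimal sketch of how one might use it, assuming a Tika server is reachable; the file path is hypothetical:

# Hypothetical fallback: pull the abstract out of a locally downloaded PDF
# when extract_abstract(url) came back empty. The path is an assumption.
abstract = extract_abstract_from_pdf("paper.pdf")
print(abstract)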