# vered1986/find_papers_by_keywords.py

## find_papers_by_keywords.py
import re
import string
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup


def extract_authors(paper):
	"""
	Return the authors of a paper as a single readable string.

	Starting at the node that follows ``paper.parent``, the chain of
	``next_sibling`` nodes is walked. Nodes that are not ``<a>`` tags are
	skipped, ``<a>`` tags whose ``href`` starts with ``/people/`` are
	collected as authors, and the first ``<a>`` tag that is not a people
	link ends the walk.

	Args:
		paper: The title link of a paper. It must expose ``parent``, and the
			siblings must expose ``name``, ``text``, ``next_sibling`` and
			``get`` (presumably BeautifulSoup nodes).

	Returns:
		``""`` when no author link is found, ``"A"`` for one author,
		``"A and B"`` for two, and ``"A, B, and C"`` for three or more.
	"""
	authors = []
	node = paper.parent.next_sibling
	while node is not None:
		if getattr(node, "name", None) == "a":
			# .get() tolerates anchors that carry no href attribute.
			if not (node.get("href") or "").startswith("/people/"):
				break
			authors.append(node.text)
		node = node.next_sibling

	# An empty list used to crash on authors[-1]; report "no authors" instead.
	if not authors:
		return ""
	if len(authors) <= 2:
		return " and ".join(authors)
	return ", ".join(authors[:-1]) + ", and " + authors[-1]


def _title_tokens(title):
	"""Return the lower-cased words of a title, with and without edge punctuation."""
	words = title.lower().split()
	# Keep the raw words too, so a keyword that itself contains punctuation
	# (e.g. "c++") still matches exactly as before.
	return set(words) | {word.strip(string.punctuation) for word in words}


def find_papers(event, year, keywords):
	"""
	Find papers including any of the keywords that appeared in this venue.

	Downloads ``https://aclanthology.org/events/{event}-{year}``, keeps the
	links whose ``href`` looks like ``/{year}.{event}-main.<number>/`` (and
	that are not "Open PDF" links), and returns those whose title contains
	at least one keyword as a whole word (case-insensitive, punctuation
	around the word ignored).

	Args:
		event: Venue identifier as it appears in the URL, e.g. ``"acl"``.
		year: Year of the event, e.g. ``"2020"``.
		keywords: Iterable of keyword strings.

	Returns:
		A list of ``(title, authors, venue, url)`` tuples. The list is empty
		when nothing matches or when the page cannot be downloaded.
	"""
	keywords = {keyword.lower() for keyword in keywords}
	page_url = f"https://aclanthology.org/events/{event}-{year}"
	headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
	request = urllib.request.Request(page_url, None, headers)

	papers = []
	try:
		with urllib.request.urlopen(request) as response:
			html = response.read()
	except OSError as error:
		# Best effort: a venue/year without a page (HTTP error) or a network
		# failure yields no papers instead of aborting the whole run.
		# URLError and HTTPError are both OSError subclasses.
		print(f"Could not fetch {page_url}: {error}")
	else:
		soup = BeautifulSoup(html, "html.parser")
		# Escape the interpolated parts and the dots so they match literally.
		paper_href = re.compile(
			rf"/{re.escape(str(year))}\.{re.escape(event)}-main\.[0-9]+/"
		)
		for link in soup.find_all("a", {"class": "align-middle"}):
			href = link.get("href") or ""
			if link.get("title") == "Open PDF" or not paper_href.match(href):
				continue
			# .text already drops the inner case-fixing <span> markup.
			title = link.text
			if keywords.isdisjoint(_title_tokens(title)):
				continue
			papers.append((
				title,
				extract_authors(link),
				f"{event.upper()} {year}",
				f"https://aclanthology.org{href}",
			))

	print(f"Found {len(papers)} at {event.upper()} {year}")
	return papers


# Example usage: papers containing "commonsense" in the title from the main
# conferences in 2020 and 2021.
# The accumulator must exist before the first "+=" below.
papers = []
for event in ["emnlp", "acl", "naacl", "eacl", "coling", "aacl"]:
	for year in ["2020", "2021"]:
		papers += find_papers(event, year, ["commonsense"])

# Drop duplicate rows; sorting makes the CSV row order reproducible across runs.
papers = sorted(set(papers))
df = pd.DataFrame(papers, columns=["Title", "Authors", "Venue", "URL"])
df.to_csv("recent_commonsense_papers.csv")
	import re
	import urllib
	import pandas as pd

	from bs4 import BeautifulSoup


	def extract_authors(paper):
		"""
		Return the authors of a paper as a single readable string.

		Starting at the node that follows ``paper.parent``, the chain of
		``next_sibling`` nodes is walked. Nodes that are not ``<a>`` tags are
		skipped, ``<a>`` tags whose ``href`` starts with ``/people/`` are
		collected as authors, and the first ``<a>`` tag that is not a people
		link ends the walk.

		Args:
			paper: The title link of a paper. It must expose ``parent``, and
				the siblings must expose ``name``, ``text``, ``next_sibling``
				and ``get`` (presumably BeautifulSoup nodes).

		Returns:
			``""`` when no author link is found, ``"A"`` for one author,
			``"A and B"`` for two, and ``"A, B, and C"`` for three or more.
		"""
		authors = []
		node = paper.parent.next_sibling
		while node is not None:
			if getattr(node, "name", None) == "a":
				# .get() tolerates anchors that carry no href attribute.
				if not (node.get("href") or "").startswith("/people/"):
					break
				authors.append(node.text)
			node = node.next_sibling

		# An empty list would crash on authors[-1]; report "no authors" instead.
		if not authors:
			return ""
		if len(authors) <= 2:
			return " and ".join(authors)
		return ", ".join(authors[:-1]) + ", and " + authors[-1]


	def _title_tokens(title):
		"""Return the lower-cased words of a title, with and without edge punctuation."""
		words = title.lower().split()
		# Keep the raw words too, so a keyword that itself contains punctuation
		# (e.g. "c++") still matches exactly.
		return set(words) | {word.strip(string.punctuation) for word in words}


	def find_papers(event, year, keywords):
		"""
		Find papers including any of the keywords that appeared in this venue.

		Downloads ``https://aclanthology.org/events/{event}-{year}``, keeps the
		links whose ``href`` looks like ``/{year}.{event}-main.<number>/`` (and
		that are not "Open PDF" links), and returns those whose title contains
		at least one keyword as a whole word (case-insensitive, punctuation
		around the word ignored).

		Args:
			event: Venue identifier as it appears in the URL, e.g. ``"acl"``.
			year: Year of the event, e.g. ``"2020"``.
			keywords: Iterable of keyword strings.

		Returns:
			A list of ``(title, authors, venue, url)`` tuples. The list is
			empty when nothing matches or when the page cannot be downloaded.
		"""
		keywords = {keyword.lower() for keyword in keywords}
		page_url = f"https://aclanthology.org/events/{event}-{year}"
		headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
		request = urllib.request.Request(page_url, None, headers)

		papers = []
		try:
			with urllib.request.urlopen(request) as response:
				html = response.read()
		except OSError as error:
			# Best effort: a venue/year without a page (HTTP error) or a
			# network failure yields no papers instead of aborting the run.
			# URLError and HTTPError are both OSError subclasses.
			print(f"Could not fetch {page_url}: {error}")
		else:
			soup = BeautifulSoup(html, "html.parser")
			# Escape the interpolated parts and the dots so they match literally.
			paper_href = re.compile(
				rf"/{re.escape(str(year))}\.{re.escape(event)}-main\.[0-9]+/"
			)
			for link in soup.find_all("a", {"class": "align-middle"}):
				href = link.get("href") or ""
				if link.get("title") == "Open PDF" or not paper_href.match(href):
					continue
				# .text already drops the inner case-fixing <span> markup.
				title = link.text
				if keywords.isdisjoint(_title_tokens(title)):
					continue
				papers.append((
					title,
					extract_authors(link),
					f"{event.upper()} {year}",
					f"https://aclanthology.org{href}",
				))

		print(f"Found {len(papers)} at {event.upper()} {year}")
		return papers


	# Example usage: papers containing "commonsense" in the title from the main
	# conferences in 2020 and 2021.
	# The accumulator must exist before the first "+=" below.
	papers = []
	for event in ["emnlp", "acl", "naacl", "eacl", "coling", "aacl"]:
		for year in ["2020", "2021"]:
			papers += find_papers(event, year, ["commonsense"])

	# Drop duplicate rows; sorting makes the CSV row order reproducible across runs.
	papers = sorted(set(papers))
	df = pd.DataFrame(papers, columns=["Title", "Authors", "Venue", "URL"])
	df.to_csv("recent_commonsense_papers.csv")