Skip to content

Instantly share code, notes, and snippets.

@vered1986
Last active December 13, 2021 21:19
Show Gist options
  • Save vered1986/8b07f3338598984bacac57fd6f3c519a to your computer and use it in GitHub Desktop.
Find papers by keywords in ACL Anthology
import re
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
def extract_authors(paper):
    """
    Collect the author names listed after a paper's title link.

    Walks the siblings that follow the paper link's parent, gathering the
    text of every <a> tag whose href points under "/people/", and stops at
    the first link that is not a people link.

    :param paper: the BeautifulSoup <a> tag of the paper title.
    :return: the authors joined citation-style ("A", "A and B",
             "A, B, and C"), or "" when no author links are found.
    """
    authors = []
    sibling = paper.parent.next_sibling  # renamed: `next` shadowed the builtin
    while sibling is not None:
        if sibling.name != "a":
            # Skip text nodes / other tags between author links.
            sibling = sibling.next_sibling
            continue
        if sibling["href"].startswith("/people/"):
            authors.append(sibling.text)
            sibling = sibling.next_sibling
        else:
            # First non-people link marks the end of the author list.
            break
    # The original raised IndexError (authors[-1] on []) when no authors
    # were found; return an empty string instead.
    if not authors:
        return ""
    if len(authors) == 1:
        return authors[0]
    if len(authors) == 2:
        return " and ".join(authors)
    return ", ".join(authors[:-1]) + ", and " + authors[-1]
def find_papers(event, year, keywords):
    """
    Find papers whose titles contain any of the keywords at a given venue.

    Fetches the ACL Anthology event page for ``event``/``year``, extracts
    the main-track paper links, and keeps the titles that share at least
    one (case-insensitive) word with ``keywords``.

    Failures (network, parsing) are reported and yield an empty list so a
    batch over many venues keeps going — best-effort, as before, but no
    longer silent.

    :param event: venue code, e.g. "acl" or "emnlp".
    :param year: four-digit year as a string, e.g. "2021".
    :param keywords: iterable of keywords to match against title words.
    :return: list of (title, authors, venue, url) tuples.
    """
    papers = []
    try:
        keywords = set(map(str.lower, keywords))
        url = f"https://aclanthology.org/events/{event}-{year}"
        # Some servers refuse requests without a browser-like User-Agent.
        user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url, None, headers)
        # `with` closes the connection (the original leaked the response).
        with urllib.request.urlopen(req) as response:
            html = response.read()
        # Explicit parser: without it BeautifulSoup's choice depends on
        # what happens to be installed, which can change the results.
        soup = BeautifulSoup(html, "html.parser")
        all_links = soup.find_all("a", {"class": "align-middle"})
        # Main-track paper hrefs look like /{year}.{event}-main.{num}/ ;
        # the dots are escaped (the original regex treated them as wildcards).
        href_pattern = re.compile(rf"/{year}\.{event}-main\.[0-9]+/")
        all_papers = [
            paper for paper in all_links
            if href_pattern.match(paper["href"]) and paper.get("title") != "Open PDF"
        ]
        # Remove case span and find authors
        all_papers = [
            (
                paper.text.replace('<span class="acl-fixed-case">', "").replace("</span>", ""),
                f'https://aclanthology.org{paper["href"]}',
                extract_authors(paper),
            )
            for paper in all_papers
        ]
        # Filter by keyword: keep titles sharing at least one word.
        papers = [
            (title, authors, f'{event.upper()} {year}', url)
            for title, url, authors in all_papers
            if set(title.lower().split()) & keywords
        ]
    except Exception as e:
        # Best-effort boundary: report instead of the original bare
        # `except: pass`, which hid every failure including typos.
        print(f"Failed to fetch {event.upper()} {year}: {e}")
    print(f"Found {len(papers)} at {event.upper()} {year}")
    return papers
# Example usage: papers containing "commonsense" in the title from the main
# conferences in the last 2 years, saved to a CSV file.
papers = []  # the original never initialized this, so `papers +=` raised NameError
for event in ["emnlp", "acl", "naacl", "eacl", "coling", "aacl"]:
    for year in ["2020", "2021"]:
        papers += find_papers(event, year, ["commonsense"])

# De-duplicate (the same paper tuple may have been collected twice).
papers = list(set(papers))
df = pd.DataFrame(papers, columns=["Title", "Authors", "Venue", "URL"])
df.to_csv("recent_commonsense_papers.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment