Last active
June 12, 2020 21:13
-
-
Save GuyAglionby/8da5bf4852833c3d87516b1265282bda to your computer and use it in GitHub Desktop.
Convert a CSV with columns (title, author, paper type) into the format that MiniConf accepts.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
from typing import List | |
# Compiled patterns and lookup tables shared by the cleaning functions.
# All regex sources are raw strings so that backslashes reach the regex
# engine untouched (non-raw '\w', '\-', '\%' ... are invalid string escapes
# and emit SyntaxWarning on recent Python versions).

# Separators between author names: " and " or ", ".
re_author_split = re.compile(r' and |, ')
# A brace group containing only letters, digits and spaces, e.g. "{BERT}".
re_curly_brace = re.compile(r'{([A-Za-z0-9 ]+)}')
# Characters allowed inside the argument of a LaTeX command that we unwrap.
# Braces and backslashes are deliberately absent, so a command whose argument
# still contains another command does not match.
acceptable_chars = r"['`\/:\-()?\w\s\d.,]+"
# A line break together with any spaces around it.
re_newline = re.compile(r'[ ]*\n[ ]*')
# Old-style "{\em text}" / "{\it text}".
re_inline_italics = re.compile(r'{\\(?:em|it) (' + acceptable_chars + ')}')
# "\emph{text}" / "\textit{text}".
re_italics = re.compile(r'\\(?:emph|textit){(' + acceptable_chars + ')}')
# Old-style "{\sc text}".
re_inline_sc = re.compile(r'{\\sc (' + acceptable_chars + ')}')
re_textsc = re.compile(r'\\textsc{(' + acceptable_chars + ')}')
# Old-style "{\bf text}".
re_inline_bf = re.compile(r'{\\bf (' + acceptable_chars + ')}')
re_textbf = re.compile(r'\\textbf{(' + acceptable_chars + ')}')
re_textrm = re.compile(r'\\textrm{(' + acceptable_chars + ')}')
re_url = re.compile(r'\\url{(' + acceptable_chars + ')}')
re_footnote = re.compile(r'\\footnote{(' + acceptable_chars + ')}')
# Inline math "$...$" (non-greedy, so each pair of dollars is one match).
re_mathmode = re.compile(r'\$(.*?)\$')
# "\cite{..}", "\citep{..}", "\citet{..}", optionally preceded by a "~" tie.
re_cite = re.compile(r'~?\\cite[pt]?{(' + acceptable_chars + ')}')
re_multi_space = re.compile(r'\s+')
# Only purely numeric super/subscripts are recognised.
re_superscript = re.compile(r'\\textsuperscript{(\d+)}')
re_subscript = re.compile(r'\\textsubscript{(\d+)}')

# Literal LaTeX fragments and their plain-text/Unicode equivalents.
# The Greek letters are the real Greek code points (U+03B1, U+03B2, U+03B3),
# not the visually similar Latin/IPA letters.
direct_replacements = {
    '\\%': '%',
    '\\&': '&',
    '$\\sim$': '~',
    '\\alpha': 'α',
    '\\beta': 'β',
    '\\gamma': 'γ',
    '\\propto': '∝',
    '\\Rightarrow': '⇒',
    '\\Leftrightarrow': '⇔',
    '\\Leftarrow': '⇐',
}
# ASCII digit -> Unicode subscript / superscript digit.
subscript_map = {
    '0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄',
    '5': '₅', '6': '₆', '7': '₇', '8': '₈', '9': '₉',
}
superscript_map = {
    '0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴',
    '5': '⁵', '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹',
}
def convert_superscript_match(match):
    """Render the text captured in group 1 of *match* as superscript digits.

    Each character is looked up in ``superscript_map``; a character missing
    from that table raises ``KeyError``. Suitable as the replacement callback
    of ``re_superscript.sub``.
    """
    captured = str(match.group(1))
    return ''.join(map(superscript_map.__getitem__, captured))
def convert_subscript_match(match):
    """Render the text captured in group 1 of *match* as subscript digits.

    Each character is looked up in ``subscript_map``; a character missing
    from that table raises ``KeyError``. Suitable as the replacement callback
    of ``re_subscript.sub``.
    """
    captured = str(match.group(1))
    return ''.join(map(subscript_map.__getitem__, captured))
def clean_abstract(abstract):
    r"""Strip common LaTeX markup from an abstract and return plain text.

    Processing order:

    1. apply the literal substitutions in ``direct_replacements``;
    2. replace every line break (plus surrounding spaces) with one space;
    3. convert numeric ``\textsuperscript{..}`` / ``\textsubscript{..}``
       to Unicode super/subscript digits;
    4. unwrap small-caps, roman, bold and italic commands, replace
       citations with a space, keep the text of ``\url{..}``, put footnote
       text in parentheses, and drop the ``$`` delimiters of inline math;
    5. collapse every whitespace run to a single space.

    Step 4 is repeated until the text no longer changes. The patterns only
    match arguments free of braces and backslashes, so with a single pass
    ``\textbf{\emph{x}}`` would be left as ``\textbf{x}``: the outer command
    is tried before the inner one has been unwrapped.

    :param abstract: the raw abstract text (a ``str``).
    :return: the cleaned abstract.
    """
    for source, dest in direct_replacements.items():
        abstract = abstract.replace(source, dest)
    abstract = re_newline.sub(' ', abstract)
    abstract = re_superscript.sub(convert_superscript_match, abstract)
    abstract = re_subscript.sub(convert_subscript_match, abstract)

    # (pattern, replacement) pairs, in the order they are applied per pass.
    markup_rules = (
        (re_textsc, r'\1'),
        (re_inline_sc, r'\1'),
        (re_textrm, r'\1'),
        (re_textbf, r'\1'),
        (re_inline_bf, r'\1'),
        (re_cite, ' '),
        (re_url, r'\1'),
        (re_footnote, r' (\1)'),
        (re_mathmode, r'\1'),
        (re_inline_italics, r'\1'),
        (re_italics, r'\1'),
    )
    # Every rule replaces a match with something shorter, so the text shrinks
    # on each changing pass and the loop is guaranteed to terminate.
    previous = None
    while previous != abstract:
        previous = abstract
        for pattern, replacement in markup_rules:
            abstract = pattern.sub(replacement, abstract)

    return re_multi_space.sub(' ', abstract)
def clean_title(paper_title):
    """Return *paper_title* with simple LaTeX artefacts removed.

    First applies every literal substitution in ``direct_replacements``,
    then removes the braces around groups matched by ``re_curly_brace``
    (letters, digits and spaces only), keeping the text inside them.
    """
    cleaned = paper_title
    for latex_fragment in direct_replacements:
        cleaned = cleaned.replace(latex_fragment, direct_replacements[latex_fragment])
    return re_curly_brace.sub(r'\1', cleaned)
def miniconf_join_list(lst):
    """Join the strings in *lst* into one pipe-separated string.

    This is the multi-value format the script writes to its output CSV
    (used for the author list).
    """
    delimiter = '|'
    return delimiter.join(lst)
def parse_authors(author_string):
    """Split an author list such as ``"A, B and C"`` into individual names.

    The string is split with ``re_author_split`` (on ``" and "`` and
    ``", "``), then each piece is tidied:

    * surrounding whitespace is removed;
    * a leading ``"and "`` is dropped. With an Oxford comma
      (``"A, B, and C"``) the ``", "`` separator matches first and would
      otherwise leave ``"and C"`` as a name;
    * empty pieces (e.g. from a trailing separator) are discarded.

    :param author_string: the author list as a single ``str``.
    :return: list of author names, in their original order.
    """
    conjunction = 'and '
    authors = []
    for piece in re_author_split.split(author_string):
        name = piece.strip()
        if name.startswith(conjunction):
            name = name[len(conjunction):].lstrip()
        if name:
            authors.append(name)
    return authors
def main(papers_filename='ACL2020 Accepted Papers Information (to share with other chairs) - Sheet1.csv',
         track_filename='paper_tracks.xls',
         output_filename='papers.csv'):
    """Convert the accepted-papers sheet plus track list into a MiniConf CSV.

    Reads the papers CSV and the track spreadsheet, joins them on the
    submission ID, cleans titles/abstracts/authors, and writes a CSV with
    the columns ``UID, title, authors, abstract, keywords, session,
    paper_type`` ordered by the ``Line order`` column.

    Columns read from the merged data: ``Submission ID``, ``ID``,
    ``Authors``, ``Abstract``, ``title``, ``Line order``,
    ``Submission Type`` and ``track``.

    :param papers_filename: path of the papers CSV.
    :param track_filename: path of the track spreadsheet (read with
        ``pandas.read_excel``).
    :param output_filename: path of the CSV to write.
    :raises ValueError: if the two inputs do not contain exactly the same
        set of submission IDs.
    """
    papers = pd.read_csv(papers_filename)
    track_details = pd.read_excel(track_filename)

    # Validate explicitly rather than with ``assert`` so the check survives
    # ``python -O`` and the error says which IDs are unmatched.
    paper_ids = set(papers['Submission ID'].values)
    track_ids = set(track_details['ID'].values)
    if paper_ids != track_ids:
        raise ValueError(
            'Submission IDs differ between {!r} and {!r}: '
            'only in papers: {}; only in tracks: {}'.format(
                papers_filename,
                track_filename,
                sorted(paper_ids - track_ids, key=str),
                sorted(track_ids - paper_ids, key=str),
            )
        )

    papers = papers.merge(right=track_details, left_on='Submission ID', right_on='ID')

    # Prefix put in front of each paper's line order to form its UID.
    acl_id_stub = '1.'

    papers['authors'] = papers['Authors'].apply(lambda x: miniconf_join_list(parse_authors(x)))
    papers['Abstract'] = papers['Abstract'].apply(clean_abstract)
    papers['title'] = papers['title'].apply(clean_title)
    papers['UID'] = papers['Line order'].apply(lambda x: acl_id_stub + str(x))
    papers['paper_type'] = papers['Submission Type']
    papers = papers.rename(columns={'Abstract': 'abstract', 'track': 'session'})
    papers['keywords'] = ''

    # 'Line order' is kept only long enough to sort by it.
    papers = papers.loc[:, ['Line order', 'UID', 'title', 'authors', 'abstract',
                            'keywords', 'session', 'paper_type']]
    papers = papers.sort_values(by='Line order', axis=0)
    papers = papers.drop(columns='Line order')

    papers.to_csv(output_filename, index=False)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment