Skip to content

Instantly share code, notes, and snippets.

@GuyAglionby
Last active June 12, 2020 21:13
Show Gist options
  • Save GuyAglionby/8da5bf4852833c3d87516b1265282bda to your computer and use it in GitHub Desktop.
Save GuyAglionby/8da5bf4852833c3d87516b1265282bda to your computer and use it in GitHub Desktop.
Convert a CSV with columns (title, author, paper type) into a format that MiniConf accepts as input
import pandas as pd
import re
from typing import List
# Separators between author names: ", ", " and ", and the Oxford-comma form
# ", and " (which would otherwise leave a spurious "and X" entry).
re_author_split = re.compile(r',? and |, ')

# A brace group holding only letters, digits and spaces, e.g. "{BERT}".
re_curly_brace = re.compile(r'{([A-Za-z0-9 ]+)}')

# One or more characters allowed inside the braces of the LaTeX commands
# matched below. Braces and backslashes are deliberately not included, so a
# command whose argument contains another command does not match.
acceptable_chars = r"['`\/:\-()?\w\s\d.,]+"

# A line break together with the spaces immediately around it.
re_newline = re.compile(r'[ ]*\n[ ]*')

# Old-style and new-style font commands; group 1 is the wrapped text.
re_inline_italics = re.compile(r'{\\(?:em|it) (' + acceptable_chars + ')}')
re_italics = re.compile(r'\\(?:emph|textit){(' + acceptable_chars + ')}')
re_inline_sc = re.compile(r'{\\sc (' + acceptable_chars + ')}')
re_textsc = re.compile(r'\\textsc{(' + acceptable_chars + ')}')
re_inline_bf = re.compile(r'{\\bf (' + acceptable_chars + ')}')
re_textbf = re.compile(r'\\textbf{(' + acceptable_chars + ')}')
re_textrm = re.compile(r'\\textrm{(' + acceptable_chars + ')}')

re_url = re.compile(r'\\url{(' + acceptable_chars + ')}')
re_footnote = re.compile(r'\\footnote{(' + acceptable_chars + ')}')

# Inline math delimited by dollar signs (non-greedy); group 1 is the content.
re_mathmode = re.compile(r'\$(.*?)\$')

# \cite, \citep and \citet, with an optional preceding "~" tie.
re_cite = re.compile(r'~?\\cite[pt]?{(' + acceptable_chars + ')}')

re_multi_space = re.compile(r'\s+')

# Only all-digit super/subscripts are matched; group 1 is the digit string.
re_superscript = re.compile(r'\\textsuperscript{(\d+)}')
re_subscript = re.compile(r'\\textsubscript{(\d+)}')

# Literal LaTeX snippets and their plain-text replacements. Keys are raw
# strings so the backslashes are not treated as (invalid) escape sequences.
direct_replacements = {
    r'\%': '%',
    r'\&': '&',
    r'$\sim$': '~',
    r'\alpha': 'α',  # Greek small alpha (U+03B1)
    r'\beta': 'β',  # Greek small beta (U+03B2)
    r'\gamma': 'γ',  # Greek small gamma (U+03B3)
    r'\propto': '∝',
    r'\Rightarrow': '⇒',
    r'\Leftrightarrow': '⇔',
    r'\Leftarrow': '⇐',
}

# ASCII digit -> Unicode subscript / superscript digit.
subscript_map = {
    '0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄',
    '5': '₅', '6': '₆', '7': '₇', '8': '₈', '9': '₉',
}
superscript_map = {
    '0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴',
    '5': '⁵', '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹',
}
def convert_superscript_match(match):
    """Render group 1 of ``match`` with Unicode superscript characters.

    Every character of the captured text is looked up in
    ``superscript_map``; a character that is not a key of that map raises
    ``KeyError``.
    """
    captured = str(match.group(1))
    return ''.join(map(superscript_map.__getitem__, captured))
def convert_subscript_match(match):
    """Render group 1 of ``match`` with Unicode subscript characters.

    Every character of the captured text is looked up in ``subscript_map``;
    a character that is not a key of that map raises ``KeyError``.
    """
    captured = str(match.group(1))
    return ''.join(map(subscript_map.__getitem__, captured))
def clean_abstract(abstract):
    """Strip common LaTeX markup from an abstract and return plain text.

    Steps, in order:

    1. Apply the literal substitutions in ``direct_replacements``.
    2. Replace each line break (with the spaces around it) by one space.
    3. Repeatedly unwrap/remove LaTeX commands until the text stops
       changing: digit super/subscripts become Unicode characters, font
       commands and ``\\url`` keep only their argument, citations become a
       space, footnotes become `` (text)``, and ``$...$`` keeps its content.
    4. Collapse every run of whitespace to a single space.

    Args:
        abstract: The raw abstract as a ``str``.

    Returns:
        The cleaned abstract as a ``str``.
    """
    for source, dest in direct_replacements.items():
        abstract = abstract.replace(source, dest)
    abstract = re_newline.sub(' ', abstract)

    # (pattern, replacement) pairs, in the order they are applied per pass.
    substitutions = (
        (re_superscript, convert_superscript_match),
        (re_subscript, convert_subscript_match),
        (re_textsc, r'\1'),
        (re_inline_sc, r'\1'),
        (re_textrm, r'\1'),
        (re_textbf, r'\1'),
        (re_inline_bf, r'\1'),
        (re_cite, ' '),
        (re_url, r'\1'),
        (re_footnote, r' (\1)'),
        (re_mathmode, r'\1'),
        (re_inline_italics, r'\1'),
        (re_italics, r'\1'),
    )

    # The command patterns only match arguments free of braces, backslashes
    # and "$", so an outer command such as \textbf{\emph{x}} or
    # \footnote{see $x$} cannot match until its inner markup is gone. A
    # single pass therefore leaves nested markup behind; repeat until the
    # text is stable. Every successful substitution removes a backslash
    # command or a pair of "$", so the loop terminates.
    while True:
        before = abstract
        for pattern, replacement in substitutions:
            abstract = pattern.sub(replacement, abstract)
        if abstract == before:
            break

    return re_multi_space.sub(' ', abstract)
def clean_title(paper_title):
    """Return ``paper_title`` with simple LaTeX artefacts removed.

    First every key of ``direct_replacements`` found in the title is
    replaced by its mapped value, then each brace group matched by
    ``re_curly_brace`` is replaced by the text inside the braces.
    """
    cleaned = paper_title
    for latex in direct_replacements:
        cleaned = cleaned.replace(latex, direct_replacements[latex])
    return re_curly_brace.sub(r'\1', cleaned)
def miniconf_join_list(lst):
    """Concatenate the strings in ``lst`` into one string separated by ``|``.

    An empty iterable yields the empty string.
    """
    separator = '|'
    return separator.join(lst)
def parse_authors(author_string):
    """Split an author string into a list of individual author names.

    The string is split with ``re_author_split``. Each piece is stripped of
    surrounding whitespace, and empty pieces (e.g. produced by a trailing
    separator or an empty input) are dropped so they do not show up as
    blank authors.

    Args:
        author_string: Author names in a single ``str``.

    Returns:
        A list of non-empty author name strings, in their original order.
    """
    authors = []
    for piece in re_author_split.split(author_string):
        name = piece.strip()
        # "A, B, and C" split at ", " leaves "and C"; drop the conjunction.
        if name.startswith('and '):
            name = name[len('and '):].strip()
        if name:
            authors.append(name)
    return authors
def main(
    filename='ACL2020 Accepted Papers Information (to share with other chairs) - Sheet1.csv',
    track_filename='paper_tracks.xls',
    output_filename='papers.csv',
    acl_id_stub='1.',
):
    """Build the papers CSV from the accepted-papers sheet and the track sheet.

    Reads the papers CSV and the track spreadsheet, checks that both contain
    exactly the same submission IDs, merges them, cleans the author, abstract
    and title columns, and writes the columns ``UID``, ``title``,
    ``authors``, ``abstract``, ``keywords``, ``session`` and ``paper_type``
    to ``output_filename``, ordered by the ``Line order`` column.

    Args:
        filename: Path of the papers CSV. It is read for the columns
            ``Submission ID``, ``Authors``, ``Abstract``, ``title``,
            ``Line order`` and ``Submission Type`` (some of these may come
            from the track sheet after the merge).
        track_filename: Path of the spreadsheet read with ``pd.read_excel``;
            its ``ID`` column is matched against ``Submission ID``.
        output_filename: Path of the CSV to write.
        acl_id_stub: Prefix prepended to each ``Line order`` value to form
            the ``UID``.

    Raises:
        ValueError: If the two inputs do not contain the same set of IDs.
    """
    papers = pd.read_csv(filename)
    track_details = pd.read_excel(track_filename)

    # Validate explicitly instead of with `assert`, which is skipped under
    # `python -O` and gives no hint about which IDs are wrong.
    paper_ids = set(papers['Submission ID'].values)
    track_ids = set(track_details['ID'].values)
    if paper_ids != track_ids:
        raise ValueError(
            'Submission IDs differ between {!r} and {!r}: '
            'only in papers: {}; only in tracks: {}'.format(
                filename,
                track_filename,
                sorted(paper_ids - track_ids, key=str),
                sorted(track_ids - paper_ids, key=str),
            )
        )

    papers = papers.merge(right=track_details, left_on='Submission ID', right_on='ID')

    papers['authors'] = papers['Authors'].apply(lambda x: miniconf_join_list(parse_authors(x)))
    papers['Abstract'] = papers['Abstract'].apply(clean_abstract)
    papers['title'] = papers['title'].apply(clean_title)
    papers['UID'] = papers['Line order'].apply(lambda x: acl_id_stub + str(x))
    papers['paper_type'] = papers['Submission Type']
    papers = papers.rename(columns={'Abstract': 'abstract', 'track': 'session'})
    papers['keywords'] = ''

    # Sort on the full frame, then select the output columns; this avoids
    # mutating a `.loc` slice in place and drops `Line order` implicitly.
    papers = papers.sort_values(by='Line order', axis=0)
    output_columns = ['UID', 'title', 'authors', 'abstract', 'keywords', 'session', 'paper_type']
    papers[output_columns].to_csv(output_filename, index=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment