Skip to content

Instantly share code, notes, and snippets.

@VikParuchuri
Created August 29, 2023 19:42
Show Gist options
  • Save VikParuchuri/65ecb68d6d0d6e2b68d5ee7a66b97e81 to your computer and use it in GitHub Desktop.
Save VikParuchuri/65ecb68d6d0d6e2b68d5ee7a66b97e81 to your computer and use it in GitHub Desktop.
def code_filter(code: str) -> bool:
    """Return True if ``code`` passes a set of low-quality-sample heuristics.

    Filters out low quality training code, with an aim to improving benchmark
    performance. Works best with Python.

    A sample is rejected (``False``) when any of the following holds:

    * it contains a placeholder TODO ("TODO: implement ...", "TODO fill ...");
    * a ``#`` or ``//`` comment starts with a Python keyword, which is treated
      as commented-out code;
    * a ``#`` or ``//`` comment starts with ``1.`` (numbered step lists);
    * it contains an empty ``#`` or ``//`` comment directly before a newline;
    * it contains an indented ``pass`` statement or the text
      ``NotImplemented``;
    * no line returns something other than ``None``.

    Args:
        code: Source text of the sample to evaluate.

    Returns:
        ``True`` when none of the rejection rules match, ``False`` otherwise.
    """
    # Function-scope import so the block does not depend on a module-level
    # import being present elsewhere in the file.
    import re

    # Python keywords
    keywords = (
        "assert", "async", "await", "break", "class", "continue", "def", "del",
        "elif", "else", "except", "finally", "for", "from", "global", "if",
        "import", "in", "is", "lambda", "nonlocal", "not", "or", "pass",
        "raise", "return", "try", "while", "with", "yield",
    )
    # This covers single line comments in python + other common languages
    comment_symbols = ("#", "//")

    # Filter out todos
    if re.search(r'TODO.{0,3}(put|fill|implement|write|your)', code, flags=re.IGNORECASE):
        return False

    # Filter commented code: comment symbol, at most one character, a keyword,
    # then whitespace.
    commented_code = "|".join(
        re.escape(symbol) + '.{0,1}' + keyword + r'\s'
        for symbol in comment_symbols
        for keyword in keywords
    )
    if re.search(commented_code, code):
        return False

    # Filter out numbers after comment symbols. Built from comment_symbols so
    # that only real comment markers count; a lone "/" (as in "x / 1. + 2")
    # is division, not a comment.
    numbered_comment = "|".join(
        re.escape(symbol) + r'.{0,1}1\.\s' for symbol in comment_symbols
    )
    if re.search(numbered_comment, code):
        return False

    # Empty comments
    if re.search(r'(#|//)\n', code):
        return False

    # Functions not implemented: an indented `pass` statement at any
    # indentation depth (spaces or tabs). The word boundary keeps identifiers
    # such as `password` or `pass_through` from matching.
    if re.search(r'^[ \t]+pass\b', code, re.MULTILINE) or "NotImplemented" in code:
        return False

    # Ensure we return something in at least one line. Whitespace is limited
    # to spaces/tabs so the match cannot run past the end of the `return`
    # line, and `\S` after the lookahead stops extra spaces from hiding
    # a `None`.
    returns_value = r'^[ \t]*return[ \t]+(?!None\b)\S.*$'
    return re.search(returns_value, code, re.MULTILINE) is not None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment