Skip to content

Instantly share code, notes, and snippets.

@jennynz
Created June 22, 2022 02:05
Show Gist options
  • Save jennynz/08e34c4cbec6d7436bc26f1647b63f5e to your computer and use it in GitHub Desktop.
Save jennynz/08e34c4cbec6d7436bc26f1647b63f5e to your computer and use it in GitHub Desktop.
Filtering out bots from GitHub data
# Exact account names of bots commonly seen in GitHub data.
# Names that is_bot already catches through its substring checks are not
# required here, but listing them as well does no harm.
GITHUB_BOTS = [
    # Deployment, project-management and code-quality integrations
    'netlify',
    'linear-app',
    'codeclimate',
    # Dependency updaters and CI
    'renovate',
    'renovate-approve',
    'github-actions',
    'vercel',
    'googlebot',
    'codesandbox-ci',
    'sizebot',
    # TensorFlow / Google automation accounts
    'tensorflow-jenkins',
    'tensorflowbutler',
    'google-ml-butler',
    'google-cla',
    # Coverage reporting
    'coveralls',
]
import pandas as pd
from typing import Optional
from bots import GITHUB_BOTS
def is_bot(
    row: pd.Series,
    author_col: str,
    bot_col: Optional[str] = None,
    bot_col_val: Optional[str] = None,
) -> bool:
    """Decide whether a row of GitHub data was authored by a bot.

    The explicit bot column, when given, is checked first: the row is a bot
    if ``row[bot_col]`` equals ``bot_col_val`` (e.g.
    ``pr_author_typename == 'Bot'``). Otherwise the author name decides: it
    is a bot if it contains ``'-bot'``, ``'[bot]'`` or ``'dependabot'``, or
    if it exactly matches an entry in ``GITHUB_BOTS``.

    Args:
        row: The record to inspect; indexed by column name.
        author_col: Name of the column holding the author's name.
        bot_col: Optional name of a column that flags bots explicitly.
        bot_col_val: Value of ``bot_col`` that marks the row as a bot.

    Returns:
        True if the row looks like it came from a bot, False otherwise.
        A missing or non-string author (e.g. ``None`` or NaN) is treated
        as not a bot.
    """
    # Check if the bot column actually says it's a bot
    # e.g. pr_author_typename == 'Bot'
    if bot_col is not None:
        flag = row[bot_col]
        # The None guard stops an empty flag from matching the default
        # bot_col_val of None.
        if flag is not None and flag == bot_col_val:
            return True

    # If the bot column doesn't hold any evidence, rely on the author name
    author = row[author_col]
    if not isinstance(author, str):
        # Missing authors show up as None/NaN; substring tests on those
        # would raise TypeError, so report "no evidence of a bot" instead.
        return False

    return (
        any(marker in author for marker in ('-bot', '[bot]', 'dependabot'))
        or author in GITHUB_BOTS
    )
import pandas as pd
from is_bot import is_bot
def test_is_bot_checks_author_col_for_mention_of_bot():
    """Author names containing '-bot' or '[bot]' are reported as bots."""
    botlike_authors = (
        'bertie-bot',
        'bertie-bott',
        'bertie [bot]',
        'bertie[bot]',
        'facebook-github-bot',
    )
    for name in botlike_authors:
        row = pd.Series({'pr_author': name})
        assert is_bot(row, author_col='pr_author')
def test_is_bot_catches_all_types_of_dependabot():
    """Every dependabot account variant is reported as a bot."""
    # The original asserted 'dependabot-preview' twice; the repeated case is
    # replaced with the '[bot]'-suffixed app login so three distinct
    # variants are exercised.
    dependabot_variants = (
        'dependabot',
        'dependabot-preview',
        'dependabot[bot]',
    )
    for name in dependabot_variants:
        row = pd.Series({'pr_author': name})
        assert is_bot(row, author_col='pr_author')
def test_is_bot_catches_specific_bot_names():
    """Each explicitly listed bot account name is reported as a bot."""
    # Spelled out here rather than imported, so that a name accidentally
    # dropped from the bot list makes this test fail.
    known_bots = (
        'netlify',
        'linear-app',
        'codeclimate',
        'renovate',
        'renovate-approve',
        'github-actions',
        'vercel',
        'googlebot',
        'codesandbox-ci',
        'sizebot',
        'tensorflow-jenkins',
        'tensorflowbutler',
        # Was misspelled 'google-ml-buter', which matches no rule.
        'google-ml-butler',
        'google-cla',
        'coveralls',
    )
    for name in known_bots:
        row = pd.Series({'pr_author': name})
        assert is_bot(row, author_col='pr_author')
def test_is_bot_does_not_filter_out_incidental_botlike_substrings():
    """Names that merely resemble bot names are not reported as bots."""
    human_authors = (
        'hannah abbott',
        'i-like-to-renovate',
    )
    for name in human_authors:
        row = pd.Series({'pr_author': name})
        assert not is_bot(row, author_col='pr_author')
def test_is_bot_prioritises_bot_col():
    """The explicit bot column is consulted before the author name."""
    bot_col_kwargs = {
        'author_col': 'pr_author',
        'bot_col': 'pr_author_typename',
        'bot_col_val': 'Bot',
    }

    # bot_col_val overrides the fact that the author name is not bot-like
    assert is_bot(
        pd.Series(
            {'pr_author': 'human being sounding name', 'pr_author_typename': 'Bot'}
        ),
        **bot_col_kwargs,
    )

    # bot_col_val is not correct and the name is not bot-like either
    assert not is_bot(
        pd.Series(
            {'pr_author': 'human being sounding name', 'pr_author_typename': 'User'}
        ),
        **bot_col_kwargs,
    )

    # A non-matching bot column is not a veto: the author name still counts.
    # (Replaces a second, identical copy of the first assertion.)
    assert is_bot(
        pd.Series({'pr_author': 'renovate[bot]', 'pr_author_typename': 'User'}),
        **bot_col_kwargs,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment