Skip to content

Instantly share code, notes, and snippets.

@karrtikr
Created April 4, 2023 07:11
Show Gist options
  • Save karrtikr/a2e124b4c7f395b1b4e17ab7ee6bd1ab to your computer and use it in GitHub Desktop.
"""Summarize why 'info-needed' issues were closed in microsoft/vscode-python.

Fetches all closed issues labelled 'info-needed' via the GitHub CLI
('gh'), collects the second-last triager comment on each issue, and
prints the 10 most frequent words across those comments, excluding stop
words, nouns, and pronouns.

Requires: the 'gh' CLI authenticated and on PATH, nltk (with the
'stopwords' corpus and a POS tagger downloaded), and pyspellchecker.
"""
import json
import subprocess
import sys
from collections import Counter

from nltk import pos_tag
from nltk.corpus import stopwords
from spellchecker import SpellChecker

# List of triagers whose comments will be included in the analysis
TRIAGERS = ["karrtikr", "karthiknadig", "paulacamargo25", "eleanorjboyd", "brettcannon", "ericsnowcurrently", "DonJayamanne"]

REPO = "microsoft/vscode-python"

# Part-of-speech tags to drop: singular/plural/proper nouns and
# (possessive) pronouns.
IGNORED_POS = {"NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"}

# Get the list of closed 'info-needed' issue numbers.
result = subprocess.run(
    ["gh", "issue", "list", "--state=closed", "--label=info-needed",
     "-R", REPO, "--json", "number"],
    capture_output=True,
)
if result.returncode != 0:
    print(result.stderr.decode())
    sys.exit(1)
issues = json.loads(result.stdout.decode())

# Loop through the issue numbers and retrieve the second last comment for
# each issue.
all_comments = []
for issue in issues:
    result = subprocess.run(
        ["gh", "issue", "view", str(issue["number"]), "--comments",
         "-R", REPO, "--json", "comments"],
        capture_output=True,
    )
    if result.returncode != 0:
        print(result.stderr.decode())
        sys.exit(1)
    data = json.loads(result.stdout.decode())
    if "comments" not in data:
        print(f"Error: comments not found in JSON output for issue {issue['number']}")
        continue
    triager_comments = [c for c in data["comments"] if c["author"]["login"] in TRIAGERS]
    # The last triager comment is typically the closing note; the second-last
    # one is the interesting request for information — presumably; TODO confirm.
    if len(triager_comments) >= 2:
        comment = triager_comments[-2]
        print(comment["body"])
        print('--------------------')
        all_comments.append(comment["body"])

# Ignore stop words, nouns, and pronouns, and count the frequency of the
# remaining words in the second last comment across all issues.
stop_words = set(stopwords.words('english'))
spell = SpellChecker(language='en', distance=1)
all_words = []
for comment in all_comments:
    words = comment.split()
    if len(words) >= 2:
        # Keep alphabetic, correctly spelled words that are neither stop
        # words nor tagged with an ignored part of speech.
        # spell.unknown() returns the set of misspelled words; an empty
        # (falsy) result means the word is known.
        cleaned_words = [
            word for word, pos in pos_tag(words)
            if word not in stop_words
            and word.isalpha()
            and not spell.unknown([word])
            and pos not in IGNORED_POS
        ]
        all_words.extend(cleaned_words)

word_counts = Counter(all_words)

# Print the top 10 most frequent words
print("Top 10 most frequent words in the second last comment across all issues (excluding stop words, nouns, and pronouns):")
for word, count in word_counts.most_common(10):
    print(f"{word}: {count} occurrences")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment