Last active
January 10, 2024 10:40
-
-
Save drew2a/b05141a13c8d0c85c041714bba44b2d3 to your computer and use it in GitHub Desktop.
Plot contributors for repository history
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from datetime import datetime, timedelta | |
import matplotlib.dates as mdates | |
import matplotlib.pyplot as plt | |
from git import Repo | |
from matplotlib.ticker import MaxNLocator | |
# Function to read and parse the .mailmap file | |
def parse_mailmap(repo_path):
    """
    Reads the repository's .mailmap file and builds an e-mail -> canonical name lookup.

    Every non-blank, non-comment line is scanned for ``Name <email>`` pairs.
    The first non-empty name found on a line is taken as the canonical name,
    and each e-mail address seen from that point on in the same line is
    mapped to it. Lines that contain no name at all (e-mail-only mappings)
    contribute nothing, so no address is ever mapped to an empty name.

    Args:
        repo_path (str): Path to the repository root that may contain a .mailmap file.

    Returns:
        dict: A dictionary where keys are e-mail addresses and values are canonical contributor names.
        The dictionary is empty when the .mailmap file does not exist.
    """
    import os  # function-scope import so the block does not depend on the file's import list

    mailmap = {}
    try:
        # .mailmap files frequently contain non-ASCII names; do not rely on the platform default encoding
        with open(os.path.join(repo_path, '.mailmap'), 'r', encoding='utf-8') as file:
            for line in file:
                entry = line.strip()
                # Blank lines and '#' comment lines carry no mapping
                if not entry or entry.startswith('#'):
                    continue
                canonical_name = ''
                for part in entry.split('>'):
                    name, bracket, email = part.partition('<')
                    if not bracket:
                        # Leftover text without an address (e.g. whatever follows the last '>')
                        continue
                    name = name.strip()
                    email = email.strip()
                    if not canonical_name:
                        canonical_name = name
                    if canonical_name and email:
                        mailmap[email] = canonical_name
    except FileNotFoundError:
        print(".mailmap file not found, continuing without it")
    return mailmap
# Function to process contributors' continuous activity periods | |
def process_activity_periods(contributors, window, granularity=7):
    """
    Processes the activity periods of contributors based on commit dates with a specified granularity.

    This function groups the commit dates of each contributor into continuous
    activity periods. A continuous activity period is defined as a time
    range where the gap between any two consecutive commits does not exceed
    the specified 'window' duration.

    Args:
        contributors (dict): A dictionary where keys are contributor names and values are collections of commit dates.
            Contributors with no dates are skipped.
        window (timedelta): The maximum allowed gap between consecutive commits to be considered part of the same activity period.
        granularity (int): The minimum number of days to consider for an activity period. If an activity period is shorter than this, it is extended to match the granularity.

    Returns:
        dict: A dictionary where keys are contributor names and values are lists of tuples. Each tuple represents an
        activity period as (start_date, adjusted_end_date, actual_length), where adjusted_end_date is start_date plus
        max(actual_length, granularity) days and actual_length is the number of days from the first to the last
        commit of the period, inclusive.
    """
    activity_periods = defaultdict(list)

    def build_period(start_date, last_date):
        # Adjust the period length based on the granularity
        actual_length = (last_date - start_date).days + 1
        period_length = max(actual_length, granularity)
        return start_date, start_date + timedelta(days=period_length), actual_length

    for contributor, dates in contributors.items():
        if not dates:
            # Nothing to group; avoids indexing into an empty list below
            continue
        sorted_dates = sorted(dates)
        start_date = last_date = sorted_dates[0]
        for date in sorted_dates[1:]:
            if date - last_date > window:
                # The gap is too large: close the current period and open a new one
                activity_periods[contributor].append(build_period(start_date, last_date))
                start_date = date
            last_date = date
        # Append the last period
        activity_periods[contributor].append(build_period(start_date, last_date))
    return activity_periods
# Function to count contributors and their activity dates with a minimum contribution filter | |
def count_contributors(repo_path, branch, mailmap, exclusions, delta, window, granularity):
    """
    Collects the commit dates of each contributor on a branch and groups them into activity periods.

    Args:
        repo_path (str): Path passed to ``Repo`` to open the repository.
        branch (str): Branch (revision) whose commits are iterated.
        mailmap (dict): A dictionary mapping author e-mail addresses to canonical names. Authors whose e-mail is not
            in the dictionary are identified by the author name recorded in the commit.
        exclusions (list): Substrings; a contributor whose resolved name contains any of them is ignored.
        delta (timedelta): If truthy, only commits since (now - delta) are requested; otherwise the whole history is used.
        window (timedelta): Forwarded to process_activity_periods.
        granularity (int): Forwarded to process_activity_periods.

    Returns:
        dict: The result of process_activity_periods for the collected contributor dates.
    """
    repo = Repo(repo_path)
    contributors = defaultdict(set)
    if delta:
        since = datetime.now() - delta
        commits = list(repo.iter_commits(branch, since=since.strftime('%Y-%m-%d')))
    else:
        commits = list(repo.iter_commits(branch))
    total_commits = len(commits)
    print(f"Analyzing {total_commits} commits...")
    # Progress is reported every 10% of the commits. The step is clamped to 1 because
    # total_commits // 10 is 0 for fewer than 10 commits, which would make the modulo
    # below raise ZeroDivisionError.
    progress_step = max(1, total_commits // 10)
    # Iterating over commits in the specified branch
    for i, commit in enumerate(commits, start=1):
        commit_date = commit.committed_datetime.date()
        # Using mailmap to resolve duplicate contributors
        contributor = mailmap.get(commit.author.email, commit.author.name)
        # Check if the contributor is not in the exclusions list
        if not any(exclusion in contributor for exclusion in exclusions):
            contributors[contributor].add(commit_date)
        if i % progress_step == 0 or i == total_commits:
            print(f"Processed {i}/{total_commits} commits ({(i / total_commits) * 100:.1f}%)")
    return process_activity_periods(contributors, window, granularity)
# Function to plot the data | |
def plot_contributors(activity_periods, less_than_year=False):
    """
    Plots one horizontal bar per activity period, with one row per contributor.

    Each contributor and each of their periods is also printed to stdout.

    Args:
        activity_periods (dict): A dictionary where keys are contributor names and values are lists of
            (start_date, end_date, duration) tuples.
        less_than_year (bool): If True, the x-axis gets monthly ticks formatted as '%Y-%m'. Otherwise it gets one
            tick, labelled with the year, at January 1st of every year covered by the data.
    """
    all_periods = [period for periods in activity_periods.values() for period in periods]
    if not all_periods:
        # min()/max() below would raise on an empty sequence, and there is nothing to draw anyway
        print("No activity periods to plot")
        return
    plt.figure(figsize=(10, 8))
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and removed in recent Matplotlib releases
    color_map = plt.get_cmap('hsv', len(activity_periods) + 1)
    y_labels = []
    y_ticks = []
    min_date = min(period[0] for period in all_periods)
    max_date = max(period[1] for period in all_periods)
    # Drawing bars for activity periods
    for i, (contributor, periods) in enumerate(activity_periods.items()):
        print(f"Contributor: {contributor}")
        y_labels.append(contributor)
        y_ticks.append(i)
        for start_date, end_date, duration in periods:
            print(f" Activity period: {start_date} to {end_date}. Duration: {duration}")
            plt.barh(i, end_date - start_date, left=start_date, height=0.4, color=color_map(i))
        # Adding horizontal lines for better readability
        plt.axhline(i, color='gray', linestyle='--', linewidth=0.5)
    # Adding gray vertical lines at the start of each year and remembering their positions
    year_positions = []
    for year in range(min_date.year, max_date.year + 1):
        year_start = datetime(year, 1, 1)
        plt.axvline(year_start, color='gray', linestyle='-', linewidth=0.8)
        year_positions.append(year_start)
    plt.yticks(y_ticks, y_labels)
    plt.xlabel('Date')
    plt.ylabel('Contributors')
    plt.title('Continuous Contribution Periods of Contributors')
    if less_than_year:
        # Monthly ticks for short time ranges
        plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    else:
        plt.gca().xaxis.set_major_locator(mdates.YearLocator())
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        # Explicit ticks at the year lines drawn above, labelled with the year
        plt.xticks(year_positions, [year_start.strftime('%Y') for year_start in year_positions])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
def aggregate_contributors_by_time(activity_periods):
    """
    Counts how many activity periods are running at every period boundary.

    Each period is treated as the half-open range [start_date, end_date): it is
    counted from its start date and no longer counted at its end date. The
    count reported for a date therefore applies to the interval that begins
    at that date, and any number of periods ending on the same date are all
    removed from the count.

    Args:
        activity_periods (dict): A dictionary where keys are contributor names and values are lists of
            (start_date, end_date, duration) tuples. Assumes start_date <= end_date for every period.

    Returns:
        list: A list of (date, count) tuples sorted by date, with one entry per distinct start or end date.
    """
    # +1 at every period start, -1 at every period end; a running sum then gives the active count
    deltas = defaultdict(int)
    for periods in activity_periods.values():
        for start_date, end_date, _ in periods:
            deltas[start_date] += 1
            deltas[end_date] -= 1
    contributor_count_by_date = []
    active = 0
    for date in sorted(deltas):
        active += deltas[date]
        contributor_count_by_date.append((date, active))
    return contributor_count_by_date
def plot_contributor_count_over_time(aggregated_data):
    """
    Plots the number of contributors over time using a bar chart.

    One bar is drawn per pair of consecutive dates: it starts at the earlier
    date, spans the number of days up to the later date, and its height is
    the count recorded for the earlier date. The count of the last entry is
    not drawn.

    Args:
        aggregated_data: A list of tuples, where each tuple contains a date and the corresponding number of
            contributors on that date. Nothing is plotted when the list is empty.
    """
    if not aggregated_data:
        # zip(*[]) yields nothing, so the unpacking below would raise ValueError
        print("No contributor data to plot")
        return
    dates, counts = zip(*aggregated_data)  # Unzip the date and count tuples
    plt.figure(figsize=(20, 6))
    for start_date, end_date, count in zip(dates, dates[1:], counts):
        width = (end_date - start_date).days
        plt.bar(start_date, count, width=width, align='edge', edgecolor='black')
    plt.gca().xaxis.set_major_locator(mdates.YearLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    # Restrict y-axis ticks to integer values
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Date')
    plt.ylabel('Number of Contributors')
    plt.title('Number of Contributors Over Time')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
# Repository to analyze, the branch to walk, and name fragments of contributors to ignore
repo_path = 'tribler'
branch = 'main'
exclusions = ["dependabot", "snyk", "tribler-ci"]

# Load contributor aliases from the repository's .mailmap file
mailmap = parse_mailmap(repo_path)

# Collect the continuous contribution periods of every contributor
activity_periods = count_contributors(
    repo_path,
    branch,
    mailmap,
    exclusions,
    delta=timedelta(days=365 * 100),
    window=timedelta(days=90),
    granularity=30,
)

# Keep only contributors whose period durations add up to at least this many days
contribution_duration = 90
activity_periods = {
    name: periods
    for name, periods in activity_periods.items()
    if sum(length for _, _, length in periods) >= contribution_duration
}

# Draw the per-contributor activity chart
plot_contributors(activity_periods, less_than_year=False)

# Turn the periods into contributor counts per date and draw them
aggregated_data = aggregate_contributors_by_time(activity_periods)
plot_contributor_count_over_time(aggregated_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment