Skip to content

Instantly share code, notes, and snippets.

@drew2a
Last active January 10, 2024 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drew2a/b05141a13c8d0c85c041714bba44b2d3 to your computer and use it in GitHub Desktop.
Save drew2a/b05141a13c8d0c85c041714bba44b2d3 to your computer and use it in GitHub Desktop.
Plot contributors for repository history
from collections import defaultdict
from datetime import datetime, timedelta
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from git import Repo
from matplotlib.ticker import MaxNLocator
# Function to read and parse the .mailmap file
def parse_mailmap(repo_path):
    """
    Read ``<repo_path>/.mailmap`` and build an e-mail -> canonical name mapping.

    Every ``name <email>`` entry on a line is mapped to the first non-empty
    name found on that line. Blank lines and lines starting with ``#`` are
    ignored, as is any text that follows the last ``<email>`` on a line.
    Lines that carry no name at all are skipped so that callers can fall
    back to the name recorded in the commit instead of an empty string.

    Args:
        repo_path (str): Path of the directory expected to contain the .mailmap file.

    Returns:
        dict: Maps each e-mail address to its canonical contributor name.
            Empty when the .mailmap file does not exist.
    """
    import os  # function-scope import keeps this block self-contained

    mailmap = {}
    try:
        # Explicit encoding: contributor names are frequently non-ASCII
        with open(os.path.join(repo_path, '.mailmap'), 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                entries = _split_mailmap_entries(line)
                canonical_name = next((name for name, _ in entries if name), '')
                if not canonical_name:
                    continue
                for _, email in entries:
                    mailmap[email] = canonical_name
    except FileNotFoundError:
        print(".mailmap file not found, continuing without it")
    return mailmap


# Split one .mailmap line into its (name, email) pairs, in order of appearance
def _split_mailmap_entries(line):
    entries = []
    rest = line
    while '<' in rest:
        name, _, rest = rest.partition('<')
        email, closing, rest = rest.partition('>')
        if not closing:
            # Unterminated "<...": nothing reliable left to parse on this line
            break
        email = email.strip()
        if email:
            entries.append((name.strip(), email))
    return entries
# Function to process contributors' continuous activity periods
def process_activity_periods(contributors, window, granularity=7):
    """
    Group each contributor's commit dates into continuous activity periods.

    A continuous activity period is a run of commit dates in which the gap
    between any two consecutive dates does not exceed ``window``. Contributors
    with no commit dates are skipped and do not appear in the result.

    Args:
        contributors (dict): Maps contributor names to collections of commit dates.
        window (timedelta): The maximum allowed gap between consecutive commits
            for them to belong to the same activity period.
        granularity (int): The minimum number of days of a reported period.
            Shorter periods have their end date pushed out to this length.

    Returns:
        dict: Maps contributor names to lists of
            ``(start_date, adjusted_end_date, actual_length)`` tuples, where
            ``adjusted_end_date`` is ``start_date`` plus the (possibly extended)
            period length in days and ``actual_length`` is the number of days
            from the first to the last commit of the period, both included.
    """
    activity_periods = defaultdict(list)
    for contributor, dates in contributors.items():
        if not dates:
            # Nothing to group; avoids indexing into an empty list below
            continue
        sorted_dates = sorted(dates)
        start_date = last_date = sorted_dates[0]
        for commit_date in sorted_dates[1:]:
            if commit_date - last_date > window:
                # Gap too large: close the running period and start a new one
                activity_periods[contributor].append(
                    _build_activity_period(start_date, last_date, granularity))
                start_date = commit_date
            last_date = commit_date
        # Append the last period
        activity_periods[contributor].append(
            _build_activity_period(start_date, last_date, granularity))
    return activity_periods


# Build one (start, adjusted end, actual length) tuple, applying the granularity minimum
def _build_activity_period(start_date, last_date, granularity):
    actual_length = (last_date - start_date).days + 1
    period_length = max(actual_length, granularity)
    return start_date, start_date + timedelta(days=period_length), actual_length
# Function to collect contributors' commit dates and turn them into activity periods
def count_contributors(repo_path, branch, mailmap, exclusions, delta, window, granularity):
    """
    Collect the commit dates of every contributor on a branch and group them into activity periods.

    Args:
        repo_path (str): Path of the git repository to open.
        branch (str): Branch (revision) whose commits are iterated.
        mailmap (dict): Maps author e-mail addresses to canonical contributor names.
            Authors whose e-mail is not in the mapping are identified by the
            author name recorded in the commit.
        exclusions (list): Substrings; a contributor whose resolved name contains
            any of them is ignored.
        delta (timedelta): How far back from now to look. A falsy value means the
            whole history of the branch is analyzed.
        window (timedelta): Passed through to process_activity_periods.
        granularity (int): Passed through to process_activity_periods.

    Returns:
        The result of process_activity_periods for the collected commit dates.
    """
    repo = Repo(repo_path)
    contributors = defaultdict(set)
    if delta:
        since = datetime.now() - delta
        commits = list(repo.iter_commits(branch, since=since.strftime('%Y-%m-%d')))
    else:
        commits = list(repo.iter_commits(branch))
    total_commits = len(commits)
    print(f"Analyzing {total_commits} commits...")
    # With fewer than 10 commits the integer division yields 0, which would
    # make the modulo below raise ZeroDivisionError; report every commit instead.
    progress_step = max(1, total_commits // 10)
    # Iterating over commits in the specified branch
    for i, commit in enumerate(commits, start=1):
        commit_date = commit.committed_datetime.date()
        # Using mailmap to resolve duplicate contributors
        contributor = mailmap.get(commit.author.email, commit.author.name)
        # Check if the contributor is not in the exclusions list
        if not any(exclusion in contributor for exclusion in exclusions):
            contributors[contributor].add(commit_date)
        # Print progress every 10% of the total commits
        if i % progress_step == 0 or i == total_commits:
            print(f"Processed {i}/{total_commits} commits ({(i / total_commits) * 100:.1f}%)")
    return process_activity_periods(contributors, window, granularity)
# Function to plot the data
def plot_contributors(activity_periods, less_than_year=False):
    """
    Draw one horizontal bar per activity period, one row per contributor, and show the figure.

    Each contributor and each of their periods is also printed to stdout.

    Args:
        activity_periods (dict): Maps contributor names to lists of
            (start_date, end_date, duration) tuples.
        less_than_year (bool): When True the x-axis gets monthly ticks formatted
            as year-month; otherwise one tick is placed at the start of every
            year covered by the data.
    """
    all_periods = [period for periods in activity_periods.values() for period in periods]
    if not all_periods:
        # min()/max() below would raise on empty input
        print("No activity periods to plot")
        return

    plt.figure(figsize=(10, 8))
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib releases removed
    color_map = plt.get_cmap('hsv', len(activity_periods) + 1)
    y_labels = []
    y_ticks = []
    min_date = min(period[0] for period in all_periods)
    max_date = max(period[1] for period in all_periods)

    # Drawing bars for activity periods
    for i, (contributor, periods) in enumerate(activity_periods.items()):
        print(f"Contributor: {contributor}")
        y_labels.append(contributor)
        y_ticks.append(i)
        for start_date, end_date, duration in periods:
            print(f" Activity period: {start_date} to {end_date}. Duration: {duration}")
            plt.barh(i, end_date - start_date, left=start_date, height=0.4, color=color_map(i))
        # Adding a horizontal line for better readability (one per row is enough)
        plt.axhline(i, color='gray', linestyle='--', linewidth=0.5)

    # Adding gray vertical lines at the start of each year and labeling them
    year_positions = [datetime(year, 1, 1) for year in range(min_date.year, max_date.year + 1)]
    for year_start in year_positions:
        plt.axvline(year_start, color='gray', linestyle='-', linewidth=0.8)

    plt.yticks(y_ticks, y_labels)
    plt.xlabel('Date')
    plt.ylabel('Contributors')
    plt.title('Continuous Contribution Periods of Contributors')
    axis = plt.gca().xaxis
    if less_than_year:
        # Monthly ticks, labeled with year and month
        axis.set_major_locator(mdates.MonthLocator(interval=1))
        axis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    else:
        axis.set_major_locator(mdates.YearLocator())
        axis.set_major_formatter(mdates.DateFormatter('%Y'))
        plt.xticks(year_positions, [year_start.strftime('%Y') for year_start in year_positions])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
def aggregate_contributors_by_time(activity_periods):
    """
    Count how many contributors are active at every period boundary.

    The boundaries are all distinct start and end dates found in
    ``activity_periods``. A contributor counts as active on a boundary when
    one of their periods satisfies ``start_date <= boundary < end_date``;
    the end date itself is treated as no longer active, so a boundary on
    which periods only end gets those contributors removed from the count,
    however many periods end there.

    Args:
        activity_periods (dict): Maps contributor names to lists of
            (start_date, end_date, duration) tuples.

    Returns:
        list: (date, active_contributor_count) tuples sorted by date. Empty
            when there are no periods.
    """
    boundaries = sorted({
        boundary
        for periods in activity_periods.values()
        for start_date, end_date, _ in periods
        for boundary in (start_date, end_date)
    })
    return [
        (
            boundary,
            # Each contributor is counted at most once per boundary
            sum(
                any(start_date <= boundary < end_date for start_date, end_date, _ in periods)
                for periods in activity_periods.values()
            ),
        )
        for boundary in boundaries
    ]
def plot_contributor_count_over_time(aggregated_data):
    """
    Plots the number of contributors over time using a bar chart and shows the figure.

    One bar is drawn per pair of consecutive dates: it starts at the earlier
    date, spans the number of days up to the later date and has the height of
    the count recorded for the earlier date. The count of the last date is
    therefore not drawn.

    Args:
        aggregated_data: A list of tuples, where each tuple contains a date and
            the corresponding number of contributors on that date.
    """
    if not aggregated_data:
        # zip(*[]) yields nothing, so the unpacking below would raise
        print("No contributor data to plot")
        return

    dates, counts = zip(*aggregated_data)  # Unzip the date and count tuples
    plt.figure(figsize=(20, 6))
    # One plt.bar call per interval, as before, so the bars keep their individual styling
    for start_date, end_date, count in zip(dates, dates[1:], counts):
        width = (end_date - start_date).days
        plt.bar(start_date, count, width=width, align='edge', edgecolor='black')
    plt.gca().xaxis.set_major_locator(mdates.YearLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    # Restrict y-axis ticks to integer values
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Date')
    plt.ylabel('Number of Contributors')
    plt.title('Number of Contributors Over Time')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
# Repository to analyze and the branch whose history is walked
repo_path = 'tribler'
branch = 'main'
# Contributors whose name contains any of these substrings are left out
exclusions = ["dependabot", "snyk", "tribler-ci"]

# Load the e-mail -> canonical name aliases from the repository's .mailmap
mailmap = parse_mailmap(repo_path)

# Build the continuous contribution periods of every contributor
activity_periods = count_contributors(
    repo_path,
    branch,
    mailmap,
    exclusions,
    delta=timedelta(days=365 * 100),
    window=timedelta(days=90),
    granularity=30,
)

# Keep only contributors whose periods add up to at least this many days
contribution_duration = 90
activity_periods = {
    contributor: periods
    for contributor, periods in activity_periods.items()
    if sum(length for _start, _end, length in periods) >= contribution_duration
}

# Draw the per-contributor activity chart
plot_contributors(activity_periods, less_than_year=False)

# Turn the periods into (date, contributor count) pairs and chart them
aggregated_data = aggregate_contributors_by_time(activity_periods)
plot_contributor_count_over_time(aggregated_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment