Skip to content

Instantly share code, notes, and snippets.

@mys721tx
Created June 11, 2024 22:16
Show Gist options
  • Save mys721tx/60a283a76a5885442ebd9bc28ed7f64d to your computer and use it in GitHub Desktop.
Save mys721tx/60a283a76a5885442ebd9bc28ed7f64d to your computer and use it in GitHub Desktop.
Projecting Wikipedia admin actions to 2d
# %%
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import umap
# %%
# Location of the tab-separated admin statistics export.
file_path = "adminstats-zh.wikipedia.org-2023-02-21-2024-02-20.tsv"
# Load the table in one chained expression:
#   1. parse the tab-separated file,
#   2. drop the "#" column and the "总计" (total) column,
#   3. index the rows by the "用户名" (username) column.
data = (
    pd.read_csv(file_path, sep="\t")
    .drop(columns=["#", "总计"])
    .set_index("用户名")
)
# Show the resulting table for a quick sanity check.
print(data)
# %%
# Total-count normalization: rescale every column so that it sums to
# `target_sum`.
# NOTE(review): data.sum() totals each *column*, so the scaling is applied per
# column rather than per row (per user) -- confirm this is the intended axis.
target_sum = 10e4  # NOTE(review): 10e4 == 1e5; confirm 1e4 was not intended
# Substitute 1 for zero totals so that an all-zero column stays 0 instead of
# turning into NaN (0 / 0), which PCA would reject.
column_totals = data.sum().replace(0, 1)
data_normalized = data / column_totals * target_sum
# Log-transform the *normalized* values. Applying log1p to the raw `data`
# here would silently discard the scaling computed above.
data_normalized = np.log1p(data_normalized)
# Run PCA, keeping the first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(data_normalized)
# Convert the projected coordinates to a dataframe. The default integer index
# (0..n-1, same row order as `data`) is kept on purpose: later cells address
# rows by position.
principalDf = pd.DataFrame(data=principalComponents, columns=["PC1", "PC2"])
# Tabulate how each original column contributes to each principal component.
# NOTE: pca.components_ holds the principal axes (loadings), one row per
# component; these are weights, not correlation coefficients, despite the
# variable name. The table is transposed so each original column is a row.
correlation_matrix = pd.DataFrame(
    data=pca.components_, columns=data.columns, index=["PC1", "PC2"]
).transpose()
# Print the loadings table.
print(correlation_matrix)
# %%
# Scatter plot of the first two principal components, one point per row of
# `data`.
plt.figure(figsize=(10, 10))
plt.scatter(principalDf["PC1"], principalDf["PC2"])
# Label every point with its username. `principalDf` has the same row order
# as `data`, so the index and the coordinate columns are walked in lockstep
# instead of doing a positional .loc lookup per row.
for username, pc1, pc2 in zip(data.index, principalDf["PC1"], principalDf["PC2"]):
    plt.text(pc1, pc2, username)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("2 component PCA")
plt.show()
# %%
# UMAP is fitted on `data_normalized`, the log1p-transformed table prepared
# in the PCA cell; no further normalization is applied here.
# Run UMAP down to two dimensions using 6 nearest neighbours.
reducer = umap.UMAP(n_neighbors=6, n_components=2)
embedding = reducer.fit_transform(data_normalized)
# Convert the embedding to a dataframe (default integer index, same row order
# as `data`).
embeddingDf = pd.DataFrame(data=embedding, columns=["UMAP1", "UMAP2"])
# Scatter plot of the two UMAP dimensions, one point per row of `data`.
plt.figure(figsize=(10, 10))
plt.scatter(embeddingDf["UMAP1"], embeddingDf["UMAP2"])
# Label every point with its username; the index and the coordinate columns
# are walked in lockstep since both share the row order of `data`.
for username, umap1, umap2 in zip(
    data.index, embeddingDf["UMAP1"], embeddingDf["UMAP2"]
):
    plt.text(umap1, umap2, username)
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.title("2D UMAP")
plt.show()
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment