# mys721tx/admin_actions.py

## admin_actions.py
# %%
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import umap

# %%
# Location of the admin-statistics export (tab-separated values).
file_path = "adminstats-zh.wikipedia.org-2023-02-21-2024-02-20.tsv"

# Load the table, drop the "#" and "总计" ("total") columns, and index the
# remaining columns by "用户名" ("username") in one chained expression.
data = (
    pd.read_csv(file_path, sep="\t")
    .drop(columns=["#", "总计"])
    .set_index("用户名")
)

# Show the resulting table.
print(data)

# %%
# Total-count normalization: divide every column by its own sum and rescale.
# `data.sum()` sums down each column, so the scaling is per column.
# NOTE(review): 10e4 equals 1e5, not 1e4 - confirm the intended scale factor.
data_normalized = data / data.sum() * 10e4
# Log-transform the *normalized* values. log1p must be applied to
# data_normalized rather than data, otherwise the normalization above is
# silently discarded.
data_normalized = np.log1p(data_normalized)

# Project the normalized table onto its first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(data_normalized)

# Wrap the projected coordinates in a DataFrame (default integer index).
principalDf = pd.DataFrame(data=principalComponents, columns=["PC1", "PC2"])

# Component loadings: one row per original column, one column per principal
# component. These are the entries of pca.components_ (the principal axes),
# not correlation coefficients, despite the variable name.
correlation_matrix = pd.DataFrame(
    data=pca.components_, columns=data.columns, index=["PC1", "PC2"]
).transpose()

# Show the loadings table.
print(correlation_matrix)

# %%
# Scatter plot of every row in the PC1/PC2 plane.
plt.figure(figsize=(10, 10))
pc1, pc2 = principalDf["PC1"], principalDf["PC2"]
plt.scatter(pc1, pc2)

# Label each point with the matching entry of data.index; principalDf rows
# are in the same order as the rows of data.
for x, y, username in zip(pc1, pc2, data.index):
    plt.text(x, y, username)

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("2 component PCA")
plt.show()

# %%
# No further transform is applied in this cell: data_normalized is reused
# exactly as the earlier cell left it.

# Embed the normalized table in two dimensions with UMAP.
reducer = umap.UMAP(n_components=2, n_neighbors=6)
embedding = reducer.fit_transform(data_normalized)

# Wrap the embedding in a DataFrame (default integer index).
embeddingDf = pd.DataFrame(embedding, columns=["UMAP1", "UMAP2"])

# Scatter plot of the two UMAP coordinates.
plt.figure(figsize=(10, 10))
umap_x, umap_y = embeddingDf["UMAP1"], embeddingDf["UMAP2"]
plt.scatter(umap_x, umap_y)

# Label each point with the matching entry of data.index.
for x, y, username in zip(umap_x, umap_y, data.index):
    plt.text(x, y, username)

plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.title("2D UMAP")
plt.show()

# %%
	# %%
	import pandas as pd
	import numpy as np

	from sklearn.decomposition import PCA
	import matplotlib.pyplot as plt
	import umap

	# %%
	# NOTE(review): this tab-indented section repeats the cells that appear
	# earlier in the file at column 0 - confirm whether it should be kept.
	# Location of the admin-statistics export (tab-separated values).
	file_path = "adminstats-zh.wikipedia.org-2023-02-21-2024-02-20.tsv"

	# Load the table, drop the "#" and "总计" ("total") columns, and index the
	# remaining columns by "用户名" ("username") in one chained expression.
	data = (
		pd.read_csv(file_path, sep="\t")
		.drop(columns=["#", "总计"])
		.set_index("用户名")
	)

	# Show the resulting table.
	print(data)

	# %%
	# Total-count normalization: divide every column by its own sum and rescale.
	# `data.sum()` sums down each column, so the scaling is per column.
	# NOTE(review): 10e4 equals 1e5, not 1e4 - confirm the intended scale factor.
	data_normalized = data / data.sum() * 10e4
	# Log-transform the *normalized* values. log1p must be applied to
	# data_normalized rather than data, otherwise the normalization above is
	# silently discarded.
	data_normalized = np.log1p(data_normalized)

	# Project the normalized table onto its first two principal components.
	pca = PCA(n_components=2)
	principalComponents = pca.fit_transform(data_normalized)

	# Wrap the projected coordinates in a DataFrame (default integer index).
	principalDf = pd.DataFrame(data=principalComponents, columns=["PC1", "PC2"])

	# Component loadings: one row per original column, one column per principal
	# component. These are the entries of pca.components_ (the principal axes),
	# not correlation coefficients, despite the variable name.
	correlation_matrix = pd.DataFrame(
		data=pca.components_, columns=data.columns, index=["PC1", "PC2"]
	).transpose()

	# Show the loadings table.
	print(correlation_matrix)

	# %%
	# Scatter plot of every row in the PC1/PC2 plane.
	plt.figure(figsize=(10, 10))
	pc1, pc2 = principalDf["PC1"], principalDf["PC2"]
	plt.scatter(pc1, pc2)

	# Label each point with the matching entry of data.index. The loop body
	# must be indented one level deeper than the `for` line to parse.
	for x, y, username in zip(pc1, pc2, data.index):
		plt.text(x, y, username)

	plt.xlabel("PC1")
	plt.ylabel("PC2")
	plt.title("2 component PCA")
	plt.show()

	# %%
	# No further transform is applied in this cell: data_normalized is reused
	# exactly as the earlier cell left it.

	# Embed the normalized table in two dimensions with UMAP.
	reducer = umap.UMAP(n_neighbors=6, n_components=2)
	embedding = reducer.fit_transform(data_normalized)

	# Wrap the embedding in a DataFrame (default integer index).
	embeddingDf = pd.DataFrame(data=embedding, columns=["UMAP1", "UMAP2"])

	# Scatter plot of the two UMAP coordinates.
	plt.figure(figsize=(10, 10))
	umap_x, umap_y = embeddingDf["UMAP1"], embeddingDf["UMAP2"]
	plt.scatter(umap_x, umap_y)

	# Label each point with the matching entry of data.index. The loop body
	# must be indented one level deeper than the `for` line to parse.
	for x, y, username in zip(umap_x, umap_y, data.index):
		plt.text(x, y, username)

	plt.xlabel("UMAP1")
	plt.ylabel("UMAP2")
	plt.title("2D UMAP")
	plt.show()

	# %%