@root2116
Created December 22, 2023 00:11
hiyotan illeism research
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose
import numpy as np
import matplotlib.dates as mdates


def TablePlot(df, outputPath, w, h, decimals=4):
    format_str = "{:." + str(decimals) + "f}"
    formatted_df = df.applymap(
        lambda x: format_str.format(x) if isinstance(x, float) else x
    )
    fig, ax = plt.subplots(figsize=(w, h))
    ax.axis("off")
    ax.table(
        cellText=formatted_df.values,
        colLabels=formatted_df.columns,
        loc="center",
        bbox=[0, 0, 1, 1],
    )
    plt.savefig(outputPath)


def kpss_test(timeseries):
    print("Results of KPSS Test:")
    result = kpss(timeseries, regression="c", nlags="auto")
    print("KPSS Statistic: %f" % result[0])
    print("p-value: %f" % result[1])
    print("Critical Values:")
    for key, value in result[3].items():
        print("\t%s: %.3f" % (key, value))
    return result[1]  # p-value


def adf_test(timeseries, reg="c"):
    result = adfuller(timeseries, regression=reg)
    # Print the results
    print("Results of Dickey-Fuller Test:")
    print("ADF Statistic: %f" % result[0])
    print("p-value: %f" % result[1])
    print("Critical Values:")
    for key, value in result[4].items():
        print("\t%s: %.3f" % (key, value))
    return result[1]  # p-value
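

# Added note (not in the original gist): ADF and KPSS test opposite null
# hypotheses (ADF: the series has a unit root; KPSS: the series is stationary),
# so their p-values are usually read together. Below is a minimal sketch of that
# reading, assuming a 5% significance level; the helper name and thresholds are
# illustrative and not part of the original analysis.
def stationarity_verdict(adf_p, kpss_p, alpha=0.05):
    adf_stationary = adf_p < alpha      # ADF rejects its unit-root null
    kpss_stationary = kpss_p >= alpha   # KPSS fails to reject its stationarity null
    if adf_stationary and kpss_stationary:
        return "stationary"
    if not adf_stationary and not kpss_stationary:
        return "non-stationary"
    if adf_stationary and not kpss_stationary:
        return "likely difference-stationary (tests disagree)"
    return "likely trend-stationary (tests disagree)"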


def analyze(df, title_suffix):
    df["datetime"] = pd.to_datetime(df["datetime"], format="%Y-%m-%dT%H:%M:%SZ")
    df["year_month"] = df["datetime"].dt.to_period("M")
    df["text_length"] = df["text"].str.replace(r"\s+", "", regex=True).str.len()
    monthly_data = df.groupby("year_month").agg(
        {"illeism": "sum", "first_person": "sum", "text_length": "sum"}
    )
    start_month = df["year_month"].min()
    end_month = df["year_month"].max()
    all_months = pd.period_range(start_month, end_month, freq="M")
    monthly_data = monthly_data.reset_index()
    # Create a DataFrame from all_months
    all_months_df = pd.DataFrame(all_months, columns=["year_month"])
    # Merge the complete range with the aggregated data
    merged_data = pd.merge(all_months_df, monthly_data, on="year_month", how="left")

    def fill_with_neighbors(row, df):
        if (
            pd.isna(row["illeism"])
            or pd.isna(row["first_person"])
            or pd.isna(row["text_length"])
        ):
            # Find the nearest non-NaN rows
            prev_non_nan = df.iloc[: row.name].dropna()
            next_non_nan = df.iloc[row.name + 1 :].dropna()
            # Identify indices of previous and next non-NaN rows
            prev_index = prev_non_nan.index.max() if not prev_non_nan.empty else None
            next_index = next_non_nan.index.min() if not next_non_nan.empty else None
            # Fetch previous and next rows
            prev_row = df.loc[prev_index] if prev_index is not None else None
            next_row = df.loc[next_index] if next_index is not None else None
            # Calculate average or use the nearest non-NaN row
            for column in ["illeism", "first_person", "text_length"]:
                if prev_row is not None and next_row is not None:
                    row[column] = (prev_row[column] + next_row[column]) / 2
                elif prev_row is not None:
                    row[column] = prev_row[column]
                elif next_row is not None:
                    row[column] = next_row[column]
        return row

    merged_data = merged_data.apply(
        lambda row: fill_with_neighbors(row, merged_data), axis=1
    )
    merged_data["illeism_per_1000"] = (
        1000 * merged_data["illeism"] / merged_data["text_length"]
    )
    merged_data["first_person_per_1000"] = (
        1000 * merged_data["first_person"] / merged_data["text_length"]
    )
    merged_data["illeism_ratio"] = np.where(
        (merged_data["illeism"] + merged_data["first_person"]) != 0,
        merged_data["illeism"] / (merged_data["illeism"] + merged_data["first_person"]),
        0,
    )
    merged_data["year_month_datetime"] = merged_data["year_month"].dt.to_timestamp()

    result = seasonal_decompose(
        merged_data["illeism_ratio"].dropna(), model="additive", period=12
    )
    # Prepare the figure with subplots
    fig, axes = plt.subplots(4, 1, figsize=(10, 8), sharex=True)
    # Plot the original data
    axes[0].plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_ratio"],
        label="Original",
    )
    axes[0].set_title("Original Data")
    # Plot the trend component
    axes[1].plot(merged_data["year_month_datetime"], result.trend, label="Trend")
    axes[1].set_title("Trend")
    # Plot the seasonal component
    axes[2].plot(merged_data["year_month_datetime"], result.seasonal, label="Seasonal")
    axes[2].set_title("Seasonal")
    # Plot the residual component
    axes[3].scatter(merged_data["year_month_datetime"], result.resid, label="Residual")
    axes[3].axhline(y=0, color="k", linestyle="--")
    axes[3].set_title("Residual")
    # Format the shared x-axis: automatic date ticks labelled as YYYY-MM
    for ax in axes:
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    # Rotate the labels for readability on the last subplot only (x-axis is shared)
    plt.setp(axes[-1].get_xticklabels(), rotation=45, ha="right")
fig.suptitle(f"Seasonal Decomposition for {title_suffix}")
fig.tight_layout(
rect=[0, 0.03, 1, 0.95]
) # Adjust the layout to make space for the suptitle
# Save the figure
plt.savefig(
"seasonal_decompose_for_" + title_suffix.lower().replace(" ", "_") + ".png"
)
plt.figure(figsize=(10, 6))
plt.plot(
merged_data["year_month_datetime"], merged_data["illeism"], label="Illeism"
)
plt.plot(
merged_data["year_month_datetime"],
merged_data["first_person"],
label="First Person",
)
plt.title(f"Monthly Total of Illeism and First Person for {title_suffix}")
plt.xlabel("Year-Month")
plt.ylabel("Total Count")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(
f"monthly_total_counts_for_{title_suffix.lower().replace(' ', '_')}.png"
)
    # Plot and save a line chart of counts per 1000 characters
    plt.figure(figsize=(10, 6))
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_per_1000"],
        label="Illeism per 1000 Characters",
    )
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["first_person_per_1000"],
        label="First Person per 1000 Characters",
    )
    plt.title(
        f"Monthly Illeism and First Person per 1000 Characters for {title_suffix}"
    )
    plt.xlabel("Year-Month")
    plt.ylabel("Count per 1000 Characters")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(
        f"monthly_counts_per_1000_for_{title_suffix.lower().replace(' ', '_')}.png"
    )

    plt.figure(figsize=(10, 6))
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_ratio"],
        label="Illeism Ratio",
    )
    plt.title(f"Monthly Illeism Ratio for {title_suffix}")
    plt.xlabel("Year-Month")
    plt.ylabel("Ratio")
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(
        "monthly_illeism_ratio_for_" + title_suffix.lower().replace(" ", "_") + ".png"
    )

    print(title_suffix)
    adf_p_value = adf_test(merged_data["illeism_ratio"].dropna(), reg="c")
    print()
    kpss_p_value = kpss_test(merged_data["illeism_ratio"].dropna())
    print()
    return adf_p_value, kpss_p_value, merged_data
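

# Added note (not in the original gist): analyze() fills months with no posts by
# averaging the nearest non-missing months on either side. A possible alternative
# is pandas' built-in linear interpolation, sketched below, which weights by
# distance across longer gaps instead of taking a plain midpoint. This is only a
# design comparison; the gist itself uses fill_with_neighbors above.
def fill_missing_months_by_interpolation(
    monthly_df, cols=("illeism", "first_person", "text_length")
):
    filled = monthly_df.copy()
    # limit_direction="both" also fills leading/trailing gaps with the nearest value
    filled[list(cols)] = filled[list(cols)].interpolate(limit_direction="both")
    return filled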
if __name__ == "__main__":
df = pd.read_csv("hiyotan_illeism_revised.csv")
text_df = df[
df["text"].notna()
& ~df["filename"].fillna("").str.contains("\.mp4|\.m4a", regex=True)
]
m4a_df = df[df["filename"].fillna("").str.contains("\.m4a")]
mp4_df = df[df["filename"].fillna("").str.contains("\.mp4")]
m4a_mp4_df = df[df["filename"].fillna("").str.contains("\.m4a|\.mp4", regex=True)]
whole_adf_p, whole_kpss_p, df = analyze(df, "Whole Data")
text_adf_p, text_kpss_p, text_df = analyze(text_df, "Text Data")
m4a_adf_p, m4a_kpss_p, m4a_df = analyze(m4a_df, "Audio Data")
mp4_adf_p, mp4_kpss_p, mp4_df = analyze(mp4_df, "Video Data")
m4a_mp4_adf_p, m4a_mp4_kpss_p, m4a_mp4_df = analyze(
m4a_mp4_df, "Audio and Video Data"
)
# save df
df.to_csv("whole_data.csv", index=False)

    # Plot and save the graph comparing text and audio + video illeism ratios
    plt.figure(figsize=(10, 6))
    plt.plot(
        text_df["year_month_datetime"],
        text_df["illeism_ratio"],
        label="Text Illeism Ratio",
    )
    plt.plot(
        m4a_mp4_df["year_month_datetime"],
        m4a_mp4_df["illeism_ratio"],
        label="Audio and Video Illeism Ratio",
    )
    plt.title("Comparison of Text and Audio + Video Illeism Ratio")
    plt.xlabel("Year-Month")
    plt.ylabel("Illeism Ratio")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig("text_vs_audio_video_illeism_ratio.png")

    # Plotting and saving the graph for m4a and mp4
    plt.figure(figsize=(10, 6))
    plt.plot(
        m4a_df["year_month_datetime"],
        m4a_df["illeism_ratio"],
        label="Audio Illeism Ratio",
    )
    plt.plot(
        mp4_df["year_month_datetime"],
        mp4_df["illeism_ratio"],
        label="Video Illeism Ratio",
    )
    plt.title("Comparison of Audio and Video Illeism Ratio")
    plt.xlabel("Year-Month")
    plt.ylabel("Illeism Ratio")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig("audio_vs_video_illeism_ratio.png")

    # make new dataframe and add p-values into it
    df = pd.DataFrame(
        [
            ["Whole Data", whole_adf_p, whole_kpss_p],
            ["Text Data", text_adf_p, text_kpss_p],
            ["Audio Data", m4a_adf_p, m4a_kpss_p],
            ["Video Data", mp4_adf_p, mp4_kpss_p],
            ["Audio and Video Data", m4a_mp4_adf_p, m4a_mp4_kpss_p],
        ],
        columns=["Data", "ADF p-value", "KPSS p-value"],
    )
    TablePlot(df, "p_values.png", 10, 3)


# Preprocessing script: transcribe audio/video posts with Whisper and count
# illeism and first-person occurrences in each post's text.
from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd
import re

load_dotenv()
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

if __name__ == "__main__":
    rtps = ["ひよたん", "ひより", "ひよこ", "ヒヨコ", "ヒヨタン", "ヒヨリ"]
    df = pd.read_csv("recent.csv")
    for index, row in df.iterrows():
        print("Processing row " + str(index) + "...")
        if not pd.isna(row["filename"]):
            if not row["filename"].endswith(".jpg"):
                filepath = "data/media/" + row["filename"]
                if os.path.exists(filepath.replace(".mp4", ".m4a")):
                    print("Transcribing " + filepath + "...")
                    file = open(filepath.replace(".mp4", ".m4a"), "rb")
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1", file=file
                    )
                    decode_text = transcription.text
                    print(decode_text)
                    df.at[index, "text"] = decode_text
        if not pd.isna(df.at[index, "text"]):
            # Count name variants (illeism) and first-person pronouns in the text
            df.at[index, "illeism"] = str(
                len(re.findall(r"ひよたん|ひより|ひよこ|ヒヨタン|ヒヨリ|ヒヨコ", df.at[index, "text"]))
            )
            df.at[index, "first_person"] = str(
                len(re.findall(r"私|わたし|ワタシ", df.at[index, "text"]))
            )
    # Save the preprocessed data
    df.to_csv("recent_preprocessed.csv", index=False)

@root2116 (Author)

Attached output figures:
monthly_counts_per_1000_for_audio_and_video_data
monthly_counts_per_1000_for_audio_data
monthly_counts_per_1000_for_text_data
monthly_counts_per_1000_for_video_data
monthly_counts_per_1000_for_whole_data
monthly_illeism_ratio_for_audio_and_video_data
monthly_illeism_ratio_for_audio_data
monthly_illeism_ratio_for_text_data
monthly_illeism_ratio_for_video_data
monthly_illeism_ratio_for_whole_data
monthly_total_counts_for_audio_and_video_data
monthly_total_counts_for_audio_data
monthly_total_counts_for_text_data
monthly_total_counts_for_video_data
monthly_total_counts_for_whole_data
p_values
seasonal_decompose_for_audio_and_video_data
seasonal_decompose_for_audio_data
seasonal_decompose_for_text_data
seasonal_decompose_for_video_data
seasonal_decompose_for_whole_data
text_vs_audio_video_illeism_ratio
audio_vs_video_illeism_ratio
