-
-
Save root2116/9ee6f47fff7d81689cc86132fcddb87c to your computer and use it in GitHub Desktop.
hiyotan illeism research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
from statsmodels.tsa.stattools import adfuller, kpss | |
from statsmodels.tsa.seasonal import seasonal_decompose | |
import numpy as np | |
import matplotlib.dates as mdates | |
def TablePlot(df, outputPath, w, h, decimals=4):
    """Render a DataFrame as a table image and save it to ``outputPath``.

    Cells that are ``float`` instances are formatted with ``decimals`` digits
    after the decimal point; every other cell is shown unchanged.

    Args:
        df: DataFrame whose values and column labels fill the table.
        outputPath: Destination handed to ``Figure.savefig``.
        w: Figure width in inches.
        h: Figure height in inches.
        decimals: Number of decimal places used for float cells.
    """
    format_str = "{:." + str(decimals) + "f}"

    def format_cell(x):
        return format_str.format(x) if isinstance(x, float) else x

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; look the method up on the class so that a column that
    # happens to be called "map" cannot be mistaken for it.
    if hasattr(type(df), "map"):
        formatted_df = df.map(format_cell)
    else:
        formatted_df = df.applymap(format_cell)

    fig, ax = plt.subplots(figsize=(w, h))
    try:
        ax.axis("off")
        ax.table(
            cellText=formatted_df.values,
            colLabels=formatted_df.columns,
            loc="center",
            bbox=[0, 0, 1, 1],
        )
        # Save this specific figure instead of relying on pyplot's notion of
        # the "current" figure.
        fig.savefig(outputPath)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def kpss_test(timeseries):
    """Run a KPSS test on ``timeseries``, print a report, and return the p-value.

    The test is called with ``regression="c"`` and ``nlags="auto"``. The
    statistic, the p-value and each critical value are printed to stdout.

    Args:
        timeseries: Series of observations passed straight to ``kpss``.

    Returns:
        The p-value reported by ``kpss``.
    """
    print("Results of KPSS Test:")
    statistic, p_value, _lags, critical_values = kpss(
        timeseries, regression="c", nlags="auto"
    )
    print("KPSS Statistic: %f" % statistic)
    print("p-value: %f" % p_value)
    print("Critical Values:")
    for level in critical_values:
        print("\t%s: %.3f" % (level, critical_values[level]))
    return p_value
def adf_test(timeseries, reg="c"):
    """Run an augmented Dickey-Fuller test, print a report, and return the p-value.

    Args:
        timeseries: Series of observations passed straight to ``adfuller``.
        reg: Value forwarded as the ``regression`` argument of ``adfuller``.

    Returns:
        The p-value reported by ``adfuller``.
    """
    outcome = adfuller(timeseries, regression=reg)
    statistic = outcome[0]
    p_value = outcome[1]
    critical_values = outcome[4]

    # Display the results.
    print("Results of Dickey-Fuller Test:")
    print("ADF Statistic: %f" % statistic)
    print("p-value: %f" % p_value)
    print("Critical Values:")
    for level in critical_values:
        print("\t%s: %.3f" % (level, critical_values[level]))
    return p_value
# Per-month totals that are summed from the posts and gap-filled together.
_COUNT_COLUMNS = ["illeism", "first_person", "text_length"]


def _monthly_totals(df):
    """Sum the count columns per calendar month, keeping months without posts as NaN rows."""
    # Work on a copy so the caller's frame (often a slice) is never modified.
    df = df.copy()
    df["datetime"] = pd.to_datetime(df["datetime"], format="%Y-%m-%dT%H:%M:%SZ")
    df["year_month"] = df["datetime"].dt.to_period("M")
    # Text length is measured with all whitespace removed.
    df["text_length"] = df["text"].str.replace(r"\s+", "", regex=True).str.len()

    monthly = (
        df.groupby("year_month")
        .agg({"illeism": "sum", "first_person": "sum", "text_length": "sum"})
        .reset_index()
    )

    # Left-merge onto the complete month range so gaps show up as NaN rows.
    all_months = pd.period_range(
        df["year_month"].min(), df["year_month"].max(), freq="M"
    )
    calendar = pd.DataFrame(all_months, columns=["year_month"])
    return pd.merge(calendar, monthly, on="year_month", how="left")


def _fill_missing_months(merged):
    """Fill NaN months with the mean of the nearest complete months before and after."""
    incomplete = merged[_COUNT_COLUMNS].isna().any(axis=1)
    if not incomplete.any():
        return merged

    # Only rows with every count present may act as neighbours.
    known = merged.loc[~incomplete, _COUNT_COLUMNS].reindex(merged.index)
    before = known.ffill()
    after = known.bfill()
    # Average both neighbours when they exist, otherwise use the one available.
    estimate = ((before + after) / 2).fillna(before).fillna(after)

    # Rows without any complete neighbour are left untouched.
    fillable = incomplete & estimate.notna().all(axis=1)
    merged = merged.copy()
    merged.loc[fillable, _COUNT_COLUMNS] = estimate.loc[fillable]
    return merged


def _save_decomposition_chart(dates, observed, decomposition, title_suffix, path):
    """Save the four-panel observed/trend/seasonal/residual figure to ``path``."""
    fig, axes = plt.subplots(4, 1, figsize=(10, 8), sharex=True)
    try:
        axes[0].plot(dates, observed, label="Original")
        axes[0].set_title("Original Data")

        axes[1].plot(dates, decomposition.trend, label="Trend")
        axes[1].set_title("Trend")

        axes[2].plot(dates, decomposition.seasonal, label="Seasonal")
        axes[2].set_title("Seasonal")

        axes[3].scatter(dates, decomposition.resid, label="Residual")
        axes[3].axhline(y=0, color="k", linestyle="--")
        axes[3].set_title("Residual")

        for ax in axes:
            ax.xaxis.set_major_locator(mdates.AutoDateLocator())
            ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
        # Only the bottom panel shows tick labels worth rotating.
        plt.setp(axes[-1].get_xticklabels(), rotation=45, ha="right")

        fig.suptitle(f"Seasonal Decomposition for {title_suffix}")
        # Leave room at the top for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.savefig(path)
    finally:
        plt.close(fig)


def _save_line_chart(dates, lines, title, ylabel, path, ylim=None):
    """Save one figure with a labelled line per ``(values, label)`` pair to ``path``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        for values, label in lines:
            ax.plot(dates, values, label=label)
        ax.set_title(title)
        ax.set_xlabel("Year-Month")
        ax.set_ylabel(ylabel)
        if ylim is not None:
            ax.set_ylim(*ylim)
        ax.legend()
        plt.setp(ax.get_xticklabels(), rotation=45)
        fig.tight_layout()
        fig.savefig(path)
    finally:
        plt.close(fig)


def analyze(df, title_suffix):
    """Build monthly illeism statistics, save charts, and run stationarity tests.

    The posts in ``df`` are summed per calendar month, months without posts
    are filled from their nearest complete neighbours, and per-1000-character
    rates plus the illeism ratio ``illeism / (illeism + first_person)`` are
    derived. Four PNG files are written to the current directory (seasonal
    decomposition, monthly totals, per-1000 rates, illeism ratio), and the
    ADF and KPSS reports for the illeism ratio are printed.

    The input frame is not modified.

    Args:
        df: DataFrame with the columns "datetime" (strings formatted as
            ``%Y-%m-%dT%H:%M:%SZ``), "text", "illeism" and "first_person".
        title_suffix: Label used in chart titles; lower-cased with spaces
            replaced by underscores it also becomes part of the file names.

    Returns:
        Tuple ``(adf_p_value, kpss_p_value, merged_data)`` where
        ``merged_data`` holds one row per month with the totals and the
        derived columns.

    Raises:
        ValueError: presumably raised by ``seasonal_decompose`` when the
            series is shorter than two 12-month cycles - confirm against the
            statsmodels documentation.
    """
    merged_data = _fill_missing_months(_monthly_totals(df))

    # NOTE(review): a month whose text_length total is 0 yields inf/NaN here.
    merged_data["illeism_per_1000"] = (
        1000 * merged_data["illeism"] / merged_data["text_length"]
    )
    merged_data["first_person_per_1000"] = (
        1000 * merged_data["first_person"] / merged_data["text_length"]
    )
    mentions = merged_data["illeism"] + merged_data["first_person"]
    # Months with no self-reference of either kind get a ratio of 0.
    merged_data["illeism_ratio"] = np.where(
        mentions != 0, merged_data["illeism"] / mentions, 0
    )
    merged_data["year_month_datetime"] = merged_data["year_month"].dt.to_timestamp()

    dates = merged_data["year_month_datetime"]
    slug = title_suffix.lower().replace(" ", "_")

    decomposition = seasonal_decompose(
        merged_data["illeism_ratio"].dropna(), model="additive", period=12
    )
    _save_decomposition_chart(
        dates,
        merged_data["illeism_ratio"],
        decomposition,
        title_suffix,
        "seasonal_decompose_for_" + slug + ".png",
    )

    _save_line_chart(
        dates,
        [
            (merged_data["illeism"], "Illeism"),
            (merged_data["first_person"], "First Person"),
        ],
        f"Monthly Total of Illeism and First Person for {title_suffix}",
        "Total Count",
        f"monthly_total_counts_for_{slug}.png",
    )

    # Draw and save the line chart of counts per unit of text length.
    _save_line_chart(
        dates,
        [
            (merged_data["illeism_per_1000"], "Illeism per 1000 Characters"),
            (
                merged_data["first_person_per_1000"],
                "First Person per 1000 Characters",
            ),
        ],
        f"Monthly Illeism and First Person per 1000 Characters for {title_suffix}",
        "Count per 1000 Characters",
        f"monthly_counts_per_1000_for_{slug}.png",
    )

    _save_line_chart(
        dates,
        [(merged_data["illeism_ratio"], "Illeism Ratio")],
        f"Monthly Illeism Ratio for {title_suffix}",
        "Ratio",
        "monthly_illeism_ratio_for_" + slug + ".png",
        ylim=(0, 1),
    )

    print(title_suffix)
    adf_p_value = adf_test(merged_data["illeism_ratio"].dropna(), reg="c")
    print()
    kpss_p_value = kpss_test(merged_data["illeism_ratio"].dropna())
    print()

    return adf_p_value, kpss_p_value, merged_data
if __name__ == "__main__":
    # Analyse the whole corpus and its text / audio / video subsets, then save
    # comparison charts and a table of the stationarity-test p-values.
    raw_df = pd.read_csv("hiyotan_illeism_revised.csv")

    # Classify rows by media type via the file name. Raw strings keep "\."
    # a regex escape instead of an invalid Python string escape.
    filenames = raw_df["filename"].fillna("")
    is_m4a = filenames.str.contains(r"\.m4a")
    is_mp4 = filenames.str.contains(r"\.mp4")
    is_media = is_m4a | is_mp4

    # Copies, not slices, so downstream column assignments never touch raw_df.
    text_df = raw_df[raw_df["text"].notna() & ~is_media].copy()
    m4a_df = raw_df[is_m4a].copy()
    mp4_df = raw_df[is_mp4].copy()
    m4a_mp4_df = raw_df[is_media].copy()

    whole_adf_p, whole_kpss_p, whole_df = analyze(raw_df, "Whole Data")
    text_adf_p, text_kpss_p, text_df = analyze(text_df, "Text Data")
    m4a_adf_p, m4a_kpss_p, m4a_df = analyze(m4a_df, "Audio Data")
    mp4_adf_p, mp4_kpss_p, mp4_df = analyze(mp4_df, "Video Data")
    m4a_mp4_adf_p, m4a_mp4_kpss_p, m4a_mp4_df = analyze(
        m4a_mp4_df, "Audio and Video Data"
    )

    # Save the monthly statistics of the whole corpus.
    whole_df.to_csv("whole_data.csv", index=False)

    # Each entry: chart title, output file, and the (frame, label) lines to draw.
    comparisons = [
        (
            "Comparison of Text and Audio + Video Illeism Ratio",
            "text_vs_audio_video_illeism_ratio.png",
            [
                (text_df, "Text Illeism Ratio"),
                (m4a_mp4_df, "Audio and Video Illeism Ratio"),
            ],
        ),
        (
            "Comparison of Audio and Video Illeism Ratio",
            "audio_vs_video_illeism_ratio.png",
            [
                (m4a_df, "Audio Illeism Ratio"),
                (mp4_df, "Video Illeism Ratio"),
            ],
        ),
    ]
    for title, output_path, series in comparisons:
        fig = plt.figure(figsize=(10, 6))
        for frame, label in series:
            plt.plot(
                frame["year_month_datetime"], frame["illeism_ratio"], label=label
            )
        plt.title(title)
        plt.xlabel("Year-Month")
        plt.ylabel("Illeism Ratio")
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.savefig(output_path)
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(fig)

    # Collect the p-values into one table and render it as an image.
    p_values = pd.DataFrame(
        [
            ["Whole Data", whole_adf_p, whole_kpss_p],
            ["Text Data", text_adf_p, text_kpss_p],
            ["Audio Data", m4a_adf_p, m4a_kpss_p],
            ["Video Data", mp4_adf_p, mp4_kpss_p],
            ["Audio and Video Data", m4a_mp4_adf_p, m4a_mp4_kpss_p],
        ],
        columns=["Data", "ADF p-value", "KPSS p-value"],
    )
    TablePlot(p_values, "p_values.png", 10, 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from openai import OpenAI | |
from dotenv import load_dotenv | |
import os | |
import pandas as pd | |
import re | |
# Run dotenv's loader before the API key is read from the environment
# (presumably it picks the key up from a local .env file - verify).
load_dotenv()

# Module-level OpenAI client, authenticated with the OPENAI_API_KEY variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
if __name__ == "__main__":
    # Transcribe the audio of each media row, count self-references in every
    # row that has text, and write the result to recent_preprocessed.csv.

    # Compiled once here instead of being rebuilt for every row.
    illeism_pattern = re.compile(r"ひよたん|ひより|ひよこ|ヒヨタン|ヒヨリ|ヒヨコ")
    first_person_pattern = re.compile(r"私|わたし|ワタシ")

    df = pd.read_csv("recent.csv")

    for index, row in df.iterrows():
        print("Processing row " + str(index) + "...")

        filename = row["filename"]
        if not pd.isna(filename) and not filename.endswith(".jpg"):
            filepath = "data/media/" + filename
            # Videos are transcribed from an .m4a file of the same name.
            audio_path = filepath.replace(".mp4", ".m4a")
            if os.path.exists(audio_path):
                print("Transcribing " + filepath + "...")
                # The context manager closes the handle even if the request fails.
                with open(audio_path, "rb") as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1", file=audio_file
                    )
                decode_text = transcription.text
                print(decode_text)
                df.at[index, "text"] = decode_text

        # Count on every row that has text, whether it was just transcribed
        # or already present in the CSV.
        text = df.at[index, "text"]
        if not pd.isna(text):
            df.at[index, "illeism"] = str(len(illeism_pattern.findall(text)))
            df.at[index, "first_person"] = str(
                len(first_person_pattern.findall(text))
            )

    # Save once all rows are processed.
    # NOTE(review): the original indentation was lost; saving after the loop
    # rather than per row is assumed - confirm.
    df.to_csv("recent_preprocessed.csv", index=False)
Author
root2116
commented
Dec 22, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment