@root2116
Created December 22, 2023 00:11
hiyotan illeism research
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose
import numpy as np
import matplotlib.dates as mdates


def TablePlot(df, outputPath, w, h, decimals=4):
    format_str = "{:." + str(decimals) + "f}"
    formatted_df = df.applymap(
        lambda x: format_str.format(x) if isinstance(x, float) else x
    )
    fig, ax = plt.subplots(figsize=(w, h))
    ax.axis("off")
    ax.table(
        cellText=formatted_df.values,
        colLabels=formatted_df.columns,
        loc="center",
        bbox=[0, 0, 1, 1],
    )
    plt.savefig(outputPath)


def kpss_test(timeseries):
    print("Results of KPSS Test:")
    result = kpss(timeseries, regression="c", nlags="auto")
    print("KPSS Statistic: %f" % result[0])
    print("p-value: %f" % result[1])
    print("Critical Values:")
    for key, value in result[3].items():
        print("\t%s: %.3f" % (key, value))
    return result[1]  # p-value


def adf_test(timeseries, reg="c"):
    result = adfuller(timeseries, regression=reg)
    # Print the results
    print("Results of Dickey-Fuller Test:")
    print("ADF Statistic: %f" % result[0])
    print("p-value: %f" % result[1])
    print("Critical Values:")
    for key, value in result[4].items():
        print("\t%s: %.3f" % (key, value))
    return result[1]  # p-value
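

# Added note (not in the original gist): ADF and KPSS test opposite null
# hypotheses (ADF: the series has a unit root; KPSS: the series is stationary),
# so their p-values are usually read together. Below is a minimal sketch of that
# reading, assuming a 5% significance level; the helper name and thresholds are
# illustrative and not part of the original analysis.
def stationarity_verdict(adf_p, kpss_p, alpha=0.05):
    adf_stationary = adf_p < alpha      # ADF rejects its unit-root null
    kpss_stationary = kpss_p >= alpha   # KPSS fails to reject its stationarity null
    if adf_stationary and kpss_stationary:
        return "stationary"
    if not adf_stationary and not kpss_stationary:
        return "non-stationary"
    if adf_stationary and not kpss_stationary:
        return "likely difference-stationary (tests disagree)"
    return "likely trend-stationary (tests disagree)"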


def analyze(df, title_suffix):
    df["datetime"] = pd.to_datetime(df["datetime"], format="%Y-%m-%dT%H:%M:%SZ")
    df["year_month"] = df["datetime"].dt.to_period("M")
    df["text_length"] = df["text"].str.replace(r"\s+", "", regex=True).str.len()
    monthly_data = df.groupby("year_month").agg(
        {"illeism": "sum", "first_person": "sum", "text_length": "sum"}
    )
    start_month = df["year_month"].min()
    end_month = df["year_month"].max()
    all_months = pd.period_range(start_month, end_month, freq="M")
    monthly_data = monthly_data.reset_index()
    # Create a DataFrame from all_months
    all_months_df = pd.DataFrame(all_months, columns=["year_month"])
    # Merge the complete range with the aggregated data
    merged_data = pd.merge(all_months_df, monthly_data, on="year_month", how="left")

    def fill_with_neighbors(row, df):
        if (
            pd.isna(row["illeism"])
            or pd.isna(row["first_person"])
            or pd.isna(row["text_length"])
        ):
            # Find the nearest non-NaN rows
            prev_non_nan = df.iloc[: row.name].dropna()
            next_non_nan = df.iloc[row.name + 1 :].dropna()
            # Identify indices of previous and next non-NaN rows
            prev_index = prev_non_nan.index.max() if not prev_non_nan.empty else None
            next_index = next_non_nan.index.min() if not next_non_nan.empty else None
            # Fetch previous and next rows
            prev_row = df.loc[prev_index] if prev_index is not None else None
            next_row = df.loc[next_index] if next_index is not None else None
            # Calculate average or use the nearest non-NaN row
            for column in ["illeism", "first_person", "text_length"]:
                if prev_row is not None and next_row is not None:
                    row[column] = (prev_row[column] + next_row[column]) / 2
                elif prev_row is not None:
                    row[column] = prev_row[column]
                elif next_row is not None:
                    row[column] = next_row[column]
        return row

    merged_data = merged_data.apply(
        lambda row: fill_with_neighbors(row, merged_data), axis=1
    )
    merged_data["illeism_per_1000"] = (
        1000 * merged_data["illeism"] / merged_data["text_length"]
    )
    merged_data["first_person_per_1000"] = (
        1000 * merged_data["first_person"] / merged_data["text_length"]
    )
    merged_data["illeism_ratio"] = np.where(
        (merged_data["illeism"] + merged_data["first_person"]) != 0,
        merged_data["illeism"] / (merged_data["illeism"] + merged_data["first_person"]),
        0,
    )
    merged_data["year_month_datetime"] = merged_data["year_month"].dt.to_timestamp()

    result = seasonal_decompose(
        merged_data["illeism_ratio"].dropna(), model="additive", period=12
    )
    # Prepare the figure with subplots
    fig, axes = plt.subplots(4, 1, figsize=(10, 8), sharex=True)
    # Plot the original data
    axes[0].plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_ratio"],
        label="Original",
    )
    axes[0].set_title("Original Data")
    # Plot the trend component
    axes[1].plot(merged_data["year_month_datetime"], result.trend, label="Trend")
    axes[1].set_title("Trend")
    # Plot the seasonal component
    axes[2].plot(merged_data["year_month_datetime"], result.seasonal, label="Seasonal")
    axes[2].set_title("Seasonal")
    # Plot the residual component
    axes[3].scatter(merged_data["year_month_datetime"], result.resid, label="Residual")
    axes[3].axhline(y=0, color="k", linestyle="--")
    axes[3].set_title("Residual")
    # Format the shared x-axis: automatic date ticks labelled as YYYY-MM
    for ax in axes:
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    # Rotate the labels for readability on the last subplot only (x-axis is shared)
    plt.setp(axes[-1].get_xticklabels(), rotation=45, ha="right")
fig.suptitle(f"Seasonal Decomposition for {title_suffix}")
fig.tight_layout(
rect=[0, 0.03, 1, 0.95]
) # Adjust the layout to make space for the suptitle
# Save the figure
plt.savefig(
"seasonal_decompose_for_" + title_suffix.lower().replace(" ", "_") + ".png"
)
plt.figure(figsize=(10, 6))
plt.plot(
merged_data["year_month_datetime"], merged_data["illeism"], label="Illeism"
)
plt.plot(
merged_data["year_month_datetime"],
merged_data["first_person"],
label="First Person",
)
plt.title(f"Monthly Total of Illeism and First Person for {title_suffix}")
plt.xlabel("Year-Month")
plt.ylabel("Total Count")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(
f"monthly_total_counts_for_{title_suffix.lower().replace(' ', '_')}.png"
)
    # Plot and save a line chart of counts per 1000 characters
    plt.figure(figsize=(10, 6))
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_per_1000"],
        label="Illeism per 1000 Characters",
    )
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["first_person_per_1000"],
        label="First Person per 1000 Characters",
    )
    plt.title(
        f"Monthly Illeism and First Person per 1000 Characters for {title_suffix}"
    )
    plt.xlabel("Year-Month")
    plt.ylabel("Count per 1000 Characters")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(
        f"monthly_counts_per_1000_for_{title_suffix.lower().replace(' ', '_')}.png"
    )

    plt.figure(figsize=(10, 6))
    plt.plot(
        merged_data["year_month_datetime"],
        merged_data["illeism_ratio"],
        label="Illeism Ratio",
    )
    plt.title(f"Monthly Illeism Ratio for {title_suffix}")
    plt.xlabel("Year-Month")
    plt.ylabel("Ratio")
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(
        "monthly_illeism_ratio_for_" + title_suffix.lower().replace(" ", "_") + ".png"
    )

    print(title_suffix)
    adf_p_value = adf_test(merged_data["illeism_ratio"].dropna(), reg="c")
    print()
    kpss_p_value = kpss_test(merged_data["illeism_ratio"].dropna())
    print()
    return adf_p_value, kpss_p_value, merged_data
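

# Added note (not in the original gist): analyze() fills months with no posts by
# averaging the nearest non-missing months on either side. A possible alternative
# is pandas' built-in linear interpolation, sketched below, which weights by
# distance across longer gaps instead of taking a plain midpoint. This is only a
# design comparison; the gist itself uses fill_with_neighbors above.
def fill_missing_months_by_interpolation(
    monthly_df, cols=("illeism", "first_person", "text_length")
):
    filled = monthly_df.copy()
    # limit_direction="both" also fills leading/trailing gaps with the nearest value
    filled[list(cols)] = filled[list(cols)].interpolate(limit_direction="both")
    return filled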
if __name__ == "__main__":
df = pd.read_csv("hiyotan_illeism_revised.csv")
text_df = df[
df["text"].notna()
& ~df["filename"].fillna("").str.contains("\.mp4|\.m4a", regex=True)
]
m4a_df = df[df["filename"].fillna("").str.contains("\.m4a")]
mp4_df = df[df["filename"].fillna("").str.contains("\.mp4")]
m4a_mp4_df = df[df["filename"].fillna("").str.contains("\.m4a|\.mp4", regex=True)]
whole_adf_p, whole_kpss_p, df = analyze(df, "Whole Data")
text_adf_p, text_kpss_p, text_df = analyze(text_df, "Text Data")
m4a_adf_p, m4a_kpss_p, m4a_df = analyze(m4a_df, "Audio Data")
mp4_adf_p, mp4_kpss_p, mp4_df = analyze(mp4_df, "Video Data")
m4a_mp4_adf_p, m4a_mp4_kpss_p, m4a_mp4_df = analyze(
m4a_mp4_df, "Audio and Video Data"
)
# save df
df.to_csv("whole_data.csv", index=False)

    # Plot and save the graph comparing text and audio + video illeism ratios
    plt.figure(figsize=(10, 6))
    plt.plot(
        text_df["year_month_datetime"],
        text_df["illeism_ratio"],
        label="Text Illeism Ratio",
    )
    plt.plot(
        m4a_mp4_df["year_month_datetime"],
        m4a_mp4_df["illeism_ratio"],
        label="Audio and Video Illeism Ratio",
    )
    plt.title("Comparison of Text and Audio + Video Illeism Ratio")
    plt.xlabel("Year-Month")
    plt.ylabel("Illeism Ratio")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig("text_vs_audio_video_illeism_ratio.png")

    # Plotting and saving the graph for m4a and mp4
    plt.figure(figsize=(10, 6))
    plt.plot(
        m4a_df["year_month_datetime"],
        m4a_df["illeism_ratio"],
        label="Audio Illeism Ratio",
    )
    plt.plot(
        mp4_df["year_month_datetime"],
        mp4_df["illeism_ratio"],
        label="Video Illeism Ratio",
    )
    plt.title("Comparison of Audio and Video Illeism Ratio")
    plt.xlabel("Year-Month")
    plt.ylabel("Illeism Ratio")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig("audio_vs_video_illeism_ratio.png")

    # make new dataframe and add p-values into it
    df = pd.DataFrame(
        [
            ["Whole Data", whole_adf_p, whole_kpss_p],
            ["Text Data", text_adf_p, text_kpss_p],
            ["Audio Data", m4a_adf_p, m4a_kpss_p],
            ["Video Data", mp4_adf_p, mp4_kpss_p],
            ["Audio and Video Data", m4a_mp4_adf_p, m4a_mp4_kpss_p],
        ],
        columns=["Data", "ADF p-value", "KPSS p-value"],
    )
    TablePlot(df, "p_values.png", 10, 3)


# Preprocessing script: transcribe audio/video posts with Whisper and count
# illeism and first-person occurrences in each post's text.
from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd
import re

load_dotenv()
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

if __name__ == "__main__":
    rtps = ["ひよたん", "ひより", "ひよこ", "ヒヨコ", "ヒヨタン", "ヒヨリ"]
    df = pd.read_csv("recent.csv")
    for index, row in df.iterrows():
        print("Processing row " + str(index) + "...")
        if not pd.isna(row["filename"]):
            if not row["filename"].endswith(".jpg"):
                filepath = "data/media/" + row["filename"]
                if os.path.exists(filepath.replace(".mp4", ".m4a")):
                    print("Transcribing " + filepath + "...")
                    file = open(filepath.replace(".mp4", ".m4a"), "rb")
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1", file=file
                    )
                    decode_text = transcription.text
                    print(decode_text)
                    df.at[index, "text"] = decode_text
        if not pd.isna(df.at[index, "text"]):
            # Count name variants (illeism) and first-person pronouns in the text
            df.at[index, "illeism"] = str(
                len(re.findall(r"ひよたん|ひより|ひよこ|ヒヨタン|ヒヨリ|ヒヨコ", df.at[index, "text"]))
            )
            df.at[index, "first_person"] = str(
                len(re.findall(r"私|わたし|ワタシ", df.at[index, "text"]))
            )
    # Save the preprocessed data
    df.to_csv("recent_preprocessed.csv", index=False)

@root2116 (Author)

Attached output figures:
monthly_counts_per_1000_for_audio_and_video_data
monthly_counts_per_1000_for_audio_data
monthly_counts_per_1000_for_text_data
monthly_counts_per_1000_for_video_data
monthly_counts_per_1000_for_whole_data
monthly_illeism_ratio_for_audio_and_video_data
monthly_illeism_ratio_for_audio_data
monthly_illeism_ratio_for_text_data
monthly_illeism_ratio_for_video_data
monthly_illeism_ratio_for_whole_data
monthly_total_counts_for_audio_and_video_data
monthly_total_counts_for_audio_data
monthly_total_counts_for_text_data
monthly_total_counts_for_video_data
monthly_total_counts_for_whole_data
p_values
seasonal_decompose_for_audio_and_video_data
seasonal_decompose_for_audio_data
seasonal_decompose_for_text_data
seasonal_decompose_for_video_data
seasonal_decompose_for_whole_data
text_vs_audio_video_illeism_ratio
audio_vs_video_illeism_ratio
