# smdshakeelhassan/prepare_farm_doc_class_data.py

## prepare_farm_doc_class_data.py
import os
import pandas as pd #Use !pip install pandas if you don't have this package installed already
from cleantext import clean #Use !pip install clean-text if you don't have this package installed already

# Build train/test TSV files for document classification from a directory
# tree laid out as <data_path>/<label>/<file>: the name of each immediate
# sub-directory of data_path is used as the label of every file inside it.
data_path = "./bbc/bbc"
# Sorted so that the row order -- and therefore the seeded split below -- does
# not depend on the arbitrary order in which the OS lists directory entries.
label_dirs = sorted(next(os.walk(data_path))[1])

# Rows are collected in a plain list and converted to a DataFrame once.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, which made the loop quadratic in the number of documents.
rows = []

for label in label_dirs:
    files = sorted(next(os.walk(os.path.join(data_path, label)))[2])
    for file in files:
        # NOTE(review): no encoding is passed, so the platform default is
        # used -- confirm the encoding of the corpus files.
        # The with-statement closes the file; no explicit close() is needed.
        with open(os.path.join(data_path, label, file), "r") as f:
            text = f.read()
        # Every replace_with_* argument is the empty string; presumably this
        # drops matched tokens instead of substituting placeholders -- verify
        # against the clean-text documentation.
        text = clean(text, fix_unicode=True, to_ascii=True, no_line_breaks=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True, no_punct=True,
                replace_with_punct="", replace_with_url="", replace_with_email="", replace_with_phone_number="", replace_with_number="", replace_with_digit="",
                replace_with_currency_symbol="", lang="en")
        data = {"text": text, "label": label}
        rows.append(data)

# Passing columns explicitly keeps the "text"/"label" header even when no
# files were found and rows is empty.
df = pd.DataFrame(rows, columns=["text", "label"])

# 80% of the rows are sampled for training; the remaining rows form the test
# set, which is then shuffled. Both use a fixed seed.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("data_doc_class", exist_ok=True)
train_df.to_csv("data_doc_class/train.tsv", sep="\t", index=False)
test_df.to_csv("data_doc_class/test.tsv", sep="\t", index=False)
	import os
	import pandas as pd #Use !pip install pandas if you don't have this package installed already
	from cleantext import clean #Use !pip install clean-text if you don't have this package installed already

	# Build train/test TSV files for document classification from a directory
	# tree laid out as <data_path>/<label>/<file>: the name of each immediate
	# sub-directory of data_path is used as the label of every file inside it.
	#
	# NOTE(review): this section repeats the script that precedes it in the
	# file, and every line carries a leading tab although no enclosing
	# statement is visible -- confirm whether this copy is meant to be kept.
	# The loop/with nesting below was reconstructed; the original lines here
	# had lost it and could not be parsed.
	data_path = "./bbc/bbc"
	# Sorted so that the row order -- and therefore the seeded split below --
	# does not depend on the arbitrary order in which the OS lists entries.
	label_dirs = sorted(next(os.walk(data_path))[1])

	# Rows are collected in a plain list and converted to a DataFrame once.
	# DataFrame.append was removed in pandas 2.0, and it copied the whole
	# frame on every call.
	rows = []

	for label in label_dirs:
		files = sorted(next(os.walk(os.path.join(data_path, label)))[2])
		for file in files:
			# NOTE(review): no encoding is passed, so the platform default
			# is used -- confirm the encoding of the corpus files.
			# The with-statement closes the file; no explicit close() needed.
			with open(os.path.join(data_path, label, file), "r") as f:
				text = f.read()
			# Every replace_with_* argument is the empty string; presumably
			# this drops matched tokens instead of substituting placeholders
			# -- verify against the clean-text documentation.
			text = clean(text, fix_unicode=True, to_ascii=True, no_line_breaks=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True, no_punct=True,
				replace_with_punct="", replace_with_url="", replace_with_email="", replace_with_phone_number="", replace_with_number="", replace_with_digit="",
				replace_with_currency_symbol="", lang="en")
			data = {"text": text, "label": label}
			rows.append(data)

	# Passing columns explicitly keeps the "text"/"label" header even when no
	# files were found and rows is empty.
	df = pd.DataFrame(rows, columns=["text", "label"])

	# 80% of the rows are sampled for training; the remaining rows form the
	# test set, which is then shuffled. Both use a fixed seed.
	train_df = df.sample(frac=0.8, random_state=42)
	test_df = df.drop(train_df.index)
	train_df = train_df.reset_index(drop=True)
	test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

	# to_csv does not create missing parent directories, so make sure the
	# output directory exists before writing.
	os.makedirs("data_doc_class", exist_ok=True)
	train_df.to_csv("data_doc_class/train.tsv", sep="\t", index=False)
	test_df.to_csv("data_doc_class/test.tsv", sep="\t", index=False)