Skip to content

Instantly share code, notes, and snippets.

@smdshakeelhassan
Created July 28, 2021 05:06
Show Gist options
  • Save smdshakeelhassan/b943b669761e8e60dcf2d28c1e5f03c7 to your computer and use it in GitHub Desktop.
Save smdshakeelhassan/b943b669761e8e60dcf2d28c1e5f03c7 to your computer and use it in GitHub Desktop.
Medium- FARM- Document Classification- Dataset Preparation
import os
import pandas as pd #Use !pip install pandas if you don't have this package installed already
from cleantext import clean #Use !pip install clean-text if you don't have this package installed already
# Build train/test TSV files for document classification from a directory
# tree laid out as <data_path>/<label>/<file>: each immediate sub-directory
# name is used as the label for every file directly inside it.
data_path = "./bbc/bbc"

# Sorted so that row order -- and therefore the seeded split below -- does not
# depend on the order in which the filesystem happens to list entries.
label_dirs = sorted(next(os.walk(data_path))[1])

# Collect rows in a plain list and build the DataFrame once afterwards.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original loop quadratic.
rows = []
for label in label_dirs:
    files = sorted(next(os.walk(os.path.join(data_path, label)))[2])
    for file in files:
        # Decode explicitly so the result does not depend on the platform's
        # default encoding; undecodable bytes are dropped instead of aborting
        # the whole run. The `with` block closes the file on exit.
        with open(os.path.join(data_path, label, file), "r",
                  encoding="utf-8", errors="ignore") as f:
            text = f.read()
        # NOTE(review): replace_with_number / replace_with_digit are passed,
        # but no_numbers / no_digits are not enabled in this call -- confirm
        # whether digits were meant to be stripped as well.
        text = clean(text, fix_unicode=True, to_ascii=True, no_line_breaks=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True, no_punct=True,
                     replace_with_punct="", replace_with_url="", replace_with_email="", replace_with_phone_number="", replace_with_number="", replace_with_digit="",
                     replace_with_currency_symbol="", lang="en")
        data = {"text": text, "label": label}
        rows.append(data)

# Passing `columns` keeps the expected schema even when no files were found.
df = pd.DataFrame(rows, columns=["text", "label"])

# 80/20 split: sample the training rows with a fixed seed, use the remaining
# rows as the test set, then shuffle the test set with the same seed.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("data_doc_class", exist_ok=True)
train_df.to_csv("data_doc_class/train.tsv", sep="\t", index=False)
test_df.to_csv("data_doc_class/test.tsv", sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment