Skip to content

Instantly share code, notes, and snippets.

@ben0it8
Last active July 18, 2019 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ben0it8/f351f0a28acff2c7798c6ce59a9d8569 to your computer and use it in GitHub Desktop.
Save ben0it8/f351f0a28acff2c7798c6ce59a9d8569 to your computer and use it in GitHub Desktop.
Read and clean IMDB data
import os
import re

import pandas as pd
# Names of the DataFrame columns holding the text and its label.
TEXT_COL = "text"
LABEL_COL = "label"
# Compiled once at import time rather than on every call.
_HTML_TAG_RE = re.compile(r"<[^>]*>")
_MULTI_SPACE_RE = re.compile(r" +")


def clean_html(text: str) -> str:
    """Replace HTML tags with a space and collapse runs of spaces.

    Args:
        text: Raw text that may contain HTML markup such as ``<br />``.

    Returns:
        The text with every ``<...>`` tag replaced by a single space and
        consecutive spaces squeezed into one. Leading/trailing spaces and
        other whitespace characters (tabs, newlines) are left untouched.
    """
    # ``[^>]`` (unlike ``.``) also matches newlines, so a tag that is broken
    # across lines is removed as well.
    without_tags = _HTML_TAG_RE.sub(" ", text)
    return _MULTI_SPACE_RE.sub(" ", without_tags)
def read_imdb(data_dir, max_lengths=None):
    """Load the IMDB train/test CSV files and strip HTML from their text.

    Args:
        data_dir: Directory containing ``imdb5k_train.csv`` and
            ``imdb5k_test.csv``.
        max_lengths: Optional mapping from split name (``"train"`` /
            ``"test"``) to the maximum number of rows to keep for that
            split; the rows are chosen by random sampling. A missing key or
            a ``None`` value keeps every row. Defaults to no limit.

    Returns:
        Dict mapping ``"train"`` and ``"test"`` to a DataFrame whose
        ``TEXT_COL`` column has been passed through ``clean_html``.
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    if max_lengths is None:
        max_lengths = {}

    datasets = {}
    for split in ("train", "test"):
        df = pd.read_csv(os.path.join(data_dir, f"imdb5k_{split}.csv"))
        limit = max_lengths.get(split)
        if limit is not None:
            # Cap at the number of available rows: ``DataFrame.sample``
            # raises ValueError when asked for more rows than exist.
            df = df.sample(n=min(limit, len(df)))
        df[TEXT_COL] = df[TEXT_COL].apply(clean_html)
        datasets[split] = df
    return datasets
# read data
datasets = read_imdb(IMDB_DIR)
# sorted list of the distinct labels found in the training split; sorting
# keeps the order (and therefore the mapping below) identical from run to
# run, which iterating a bare set does not guarantee
labels = sorted(set(datasets["train"][LABEL_COL].tolist()))
# labels to integers mapping
label2int = {label: i for i, label in enumerate(labels)}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment