Skip to content

Instantly share code, notes, and snippets.

View charlieoneill11's full-sized avatar

Charlie O'Neill charlieoneill11

View GitHub Profile
from transformers import AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_ckpt = "distilbert-base-uncased"
# Ask the Auto* factory for the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Count whitespace-separated words in every tweet and keep the result as a
# new column so it can be plotted per class.
df["Words Per Tweet"] = (
    df["text"]
    .str.split()
    .apply(len)
)

# One box per class label; outlier markers are hidden and the grid is off.
df.boxplot(
    column="Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)

# Blank out the figure-level title and the x-axis label before rendering.
plt.suptitle("")
plt.xlabel("")
plt.show()
import matplotlib.pyplot as plt
# Horizontal bar chart of how many rows carry each label name, with the
# counts sorted in ascending order.
(
    df["label_name"]
    .value_counts(ascending=True)
    .plot
    .barh()
)
plt.title("Frequency of Classes")
plt.show()
def label_int2str(row):
    """Convert an integer class id into its label name.

    The lookup goes through the ``label`` feature of the ``train`` split of
    the module-level ``offensive`` dataset, so that dataset must already be
    loaded when this function is called.

    Args:
        row: Label id to translate (one cell of the ``label`` column when
            used with ``Series.apply``).

    Returns:
        Whatever the feature's ``int2str`` returns for ``row``.
    """
    return offensive["train"].features["label"].int2str(row)
# Add a readable label column by translating every integer in "label"
# through label_int2str.
df["label_name"] = df["label"].apply(label_int2str)
# Preview the first rows to confirm the new column is present.
df.head()
import pandas as pd
# Switch the dataset's output format to pandas so that slicing a split
# yields a pandas object instead of plain Python dicts.
offensive.set_format(type="pandas")
# Slice the whole train split; `df` is used as a DataFrame from here on.
df = offensive["train"][:]
# Preview the first rows.
df.head()
# Show the first training example (a dict with 'text' and 'label', see output below).
train_ds[0]
> {'text': '@user Bono... who cares. Soon people will understand that they gain nothing from following a phony celebrity. Become a Leader of your people instead or help and support your fellow countrymen.',
'label': 0}
# Number of examples in the train split.
len(train_ds)
> 11916
# Show the first training example (a dict with 'text' and 'label', see output below).
train_ds[0]
> {'text': '@user Bono... who cares. Soon people will understand that they gain nothing from following a phony celebrity. Become a Leader of your people instead or help and support your fellow countrymen.',
'label': 0}
# Number of examples in the train split.
len(train_ds)
> 11916
# Pick the train split out of the loaded dataset dictionary.
train_ds = offensive["train"]
# Display its summary (feature names and row count, see output below).
train_ds
> Dataset({
features: ['text', 'label'],
num_rows: 11916
})
# Display the loaded dataset dictionary and the splits it contains.
offensive
> DatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 11916
})
test: Dataset({
features: ['text', 'label'],
num_rows: 860
from datasets import load_dataset
# Load the "offensive" configuration of the "tweet_eval" dataset; the result
# is indexed by split name (e.g. offensive["train"]) in the other snippets.
offensive = load_dataset("tweet_eval", "offensive")