# each exported field is wrapped as {"S": <value>}; keep only the inner value
for i in range(len(data)):
    for key in data[i].keys():
        data[i][key] = data[i][key]["S"]
print(data[:2])
# keep only the topic and title columns
df = df[['topic', 'title']]
# drop the 'NATION' and 'WORLD' labels
data = df[(df['topic'] != 'NATION') & (df['topic'] != 'WORLD')]
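# plot how many headlines each remaining topic has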
counts = data['topic'].value_counts()
counts.plot(kind='bar', legend=False, grid=True, figsize=(8, 5))
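# convert the cleaned records into a DataFrame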
df = pd.DataFrame(data)
print(df.head())
# keep at most 5,000 headlines per topic to limit class imbalance
for topic in topics:
    temp_df = data[data['topic'] == topic][:5000]
    df = pd.concat([df, temp_df])
# embed each headline with the sentence-BERT model
df['vector'] = df['title'].apply(lambda x: nlp(x).vector)
import numpy as np
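# inspect the distribution of headline lengths (in characters)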
lens = data.title.str.len()
lens.hist(bins=np.arange(0, 200, 5))
import json

# read the newline-delimited JSON export and keep each record's 'Item' payload
data = []
with open('data.json', 'r') as f:
    data = f.readlines()
data = [json.loads(item)['Item'] for item in data]
print(data[:2])
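# alternatively, load the newline-delimited JSON straight into a DataFrame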
import pandas as pd
df = pd.read_json('data.json', lines=True)
print(df)
import spacy_sentence_bert
# load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/
nlp = spacy_sentence_bert.load_model('en_stsb_distilbert_base')
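# nlp(text).vector now returns a single fixed-size sentence-BERT embedding for the whole text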
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# logistic regression on the headline embeddings
logistic_clf = LogisticRegression()
logistic_clf.fit(X_train, y_train)
y_pred = logistic_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
## OUTPUT
## 0.8254545454545454
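# try a random forest on the same embeddings for comparison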
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=9, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
## OUTPUT
## 0.7471717171717172
print(data.isnull().sum())
## OUTPUT
## topic 0
## title 1
## dtype: int64
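# drop the single row with a missing title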
data = data.dropna()
from sklearn.model_selection import train_test_split
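# split the embeddings and labels, holding out a third of the data for testing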
X_train, X_test, y_train, y_test = train_test_split(df['vector'].tolist(), df['topic'].tolist(), test_size=0.33, random_state=42)
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
## OUTPUT
## 0.8392929292929293
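# sanity-check the trained classifier on a handful of unseen headlines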
for headline, topic in zip(headlines, topics):
    print(headline)
    print(f"True Label: {topic}, Predicted Label: {clf.predict(nlp(headline).vector.reshape(1, -1))[0]} \n")
headlines = ["Scientists Figured Out How Much Exercise You Need to 'Offset' a Day of Sitting",
             "Marlee Matlin On Her Career And 'CODA' — The Actor's Side – Deadline",
             "Increasing mental health issues a symptom of Victoria's lockdown",
             'Philippines polio outbreak over: UN',
             "Sophie, Countess of Wessex opens up about menopause: ‘It's like somebody's just gone and taken your brain'",
             'Bill Gates tells why he prefers Android mobile over iPhone',
             "'A weight has been lifted' Edinburgh pubs and restaurants react to hospitality rules easing",
             "Mysterious Signal Flashing From Galaxy's Core Baffle Scientists; Where Is There Source of This Radio Waves?",
             "'Tears in their eyes': World erupts over All Blacks' beautiful Maradona tribute",
             "'Packed in like sardines'"]
topics = ['SCIENCE', 'ENTERTAINMENT', 'HEALTH',
          'HEALTH', 'ENTERTAINMENT', 'TECHNOLOGY', 'BUSINESS',
          'SCIENCE', 'SPORTS', 'ENTERTAINMENT']
print(df.topic.unique())
## OUTPUT
## array(['ENTERTAINMENT', 'BUSINESS', 'NATION', 'SPORTS', 'WORLD',
## 'TECHNOLOGY', 'HEALTH', 'SCIENCE'], dtype=object)