Aditya1001001/clean_dict.py

## clean_dict.py
# Replace every field's wrapper with the value stored under its "S" key,
# editing each record of `data` in place.
for record in data:
    for field in record:
        record[field] = record[field]["S"]

# Show the first two records after unwrapping.
print(data[:2])

## cleaned_data.py
# Keep only the 'topic' and 'title' columns.
df = df[['topic', 'title']]

# Drop rows labelled 'NATION' or 'WORLD'.
# Python's `or` cannot combine two boolean Series (pandas raises ValueError
# for the ambiguous truth value), and "!= A or != B" would be true for every
# row anyway; excluding by membership expresses the intended filter.
data = df[~df['topic'].isin(['NATION', 'WORLD'])]

## count_labels_plot.py
# Count the rows per topic label and draw the counts as a bar chart.
counts = data['topic'].value_counts()
counts.plot.bar(legend=False, grid=True, figsize=(8, 5))

## create_dataframe.py
# Build a DataFrame from `data` and preview its first five rows.
df = pd.DataFrame(data=data)

print(df.head(n=5))

## create_subset.py
# For each topic, append at most its first 5000 rows from `data` to `df`.
# NOTE(review): rows are added to whatever `df` already holds -- confirm that
# `df` is reset (e.g. to an empty frame) before this loop runs.
for topic in topics:
    temp_df = data.loc[data['topic'] == topic].head(5000)
    df = pd.concat([df, temp_df])

## create_vectors.py
# Run every title through `nlp` and store the resulting `.vector` per row.
df['vector'] = df['title'].apply(lambda title: nlp(title).vector)

## heading_length.py
import numpy as np

# Histogram of title lengths (in characters); bin edges every 5 from 0 to 195.
lens = data['title'].str.len()
lens.hist(bins=np.arange(0, 200, 5))

## list_of_dict.py
# The original bare `import` was a SyntaxError; `json` is the module used below.
import json

# data.json holds one JSON document per line; keep only each document's
# 'Item' entry. Blank lines are skipped so that trailing whitespace in the
# file does not make json.loads fail.
with open('data.json', 'r') as f:
    data = [json.loads(line)['Item'] for line in f if line.strip()]

# Show the first two parsed records.
print(data[:2])

## load_jsonl.py
import pandas as pd

# Read data.json as line-delimited JSON (lines=True) and print the frame.
df = pd.read_json(path_or_buf='data.json', lines=True)
print(df)

## load_model.py
import spacy_sentence_bert

# Load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/
# The loaded object is bound to `nlp`; other snippets call `nlp(text).vector`
# on it to obtain a vector for a piece of text.
nlp = spacy_sentence_bert.load_model('en_stsb_distilbert_base')

## logistic_regression.py
from sklearn.linear_model import LogisticRegression
# accuracy_score is called below but was never imported in this file.
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier with default settings on the training split.
logistic_clf = LogisticRegression()
logistic_clf.fit(X_train, y_train)

# Predict the held-out split and print the accuracy against the true labels.
y_pred = logistic_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

## OUTPUT
## 0.8254545454545454

## random_forrest.py
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is called below but was never imported in this file.
from sklearn.metrics import accuracy_score

# Random forest limited to depth 9; random_state is fixed for reproducibility.
clf = RandomForestClassifier(max_depth=9, random_state=0)
clf.fit(X_train, y_train)

# Predict the held-out split and print the accuracy against the true labels.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

## OUTPUT
## 0.7471717171717172

## remove_missing.py
# Report how many missing values each column contains.
print(data.isna().sum())

## OUTPUT
## topic    0
## title    1
## dtype: int64

# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

## split_data.py
from sklearn.model_selection import train_test_split

# Split the vectors and their topic labels into train (67%) and test (33%)
# parts; random_state is fixed so the split is reproducible.
features = df['vector'].tolist()
labels = df['topic'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

## svm.py
from sklearn.svm import SVC
# accuracy_score is called below but was never imported in this file.
from sklearn.metrics import accuracy_score

# Support-vector classifier with gamma='auto', trained on the training split.
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)

# Predict the held-out split and print the accuracy against the true labels.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

## OUTPUT
## 0.8392929292929293

## test_classifier.py
# Print each headline with its expected label and the label `clf` predicts.
for headline, topic in zip(headlines, topics):
    print(headline)
    # predict() is given a single-row 2-D array, hence reshape(1, -1).
    query_vector = nlp(headline).vector.reshape(1, -1)
    predicted_label = clf.predict(query_vector)[0]
    print(f"True Label: {topic}, Predicted Label: {predicted_label} \n")

## test_data.py
# Ten headline strings; `headlines` and `topics` are consumed pairwise via
# zip(), so topics[i] is the label paired with headlines[i].
headlines = ["Scientists Figured Out How Much Exercise You Need to 'Offset' a Day of Sitting",
 "Marlee Matlin On Her Career And 'CODA' — The Actor's Side – Deadline",
 "Increasing mental health issues a symptom of Victoria's lockdown",
 'Philippines polio outbreak over: UN',
 "Sophie, Countess of Wessex opens up about menopause: ‘It's like somebody's just gone and taken your brain'",
 'Bill Gates tells why he prefers Android mobile over iPhone',
 "'A weight has been lifted' Edinburgh pubs and restaurants react to hospitality rules easing",
 "Mysterious Signal Flashing From Galaxy's Core Baffle Scientists; Where Is There Source of This Radio Waves?",
 "'Tears in their eyes': World erupts over All Blacks' beautiful Maradona tribute",
 "'Packed in like sardines'"]

# Label for each headline above, in the same order (ten entries).
topics = ['SCIENCE',  'ENTERTAINMENT',  'HEALTH',
 'HEALTH',  'ENTERTAINMENT',  'TECHNOLOGY',  'BUSINESS',
 'SCIENCE',  'SPORTS',  'ENTERTAINMENT']

## topics.py
# List the distinct values found in the 'topic' column.
print(df['topic'].unique())

## OUTPUT
## array(['ENTERTAINMENT', 'BUSINESS', 'NATION', 'SPORTS', 'WORLD',
##     'TECHNOLOGY', 'HEALTH', 'SCIENCE'], dtype=object)
	# Replace every field's wrapper with the value stored under its "S" key,
	# editing each record of `data` in place. The loop bodies had lost their
	# nesting, which made this snippet invalid Python; it is restored here.
	for record in data:
	    for field in record:
	        record[field] = record[field]["S"]

	# Show the first two records after unwrapping.
	print(data[:2])
	# Keep only the 'topic' and 'title' columns.
	df = df[['topic', 'title']]

	# Drop rows labelled 'NATION' or 'WORLD'.
	# Python's `or` cannot combine two boolean Series (pandas raises ValueError
	# for the ambiguous truth value), and "!= A or != B" would be true for every
	# row anyway; excluding by membership expresses the intended filter.
	data = df[~df['topic'].isin(['NATION', 'WORLD'])]
	# Count the rows per topic label and draw the counts as a bar chart.
	counts = data['topic'].value_counts()
	counts.plot.bar(legend=False, grid=True, figsize=(8, 5))
	# For each topic, append at most its first 5000 rows from `data` to `df`.
	# The loop body had lost its nesting, which made this snippet invalid
	# Python; it is restored here.
	# NOTE(review): rows are added to whatever `df` already holds -- confirm
	# that `df` is reset (e.g. to an empty frame) before this loop runs.
	for topic in topics:
	    temp_df = data[data['topic'] == topic][:5000]
	    df = pd.concat([df, temp_df])
	import numpy as np

	# Histogram of title lengths (in characters); bin edges every 5 from 0 to 195.
	lens = data['title'].str.len()
	lens.hist(bins=np.arange(0, 200, 5))
	# The original bare `import` was a SyntaxError; `json` is the module used below.
	import json

	# data.json holds one JSON document per line; keep only each document's
	# 'Item' entry. Blank lines are skipped so that trailing whitespace in the
	# file does not make json.loads fail. The `with` body is nested again.
	with open('data.json', 'r') as f:
	    data = [json.loads(line)['Item'] for line in f if line.strip()]

	# Show the first two parsed records.
	print(data[:2])
	import pandas as pd

	# Read data.json as line-delimited JSON (lines=True) and print the frame.
	df = pd.read_json(path_or_buf='data.json', lines=True)
	print(df)
	import spacy_sentence_bert

	# Load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/
	# The loaded object is bound to `nlp` for use by later code.
	nlp = spacy_sentence_bert.load_model('en_stsb_distilbert_base')
	from sklearn.linear_model import LogisticRegression
	# accuracy_score is called below but was never imported in this file.
	from sklearn.metrics import accuracy_score

	# Fit a logistic-regression classifier with default settings on the training split.
	logistic_clf = LogisticRegression()
	logistic_clf.fit(X_train, y_train)

	# Predict the held-out split and print the accuracy against the true labels.
	y_pred = logistic_clf.predict(X_test)
	print(accuracy_score(y_test, y_pred))

	## OUTPUT
	## 0.8254545454545454
	from sklearn.ensemble import RandomForestClassifier
	# accuracy_score is called below but was never imported in this file.
	from sklearn.metrics import accuracy_score

	# Random forest limited to depth 9; random_state is fixed for reproducibility.
	clf = RandomForestClassifier(max_depth=9, random_state=0)
	clf.fit(X_train, y_train)

	# Predict the held-out split and print the accuracy against the true labels.
	y_pred = clf.predict(X_test)
	print(accuracy_score(y_test, y_pred))

	## OUTPUT
	## 0.7471717171717172