{
"cells": [
{
"cell_type": "markdown",
"id": "2ecd8fe7",
"metadata": {},
"source": [
"- Run Zookeeper\n",
" - .\\bin\\windows\\zookeeper-server-start.bat .\\config\\zookeeper.properties\n",
"\n",
"- Start Kafka Server\n",
" - .\\bin\\windows\\kafka-server-start.bat .\\config\\server.properties\n",
"\n",
"- Create Topics \n",
" - .\\bin\\windows\\kafka-topics.bat --create --topic test --bootstrap-server localhost:9092\n",
"\n",
"- Consume Topics \n",
" - .\\bin\\windows\\kafka-console-consumer.bat --topic test --from-beginning --bootstrap-server localhost:9092\n"
]
},
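{
"cell_type": "markdown",
"id": "a0b1c2d3",
"metadata": {},
"source": [
"A consumer-side sketch for checking the messages this notebook produces. This is an assumption-based example (kafka-python client, broker on localhost:9092, the `test` topic created above), not part of the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0b1c2d4",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: a minimal JSON consumer for the 'test' topic (assumes kafka-python).\n",
"from kafka import KafkaConsumer\n",
"from json import loads\n",
"\n",
"consumer = KafkaConsumer(\n",
"    'test',\n",
"    bootstrap_servers=['localhost:9092'],\n",
"    auto_offset_reset='earliest',\n",
"    value_deserializer=lambda m: loads(m.decode('utf-8'))\n",
")\n",
"\n",
"# Uncomment to print incoming messages (blocks until messages arrive):\n",
"# for msg in consumer:\n",
"#     print(msg.value)"
]
},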
{
"cell_type": "code",
"execution_count": 2,
"id": "b1648c95-7755-4fae-928f-f22188fcfd1d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from kafka import KafkaConsumer, KafkaProducer\n",
"from time import sleep\n",
"from json import dumps\n",
"import json\n",
"from kafka import KafkaProducer\n",
"from json import dumps\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "03e92df1",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.preprocessing import LabelBinarizer\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer\n",
"from wordcloud import WordCloud,STOPWORDS\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.tokenize import word_tokenize,sent_tokenize\n",
"from bs4 import BeautifulSoup\n",
"import spacy\n",
"import re,string,unicodedata\n",
"from nltk.tokenize.toktok import ToktokTokenizer\n",
"from nltk.stem import LancasterStemmer,WordNetLemmatizer\n",
"from sklearn.linear_model import LogisticRegression,SGDClassifier\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.svm import SVC\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"\n",
"from textblob import TextBlob\n",
"from textblob import Word\n",
"from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19c6f52b-ffd1-4d5a-8165-5f59504fd1b6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"producer = KafkaProducer(bootstrap_servers=['localhost:9092'],\n",
" value_serializer=lambda x: dumps(x).encode('utf-8'))\n"
]
},
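{
"cell_type": "markdown",
"id": "b2c3d4e5",
"metadata": {},
"source": [
"Optional smoke test (a sketch, not in the original run): send one message and flush so it is delivered before moving on. `producer.send()` returns a future; `producer.flush()` blocks until queued messages are sent. The topic name `test` is assumed from the setup commands above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2c3d4e6",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the producer can reach the broker (topic name 'test' assumed).\n",
"# future = producer.send('test', value={'status': 'producer connected'})\n",
"# producer.flush()"
]
},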
{
"cell_type": "code",
"execution_count": 5,
"id": "85451085-2db9-4117-8107-e13c37733172",
"metadata": {},
"outputs": [],
"source": [
"#producer.send('demo_test', value=\"{'name': 'Consumer is was working well'}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3b65f243",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. &lt;br /&gt;&lt;br /&gt;The...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. <br /><br />The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#df = pd.read_csv(\"indexProcessed.csv\")\n",
"IMDB_df = pd.read_csv('https://raw.githubusercontent.com/manojjha/BITS_Assignments/main/NLP/IMDB%20Dataset.csv?token=GHSAT0AAAAAACJXQMXDNSKYL6LUDAUK5SISZMICSXQ')\n",
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f8c73c93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'while True:\\n dict_stock = df.sample(1).to_dict(orient=\"records\")[0]\\n producer.send(\\'test\\', value=dict_stock)\\n '"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''while True:\n",
" dict_stock = df.sample(1).to_dict(orient=\"records\")[0]\n",
" producer.send('test', value=dict_stock)\n",
" '''"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "41164a98",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 50000 entries, 0 to 49999\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review 50000 non-null object\n",
" 1 sentiment 50000 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 781.4+ KB\n"
]
}
],
"source": [
"IMDB_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ea54b5dc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\manoj\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"nltk.download('stopwords')\n",
"tokenizer = ToktokTokenizer()\n",
"\n",
"stopword_list = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "29a5e47c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manoj\\AppData\\Local\\Temp\\ipykernel_27052\\3752247428.py:4: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
" soup = BeautifulSoup(text, 'html.parser')\n"
]
}
],
"source": [
"#Removing noise text\n",
"\n",
"def strip_html(text):\n",
" soup = BeautifulSoup(text, 'html.parser')\n",
" return soup.get_text()\n",
"\n",
"def remove_between_square_brackets(text):\n",
" return re.sub('\\[[^]]*\\]', '', text)\n",
"\n",
"def denoise_text(text):\n",
" text = strip_html(text)\n",
" text = remove_between_square_brackets(text)\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(denoise_text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "58def824",
"metadata": {},
"outputs": [],
"source": [
"#remove special characters\n",
"\n",
"def remove_special_characters(text, remove_digits=True):\n",
" pattern = r'[^a-zA-Z0-9\\s]'\n",
" text = re.sub(pattern,'', text)\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(remove_special_characters)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "01df39fb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'such', 'do', 'ain', 'won', 'hasn', 'her', 'those', 'should', 'it', 'own', 're', \"doesn't\", 'through', \"it's\", 'i', \"couldn't\", 'they', 'why', 'again', 'what', 'not', 'each', 'in', 'just', 'didn', 'our', 'or', 'from', 't', 'of', 'their', 'only', 'most', 'are', 'an', 'weren', \"didn't\", 'hers', 'the', \"wasn't\", 'no', 'to', 'is', 'was', 'm', 'about', 'this', 'did', \"you'll\", \"you'd\", 'whom', 'during', 'have', 'myself', 'needn', 'been', 'that', 've', \"won't\", \"should've\", 'now', 'while', 'how', 'out', \"hadn't\", 'and', 'being', 'with', 'when', \"you've\", 'shouldn', 'more', 'does', 'itself', 'ours', 'were', 'aren', 'for', 'until', \"needn't\", 'some', 'him', 'theirs', 'few', 'above', 'am', 'mustn', 'my', 'had', \"aren't\", 'then', \"mustn't\", 'up', \"shouldn't\", \"don't\", \"wouldn't\", 'but', 'he', 'which', 'its', 'because', 'very', 'isn', 'once', 'his', 'yourselves', 'into', 'over', \"weren't\", 's', 'at', 'if', 'she', 'after', 'there', 'off', 'further', 'by', 'mightn', 'me', 'as', 'o', \"that'll\", \"isn't\", 'doing', \"mightn't\", 'wouldn', 'between', 'be', 'any', \"hasn't\", \"you're\", 'ourselves', 'against', 'too', 'so', 'y', 'under', \"haven't\", 'can', 'hadn', 'will', 'nor', 'your', 'than', 'who', 'down', 'before', 'same', 'you', 'them', \"she's\", 'd', 'we', 'both', 'on', 'll', 'shan', \"shan't\", 'below', 'themselves', 'other', 'doesn', 'here', 'yours', 'yourself', 'ma', 'where', 'wasn', 'has', 'herself', 'couldn', 'a', 'don', 'all', 'having', 'himself', 'these', 'haven'}\n"
]
}
],
"source": [
"# Removing stopwords\n",
"\n",
"stop = set(stopwords.words('english'))\n",
"print(stop)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "17c8fbcd",
"metadata": {},
"outputs": [],
"source": [
"def remove_stopwords(text, is_lower_case = False):\n",
" tokens = tokenizer.tokenize(text)\n",
" tokens = [token.strip() for token in tokens]\n",
" if is_lower_case:\n",
" filtered_tokens = [token for token in tokens if token not in stopword_list ]\n",
"\n",
" else:\n",
" filtered_tokens = [token for token in tokens if token.lower() not in stopword_list ]\n",
"\n",
" filtered_text = ' '.join(filtered_tokens)\n",
" return filtered_text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(remove_stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ba01b347",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One reviewers mentioned watching 1 Oz episode ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production filming technique ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically theres family little boy Jake thinks...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Matteis Love Time Money visually stunni...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One reviewers mentioned watching 1 Oz episode ... positive\n",
"1 wonderful little production filming technique ... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 Basically theres family little boy Jake thinks... negative\n",
"4 Petter Matteis Love Time Money visually stunni... positive"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "9352a2c2",
"metadata": {},
"outputs": [],
"source": [
"# text stemming\n",
"\n",
"def simple_stemmer(text):\n",
" ps = nltk.porter.PorterStemmer()\n",
" text = ' '.join([ps.stem(word) for word in text.split()])\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(simple_stemmer)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b7f3594a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" <th>cleaned_review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one review mention watch 1 oz episod youll hoo...</td>\n",
" <td>positive</td>\n",
" <td>one review mention watch 1 oz episod youll hoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonder littl product film techniqu unassum old...</td>\n",
" <td>positive</td>\n",
" <td>wonder littl product film techniqu unassum old...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonder way spend time hot summer weeke...</td>\n",
" <td>positive</td>\n",
" <td>thought wonder way spend time hot summer weeke...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basic there famili littl boy jake think there ...</td>\n",
" <td>negative</td>\n",
" <td>basic there famili littl boy jake think there ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei love time money visual stun film...</td>\n",
" <td>positive</td>\n",
" <td>petter mattei love time money visual stun film...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment \\\n",
"0 one review mention watch 1 oz episod youll hoo... positive \n",
"1 wonder littl product film techniqu unassum old... positive \n",
"2 thought wonder way spend time hot summer weeke... positive \n",
"3 basic there famili littl boy jake think there ... negative \n",
"4 petter mattei love time money visual stun film... positive \n",
"\n",
" cleaned_review \n",
"0 one review mention watch 1 oz episod youll hoo... \n",
"1 wonder littl product film techniqu unassum old... \n",
"2 thought wonder way spend time hot summer weeke... \n",
"3 basic there famili littl boy jake think there ... \n",
"4 petter mattei love time money visual stun film... "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_df['cleaned_review'] = IMDB_df['review']\n",
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "96e291d6",
"metadata": {},
"outputs": [],
"source": [
"# Create X and y objects\n",
"X = IMDB_df['cleaned_review']\n",
"y = IMDB_df['sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "45a91519",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 ... 0 0 0]\n"
]
}
],
"source": [
"label_encoder = LabelEncoder()\n",
"\n",
"y_encoded = label_encoder.fit_transform(y)\n",
"\n",
"print(y_encoded)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "70945425",
"metadata": {},
"outputs": [],
"source": [
"tfidf_vectorizer = TfidfVectorizer(max_features=5000)\n",
"\n",
"X_tfidf = tfidf_vectorizer.fit_transform(X)\n",
"\n",
"X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "5281247e",
"metadata": {},
"source": [
"#### Train Support Vector machine model"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "8cf43a20",
"metadata": {},
"outputs": [],
"source": [
"# Initialize and train the SVM model\n",
"svm_tfidf = SVC(kernel='linear', random_state=42)\n",
"svm_tfidf.fit(X_train_tfidf, y_train_tfidf)\n",
"\n",
"y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "32866b2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix (TFIDF):\n",
"[[4312 649]\n",
" [ 509 4530]]\n"
]
}
],
"source": [
"conf_matrix_tfidf = confusion_matrix(y_test_tfidf, y_pred_tfidf)\n",
"print(\"Confusion Matrix (TFIDF):\")\n",
"print(conf_matrix_tfidf)\n",
"\n",
"#accuracy = svm_tfidf.score(X_train_tfidf, y_train_tfidf)\n",
"#print(\"Training accuracy:\", accuracy * 100, \"%\")\n"
]
},
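{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"A possible follow-up (sketch, assuming `y_test_tfidf`, `y_pred_tfidf` and `label_encoder` from the cells above): overall accuracy and a per-class report on the held-out test split."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d4e5f7",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: test-set metrics for the TF-IDF + SVM model.\n",
"print(\"Test accuracy:\", accuracy_score(y_test_tfidf, y_pred_tfidf))\n",
"print(classification_report(y_test_tfidf, y_pred_tfidf, target_names=label_encoder.classes_))"
]
},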
{
"cell_type": "code",
"execution_count": 24,
"id": "f1f74fc6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method Series.to_dict of 0 one review mention watch 1 oz episod youll hoo...\n",
"1 wonder littl product film techniqu unassum old...\n",
"2 thought wonder way spend time hot summer weeke...\n",
"3 basic there famili littl boy jake think there ...\n",
"4 petter mattei love time money visual stun film...\n",
" ... \n",
"49995 thought movi right good job wasnt creativ orig...\n",
"49996 bad plot bad dialogu bad act idiot direct anno...\n",
"49997 cathol taught parochi elementari school nun ta...\n",
"49998 im go disagre previou comment side maltin one ...\n",
"49999 one expect star trek movi high art fan expect ...\n",
"Name: review, Length: 50000, dtype: object>\n"
]
}
],
"source": [
"print(IMDB_df['review'].to_dict)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "333d564c",
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\manoj\\Documents\\Python\\DataEng\\DownloadPro\\Real-time-stock-analysis\\KafkaProducer-SVM_Producer1.ipynb Cell 24\u001b[0m line \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m message \u001b[39m=\u001b[39m {\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mreview\u001b[39m\u001b[39m'\u001b[39m: review_text,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mpredicted_sentiment\u001b[39m\u001b[39m'\u001b[39m: predicted_sentiment\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m }\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=17'>18</a>\u001b[0m producer\u001b[39m.\u001b[39msend(\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m, value\u001b[39m=\u001b[39mmessage)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m time\u001b[39m.\u001b[39msleep(\u001b[39m5\u001b[39m)\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import time\n",
"\n",
"for index, row in IMDB_df.iterrows():\n",
" review_text = row['review']\n",
" cleaned_text = simple_stemmer(review_text)\n",
" \n",
" new_text_tfidf = tfidf_vectorizer.transform([cleaned_text])\n",
" \n",
" prediction = svm_tfidf.predict(new_text_tfidf)\n",
" \n",
" predicted_sentiment = label_encoder.inverse_transform(prediction)[0]\n",
" \n",
" message = {\n",
" 'review': review_text,\n",
" 'predicted_sentiment': predicted_sentiment\n",
" }\n",
"\n",
" producer.send('test', value=message)\n",
" time.sleep(5)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd3b35eb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}