{
"cells": [
{
"cell_type": "markdown",
"id": "2ecd8fe7",
"metadata": {},
"source": [
"- Run Zookeeper\n",
" - .\\bin\\windows\\zookeeper-server-start.bat .\\config\\zookeeper.properties\n",
"\n",
"- Start Kafka Server\n",
" - .\\bin\\windows\\kafka-server-start.bat .\\config\\server.properties\n",
"\n",
"- Create Topics \n",
" - .\\bin\\windows\\kafka-topics.bat --create --topic test --bootstrap-server localhost:9092\n",
"\n",
"- Consume Topics \n",
" - .\\bin\\windows\\kafka-console-consumer.bat --topic test --from-beginning --bootstrap-server localhost:9092\n"
]
},
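{
"cell_type": "markdown",
"id": "a0b1c2d3",
"metadata": {},
"source": [
"A consumer-side sketch for checking the messages this notebook produces. This is an assumption-based example (kafka-python client, broker on localhost:9092, the `test` topic created above), not part of the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0b1c2d4",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: a minimal JSON consumer for the 'test' topic (assumes kafka-python).\n",
"from kafka import KafkaConsumer\n",
"from json import loads\n",
"\n",
"consumer = KafkaConsumer(\n",
"    'test',\n",
"    bootstrap_servers=['localhost:9092'],\n",
"    auto_offset_reset='earliest',\n",
"    value_deserializer=lambda m: loads(m.decode('utf-8'))\n",
")\n",
"\n",
"# Uncomment to print incoming messages (blocks until messages arrive):\n",
"# for msg in consumer:\n",
"#     print(msg.value)"
]
},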
{
"cell_type": "code",
"execution_count": 2,
"id": "b1648c95-7755-4fae-928f-f22188fcfd1d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from kafka import KafkaConsumer, KafkaProducer\n",
"from time import sleep\n",
"from json import dumps\n",
"import json\n",
"from kafka import KafkaProducer\n",
"from json import dumps\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "03e92df1",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.preprocessing import LabelBinarizer\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer\n",
"from wordcloud import WordCloud,STOPWORDS\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.tokenize import word_tokenize,sent_tokenize\n",
"from bs4 import BeautifulSoup\n",
"import spacy\n",
"import re,string,unicodedata\n",
"from nltk.tokenize.toktok import ToktokTokenizer\n",
"from nltk.stem import LancasterStemmer,WordNetLemmatizer\n",
"from sklearn.linear_model import LogisticRegression,SGDClassifier\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.svm import SVC\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"\n",
"from textblob import TextBlob\n",
"from textblob import Word\n",
"from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19c6f52b-ffd1-4d5a-8165-5f59504fd1b6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"producer = KafkaProducer(bootstrap_servers=['localhost:9092'],\n",
" value_serializer=lambda x: dumps(x).encode('utf-8'))\n"
]
},
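{
"cell_type": "markdown",
"id": "b2c3d4e5",
"metadata": {},
"source": [
"Optional smoke test (a sketch, not in the original run): send one message and flush so it is delivered before moving on. `producer.send()` returns a future; `producer.flush()` blocks until queued messages are sent. The topic name `test` is assumed from the setup commands above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2c3d4e6",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the producer can reach the broker (topic name 'test' assumed).\n",
"# future = producer.send('test', value={'status': 'producer connected'})\n",
"# producer.flush()"
]
},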
{
"cell_type": "code",
"execution_count": 5,
"id": "85451085-2db9-4117-8107-e13c37733172",
"metadata": {},
"outputs": [],
"source": [
"#producer.send('demo_test', value=\"{'name': 'Consumer is was working well'}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3b65f243",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. &lt;br /&gt;&lt;br /&gt;The...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. <br /><br />The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#df = pd.read_csv(\"indexProcessed.csv\")\n",
"IMDB_df = pd.read_csv('https://raw.githubusercontent.com/manojjha/BITS_Assignments/main/NLP/IMDB%20Dataset.csv?token=GHSAT0AAAAAACJXQMXDNSKYL6LUDAUK5SISZMICSXQ')\n",
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f8c73c93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'while True:\\n dict_stock = df.sample(1).to_dict(orient=\"records\")[0]\\n producer.send(\\'test\\', value=dict_stock)\\n '"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''while True:\n",
" dict_stock = df.sample(1).to_dict(orient=\"records\")[0]\n",
" producer.send('test', value=dict_stock)\n",
" '''"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "41164a98",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 50000 entries, 0 to 49999\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review 50000 non-null object\n",
" 1 sentiment 50000 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 781.4+ KB\n"
]
}
],
"source": [
"IMDB_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ea54b5dc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\manoj\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"nltk.download('stopwords')\n",
"tokenizer = ToktokTokenizer()\n",
"\n",
"stopword_list = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "29a5e47c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manoj\\AppData\\Local\\Temp\\ipykernel_27052\\3752247428.py:4: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
" soup = BeautifulSoup(text, 'html.parser')\n"
]
}
],
"source": [
"#Removing noise text\n",
"\n",
"def strip_html(text):\n",
" soup = BeautifulSoup(text, 'html.parser')\n",
" return soup.get_text()\n",
"\n",
"def remove_between_square_brackets(text):\n",
" return re.sub('\\[[^]]*\\]', '', text)\n",
"\n",
"def denoise_text(text):\n",
" text = strip_html(text)\n",
" text = remove_between_square_brackets(text)\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(denoise_text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "58def824",
"metadata": {},
"outputs": [],
"source": [
"#remove special characters\n",
"\n",
"def remove_special_characters(text, remove_digits=True):\n",
" pattern = r'[^a-zA-Z0-9\\s]'\n",
" text = re.sub(pattern,'', text)\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(remove_special_characters)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "01df39fb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'such', 'do', 'ain', 'won', 'hasn', 'her', 'those', 'should', 'it', 'own', 're', \"doesn't\", 'through', \"it's\", 'i', \"couldn't\", 'they', 'why', 'again', 'what', 'not', 'each', 'in', 'just', 'didn', 'our', 'or', 'from', 't', 'of', 'their', 'only', 'most', 'are', 'an', 'weren', \"didn't\", 'hers', 'the', \"wasn't\", 'no', 'to', 'is', 'was', 'm', 'about', 'this', 'did', \"you'll\", \"you'd\", 'whom', 'during', 'have', 'myself', 'needn', 'been', 'that', 've', \"won't\", \"should've\", 'now', 'while', 'how', 'out', \"hadn't\", 'and', 'being', 'with', 'when', \"you've\", 'shouldn', 'more', 'does', 'itself', 'ours', 'were', 'aren', 'for', 'until', \"needn't\", 'some', 'him', 'theirs', 'few', 'above', 'am', 'mustn', 'my', 'had', \"aren't\", 'then', \"mustn't\", 'up', \"shouldn't\", \"don't\", \"wouldn't\", 'but', 'he', 'which', 'its', 'because', 'very', 'isn', 'once', 'his', 'yourselves', 'into', 'over', \"weren't\", 's', 'at', 'if', 'she', 'after', 'there', 'off', 'further', 'by', 'mightn', 'me', 'as', 'o', \"that'll\", \"isn't\", 'doing', \"mightn't\", 'wouldn', 'between', 'be', 'any', \"hasn't\", \"you're\", 'ourselves', 'against', 'too', 'so', 'y', 'under', \"haven't\", 'can', 'hadn', 'will', 'nor', 'your', 'than', 'who', 'down', 'before', 'same', 'you', 'them', \"she's\", 'd', 'we', 'both', 'on', 'll', 'shan', \"shan't\", 'below', 'themselves', 'other', 'doesn', 'here', 'yours', 'yourself', 'ma', 'where', 'wasn', 'has', 'herself', 'couldn', 'a', 'don', 'all', 'having', 'himself', 'these', 'haven'}\n"
]
}
],
"source": [
"# Removing stopwords\n",
"\n",
"stop = set(stopwords.words('english'))\n",
"print(stop)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "17c8fbcd",
"metadata": {},
"outputs": [],
"source": [
"def remove_stopwords(text, is_lower_case = False):\n",
" tokens = tokenizer.tokenize(text)\n",
" tokens = [token.strip() for token in tokens]\n",
" if is_lower_case:\n",
" filtered_tokens = [token for token in tokens if token not in stopword_list ]\n",
"\n",
" else:\n",
" filtered_tokens = [token for token in tokens if token.lower() not in stopword_list ]\n",
"\n",
" filtered_text = ' '.join(filtered_tokens)\n",
" return filtered_text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(remove_stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ba01b347",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One reviewers mentioned watching 1 Oz episode ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production filming technique ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically theres family little boy Jake thinks...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Matteis Love Time Money visually stunni...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One reviewers mentioned watching 1 Oz episode ... positive\n",
"1 wonderful little production filming technique ... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 Basically theres family little boy Jake thinks... negative\n",
"4 Petter Matteis Love Time Money visually stunni... positive"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "9352a2c2",
"metadata": {},
"outputs": [],
"source": [
"# text stemming\n",
"\n",
"def simple_stemmer(text):\n",
" ps = nltk.porter.PorterStemmer()\n",
" text = ' '.join([ps.stem(word) for word in text.split()])\n",
" return text\n",
"\n",
"IMDB_df['review'] = IMDB_df['review'].apply(simple_stemmer)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b7f3594a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" <th>cleaned_review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one review mention watch 1 oz episod youll hoo...</td>\n",
" <td>positive</td>\n",
" <td>one review mention watch 1 oz episod youll hoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonder littl product film techniqu unassum old...</td>\n",
" <td>positive</td>\n",
" <td>wonder littl product film techniqu unassum old...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonder way spend time hot summer weeke...</td>\n",
" <td>positive</td>\n",
" <td>thought wonder way spend time hot summer weeke...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basic there famili littl boy jake think there ...</td>\n",
" <td>negative</td>\n",
" <td>basic there famili littl boy jake think there ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei love time money visual stun film...</td>\n",
" <td>positive</td>\n",
" <td>petter mattei love time money visual stun film...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment \\\n",
"0 one review mention watch 1 oz episod youll hoo... positive \n",
"1 wonder littl product film techniqu unassum old... positive \n",
"2 thought wonder way spend time hot summer weeke... positive \n",
"3 basic there famili littl boy jake think there ... negative \n",
"4 petter mattei love time money visual stun film... positive \n",
"\n",
" cleaned_review \n",
"0 one review mention watch 1 oz episod youll hoo... \n",
"1 wonder littl product film techniqu unassum old... \n",
"2 thought wonder way spend time hot summer weeke... \n",
"3 basic there famili littl boy jake think there ... \n",
"4 petter mattei love time money visual stun film... "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_df['cleaned_review'] = IMDB_df['review']\n",
"IMDB_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "96e291d6",
"metadata": {},
"outputs": [],
"source": [
"# Create X and y objects\n",
"X = IMDB_df['cleaned_review']\n",
"y = IMDB_df['sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "45a91519",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 ... 0 0 0]\n"
]
}
],
"source": [
"label_encoder = LabelEncoder()\n",
"\n",
"y_encoded = label_encoder.fit_transform(y)\n",
"\n",
"print(y_encoded)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "70945425",
"metadata": {},
"outputs": [],
"source": [
"tfidf_vectorizer = TfidfVectorizer(max_features=5000)\n",
"\n",
"X_tfidf = tfidf_vectorizer.fit_transform(X)\n",
"\n",
"X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "5281247e",
"metadata": {},
"source": [
"#### Train Support Vector machine model"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "8cf43a20",
"metadata": {},
"outputs": [],
"source": [
"# Initialize and train the SVM model\n",
"svm_tfidf = SVC(kernel='linear', random_state=42)\n",
"svm_tfidf.fit(X_train_tfidf, y_train_tfidf)\n",
"\n",
"y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "32866b2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix (TFIDF):\n",
"[[4312 649]\n",
" [ 509 4530]]\n"
]
}
],
"source": [
"conf_matrix_tfidf = confusion_matrix(y_test_tfidf, y_pred_tfidf)\n",
"print(\"Confusion Matrix (TFIDF):\")\n",
"print(conf_matrix_tfidf)\n",
"\n",
"#accuracy = svm_tfidf.score(X_train_tfidf, y_train_tfidf)\n",
"#print(\"Training accuracy:\", accuracy * 100, \"%\")\n"
]
},
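{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"A possible follow-up (sketch, assuming `y_test_tfidf`, `y_pred_tfidf` and `label_encoder` from the cells above): overall accuracy and a per-class report on the held-out test split."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d4e5f7",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: test-set metrics for the TF-IDF + SVM model.\n",
"print(\"Test accuracy:\", accuracy_score(y_test_tfidf, y_pred_tfidf))\n",
"print(classification_report(y_test_tfidf, y_pred_tfidf, target_names=label_encoder.classes_))"
]
},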
{
"cell_type": "code",
"execution_count": 24,
"id": "f1f74fc6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method Series.to_dict of 0 one review mention watch 1 oz episod youll hoo...\n",
"1 wonder littl product film techniqu unassum old...\n",
"2 thought wonder way spend time hot summer weeke...\n",
"3 basic there famili littl boy jake think there ...\n",
"4 petter mattei love time money visual stun film...\n",
" ... \n",
"49995 thought movi right good job wasnt creativ orig...\n",
"49996 bad plot bad dialogu bad act idiot direct anno...\n",
"49997 cathol taught parochi elementari school nun ta...\n",
"49998 im go disagre previou comment side maltin one ...\n",
"49999 one expect star trek movi high art fan expect ...\n",
"Name: review, Length: 50000, dtype: object>\n"
]
}
],
"source": [
"print(IMDB_df['review'].to_dict)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "333d564c",
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\manoj\\Documents\\Python\\DataEng\\DownloadPro\\Real-time-stock-analysis\\KafkaProducer-SVM_Producer1.ipynb Cell 24\u001b[0m line \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m message \u001b[39m=\u001b[39m {\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mreview\u001b[39m\u001b[39m'\u001b[39m: review_text,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mpredicted_sentiment\u001b[39m\u001b[39m'\u001b[39m: predicted_sentiment\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m }\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=17'>18</a>\u001b[0m producer\u001b[39m.\u001b[39msend(\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m, value\u001b[39m=\u001b[39mmessage)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/manoj/Documents/Python/DataEng/DownloadPro/Real-time-stock-analysis/KafkaProducer-SVM_Producer1.ipynb#X34sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m time\u001b[39m.\u001b[39msleep(\u001b[39m5\u001b[39m)\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import time\n",
"\n",
"for index, row in IMDB_df.iterrows():\n",
" review_text = row['review']\n",
" cleaned_text = simple_stemmer(review_text)\n",
" \n",
" new_text_tfidf = tfidf_vectorizer.transform([cleaned_text])\n",
" \n",
" prediction = svm_tfidf.predict(new_text_tfidf)\n",
" \n",
" predicted_sentiment = label_encoder.inverse_transform(prediction)[0]\n",
" \n",
" message = {\n",
" 'review': review_text,\n",
" 'predicted_sentiment': predicted_sentiment\n",
" }\n",
"\n",
" producer.send('test', value=message)\n",
" time.sleep(5)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd3b35eb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}