Skip to content

Instantly share code, notes, and snippets.

@pankajti
Created July 26, 2020 01:41
Show Gist options
  • Save pankajti/c75d96d1ef1c3cc405f532b769f986b5 to your computer and use it in GitHub Desktop.
Save pankajti/c75d96d1ef1c3cc405f532b769f986b5 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import numpy as np\n",
"import os\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"def split_input_target(chunk):\n",
" input_text = chunk[:-1]\n",
" target_text = chunk[1:]\n",
" return input_text, target_text\n",
"\n",
"def build_model(vocab_size, embedding_dim, rnn_units, batch_size):\n",
" model = tf.keras.Sequential([\n",
" tf.keras.layers.Embedding(vocab_size, embedding_dim,\n",
" batch_input_shape=[batch_size, None]),\n",
" tf.keras.layers.GRU(rnn_units,\n",
" return_sequences=True,\n",
" stateful=True,\n",
" recurrent_initializer='glorot_uniform'),\n",
" tf.keras.layers.Dense(vocab_size)\n",
" ])\n",
" return model\n",
"\n",
"def loss(labels, logits):\n",
" return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)\n",
"\n",
" \n",
"def generate_text(model, start_string):\n",
" # Evaluation step (generating text using the learned model)\n",
"\n",
" # Number of characters to generate\n",
" num_generate = 1000\n",
"\n",
" # Converting our start string to numbers (vectorizing)\n",
" input_eval = [char2idx[s] for s in start_string]\n",
" input_eval = tf.expand_dims(input_eval, 0)\n",
"\n",
" # Empty string to store our results\n",
" text_generated = []\n",
"\n",
" # Low temperatures results in more predictable text.\n",
" # Higher temperatures results in more surprising text.\n",
" # Experiment to find the best setting.\n",
" temperature = 1.0\n",
"\n",
" # Here batch size == 1\n",
" model.reset_states()\n",
" for i in range(num_generate):\n",
" predictions = model(input_eval)\n",
" # remove the batch dimension\n",
" predictions = tf.squeeze(predictions, 0)\n",
"\n",
" # using a categorical distribution to predict the character returned by the model\n",
" predictions = predictions / temperature\n",
" predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()\n",
"\n",
" # We pass the predicted character as the next input to the model\n",
" # along with the previous hidden state\n",
" input_eval = tf.expand_dims([predicted_id], 0)\n",
"\n",
" text_generated.append(idx2char[predicted_id])\n",
"\n",
" return (start_string + ''.join(text_generated))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of text: 1372948 characters\n"
]
}
],
"source": [
"with open(\"/Users/pankaj/dev/git/smu/nlp337/marathi/mrityunjay.txt\") as f:\n",
" lines = f.readlines()\n",
"\n",
"text = \"\".join([l for l in lines if l.strip()!=''])\n",
"\n",
"# length of text is the number of characters in it\n",
"print ('Length of text: {} characters'.format(len(text)))\n",
"vocab = sorted(set(text))\n",
"char2idx = {u:i for i, u in enumerate(vocab)}\n",
"idx2char = np.array(vocab)\n",
"text_as_int = np.array([char2idx[c] for c in text])\n",
"# The maximum length sentence we want for a single input in characters\n",
"seq_length = 30\n",
"examples_per_epoch = len(text)//(seq_length+1)\n",
"# Create training examples / targets\n",
"char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)\n",
"sequences = char_dataset.batch(seq_length+1, drop_remainder=True)\n",
"dataset = sequences.map(split_input_target)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"from nltk import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"words = word_tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"35239"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(set(words))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"c = Counter(words)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"import operator\n",
"x = c\n",
"sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('.', 11379),\n",
" ('!', 9384),\n",
" (',', 8636),\n",
" ('‘', 5324),\n",
" ('’', 3439),\n",
" ('मी', 3209),\n",
" ('?', 2645),\n",
" ('आिण', 2176),\n",
" ('आहे', 2129),\n",
" ('ते', 1989),\n",
" ('तो', 1915),\n",
" ('मला', 1893),\n",
" ('होता', 1680),\n",
" ('\\U00102861या', 1605),\n",
" ('नाही', 1583),\n",
" ('होतं', 1479),\n",
" ('काय', 1335),\n",
" ('मा\\U0010285eया', 1322),\n",
" ('पण', 1265),\n",
" ('एक', 1256),\n",
" ('होती', 1161),\n",
" ('या', 1139),\n",
" ('होते', 1093),\n",
" ('”', 1028),\n",
" ('ी', 1022),\n",
" ('हे', 1006),\n",
" ('तर', 941),\n",
" ('ती', 922),\n",
" ('–', 914),\n",
" ('की', 896),\n",
" ('आप\\U0010286bया', 863),\n",
" ('ा', 845),\n",
" (':', 841),\n",
" ('\\U00102861या\\U0010285cया', 828),\n",
" ('\\U00102861याला', 769),\n",
" ('\\U00102861यानं', 734),\n",
" ('आता', 729),\n",
" ('\\U00102869हणून', 644),\n",
" ('सव\\U00102712', 635),\n",
" ('पर्', 569),\n",
" ('माझं', 519),\n",
" ('असं', 517),\n",
" ('तरी', 501),\n",
" ('एका', 500),\n",
" ('ं', 484),\n",
" ('न\\U0010286dहतं', 484),\n",
" ('क\\U001028f6न', 481),\n",
" ('तू', 480),\n",
" ('आज', 464),\n",
" ('हात', 459),\n",
" ('घेऊन', 457),\n",
" ('*', 440),\n",
" ('का', 436),\n",
" ('कण\\U00102712', 428),\n",
" ('काही', 428),\n",
" ('लागले', 425),\n",
" ('ू', 423),\n",
" ('हा', 414),\n",
" ('ही', 404),\n",
" ('होतो', 404),\n",
" ('\\U00102712', 402),\n",
" ('तुला', 400),\n",
" ('\\U00102861यां\\U0010285cया', 387),\n",
" ('मन', 376),\n",
" ('झालं', 375),\n",
" ('आ\\U00102869ही', 366),\n",
" ('लागला', 358),\n",
" ('\\U00102869हणजे', 354),\n",
" ('कु', 349),\n",
" ('के', 339),\n",
" ('केवळ', 335),\n",
" ('झाला', 332),\n",
" ('\\U00102870वत', 330),\n",
" ('\\U00102861याचं', 324),\n",
" ('खाली', 319),\n",
" ('आहेत', 317),\n",
" ('करीत', 315),\n",
" ('न\\U0010286dहता', 314),\n",
" ('कधीच', 303),\n",
" ('आला', 299),\n",
" ('दर्', 298),\n",
" ('असतं', 291),\n",
" ('...', 291),\n",
" ('कारण', 290),\n",
" ('\\U00102861यांना', 290),\n",
" ('न', 283),\n",
" ('माझा', 282),\n",
" ('डोळे', 281),\n",
" ('माझी', 278),\n",
" ('पुढं', 272),\n",
" ('\\U00102861याचा', 267),\n",
" ('दोन', 266),\n",
" (';', 264),\n",
" ('लागली', 263),\n",
" ('उभा', 260),\n",
" ('पु\\U00102864हा', 256),\n",
" ('मात्र', 256),\n",
" ('\\U00102861यांनी', 255),\n",
" ('झाली', 253),\n",
" ('अनेक', 252),\n",
" ('असा', 252),\n",
" ('सवानं\\U00102712', 250),\n",
" ('असेल', 250),\n",
" ('पाहत', 246),\n",
" ('िकती', 243),\n",
" ('दु', 241),\n",
" ('होत', 238),\n",
" ('तु\\U0010285eया', 238),\n",
" ('परत', 236),\n",
" ('लागलं', 236),\n",
" ('दरू', 233),\n",
" ('कोण', 232),\n",
" ('क\\U001028f6', 231),\n",
" ('िन', 229),\n",
" ('गेला', 225),\n",
" ('केला', 225),\n",
" ('महाराज', 224),\n",
" ('\\U00102861याची', 217),\n",
" ('जीवन', 216),\n",
" ('बाण', 214),\n",
" ('बाहेर', 214),\n",
" ('कधी', 212),\n",
" ('वेळी', 211),\n",
" ('वर', 211),\n",
" ('आहेस', 211),\n",
" ('अशा', 210),\n",
" ('हो\\U00102861या', 210),\n",
" ('काहीच', 209),\n",
" ('मान', 209),\n",
" ('केलं', 208),\n",
" ('िवचारलं', 207),\n",
" ('होऊन', 205),\n",
" ('आलो', 204),\n",
" ('मग', 204),\n",
" ('दुया\\U00102707धन', 204),\n",
" ('अशी', 200),\n",
" ('आम\\U0010285cया', 200),\n",
" ('हाती', 198),\n",
" ('येत', 195),\n",
" ('\\U00102869हणाला', 194),\n",
" ('झाले', 193),\n",
" ('न\\U0010286dहती', 190),\n",
" ('िदवस', 188),\n",
" ('ित\\U0010285cया', 187),\n",
" ('हातातील', 187),\n",
" ('\\U00102869हणनू', 186),\n",
" ('पािहलं', 183),\n",
" ('वाटलं', 183),\n",
" ('अंगराज', 182),\n",
" ('ितला', 179),\n",
" ('वाटत', 176),\n",
" ('आपण', 174),\n",
" ('असे', 173),\n",
" ('राजा', 173),\n",
" ('तसं', 169),\n",
" ('केली', 166),\n",
" ('\\U00102869हणाले', 166),\n",
" ('िदसत', 165),\n",
" ('\\U00102861याचे', 164),\n",
" ('पाच', 163),\n",
" ('गेले', 162),\n",
" ('पाहून', 160),\n",
" ('माहीत', 159),\n",
" ('आलं', 159),\n",
" ('शोण', 158),\n",
" ('हातात', 158),\n",
" ('शेवटी', 158),\n",
" ('लागलो', 157),\n",
" ('आपली', 156),\n",
" ('एकच', 156),\n",
" ('नाहीत', 152),\n",
" ('सोडून', 152),\n",
" ('जण', 152),\n",
" ('सरळ', 151),\n",
" ('\\U0010285dया', 151),\n",
" ('िवचार', 150),\n",
" ('मागं', 149),\n",
" ('ला', 149),\n",
" ('कसं', 149),\n",
" ('ितनं', 148),\n",
" ('आली', 148),\n",
" ('“', 148),\n",
" ('गेली', 147),\n",
" ('\\U00102855णभर', 146),\n",
" ('एकदा', 145),\n",
" ('उभे', 145),\n",
" ('िदलं', 145),\n",
" ('असतो', 144),\n",
" ('अ\\U0010286eव\\U00102861थामा', 144),\n",
" ('तसा', 143),\n",
" ('\\U00102855णात', 143),\n",
" ('पांडव', 143),\n",
" ('आपलं', 140),\n",
" ('सांिगतलं', 140),\n",
" ('घेत', 139),\n",
" ('\\U00102869हणत', 139),\n",
" ('िनघून', 139),\n",
" ('िपतामह', 139),\n",
" ('असतात', 138),\n",
" ('असते', 137),\n",
" ('असं\\U00102859य', 137),\n",
" ('उंच', 137),\n",
" ('आ\\U00102869हाला', 137),\n",
" ('तरीही', 135),\n",
" ('शांत', 135),\n",
" ('त', 135),\n",
" ('े', 133),\n",
" ('पुत्र', 132),\n",
" ('गु\\U001028f6', 130),\n",
" ('रथ', 129),\n",
" ('युवराज', 129),\n",
" ('श\\U00102867द', 128),\n",
" ('काहीतरी', 128),\n",
" ('\\U00102869हणूनच', 127),\n",
" ('राजमाता', 127),\n",
" ('मनात', 126),\n",
" ('आवाज', 125),\n",
" ('कुठं', 125),\n",
" ('वृषाली', 124),\n",
" ('कुं', 124),\n",
" ('तयार', 124),\n",
" ('प्र\\U0010286eन', 123),\n",
" ('देत', 123),\n",
" ('जा', 123),\n",
" ('गेलं', 122),\n",
" ('श\\U00102858य', 122),\n",
" ('रा\\U0010285dय', 122),\n",
" ('अजु\\U00102712न', 122),\n",
" ('स\\U00102861य', 121),\n",
" ('इंदर्', 121),\n",
" ('पुतर्', 121),\n",
" ('करणार', 121),\n",
" ('अंगावर', 120),\n",
" ('उ\\U00102898र', 120),\n",
" ('घेतलं', 119),\n",
" ('इथं', 118),\n",
" ('ितथं', 118),\n",
" ('कर\\U00102860यासाठी', 118),\n",
" ('िदला', 118),\n",
" ('पािहजे', 117),\n",
" ('दो\\U00102864ही', 116),\n",
" ('मना\\U0010285cया', 115),\n",
" ('अिधक', 114),\n",
" ('तु\\U00102869ही', 114),\n",
" ('आपला', 113),\n",
" ('माझे', 113),\n",
" ('एकदम', 111),\n",
" ('होणार', 111),\n",
" ('क\\U0010286bपना', 110),\n",
" ('तसाच', 110),\n",
" ('कणा\\U00102712', 109),\n",
" ('सवांत\\U00102712', 109),\n",
" ('िदवशी', 109),\n",
" ('िवशाल', 108),\n",
" ('आले', 108),\n",
" ('जात', 107),\n",
" ('िदली', 106),\n",
" ('घालून', 106),\n",
" ('उभी', 105),\n",
" ('पाणी', 105),\n",
" ('येऊन', 105),\n",
" ('सयू', 105),\n",
" ('प्र\\U00102861येक', 105),\n",
" ('आहे.', 104),\n",
" ('आत', 104),\n",
" ('वीर', 104),\n",
" ('शरीर', 103),\n",
" ('पडला', 103),\n",
" ('चार', 102),\n",
" ('\\U00102870प\\U0010286fट', 102),\n",
" ('नये', 102),\n",
" ('ध\\U001028f6न', 102),\n",
" ('सूतपुत्र', 102),\n",
" ('तेच', 101),\n",
" ('गंगे\\U0010285cया', 101),\n",
" ('देऊन', 100),\n",
" ('सोनेरी', 99),\n",
" ('ानं', 98),\n",
" ('न\\U0010286dहते', 97),\n",
" ('कधीही', 97),\n",
" ('कुंडलं', 97),\n",
" ('असता', 97),\n",
" ('सग\\U0010286cया', 96),\n",
" ('तसे', 96),\n",
" ('सगळं', 96),\n",
" ('माग\\U00102712', 96),\n",
" ('\\U00102869हणालो', 96),\n",
" ('एकाच', 96),\n",
" ('येणार', 96),\n",
" ('जे', 96),\n",
" ('भीम', 96),\n",
" ('म\\U00102870तक', 96),\n",
" ('छे', 95),\n",
" ('सेनापती', 95),\n",
" ('नसतं', 94),\n",
" ('झालो', 94),\n",
" ('वंदन', 94),\n",
" ('शकत', 94),\n",
" ('ना', 94),\n",
" ('तुझा', 94),\n",
" ('नाव', 93),\n",
" ('आपले', 92),\n",
" ('दान', 92),\n",
" ('असले\\U0010286bया', 91),\n",
" ('वळ', 91),\n",
" ('रािहला', 91),\n",
" ('जाऊन', 90),\n",
" ('सारथी', 90),\n",
" ('न\\U0010286dहे', 90),\n",
" ('पाय', 90),\n",
" ('उभं', 89),\n",
" ('आ\\U00102856ा', 89),\n",
" ('शर्', 89),\n",
" ('वेळ', 88),\n",
" ('भ\\U0010286dय', 88),\n",
" ('धनु\\U0010286fय', 88),\n",
" ('िनधार\\U00102712', 86),\n",
" ('तुझी', 85),\n",
" ('घोडे', 85),\n",
" ('मा\\U0010285eयाकडे', 85),\n",
" ('च', 85),\n",
" ('होऊ', 85),\n",
" ('पाहताच', 85),\n",
" ('तसंच', 84),\n",
" ('उठून', 84),\n",
" ('उंचावून', 84),\n",
" ('कसा', 84),\n",
" ('कुणी', 83),\n",
" ('आठवण', 83),\n",
" ('िदशेनं', 83),\n",
" ('नेहमीच', 82),\n",
" ('हि\\U00102870तनापुरात', 82),\n",
" ('\\U0010285cया', 81),\n",
" ('एखा\\U001028a2ा', 81),\n",
" ('\\U00102861यांचा', 80),\n",
" ('वेग', 80),\n",
" ('असावं', 80),\n",
" ('शासन', 80),\n",
" ('\\U00102861यात', 79),\n",
" ('याची', 79),\n",
" ('नेहमी', 79),\n",
" ('कधी-कधी', 79),\n",
" ('ोण', 79),\n",
" ('घेतला', 79),\n",
" ('रािहले', 79),\n",
" ('कणाच\\U00102712', 79),\n",
" ('अितशय', 78),\n",
" ('सांगत', 78),\n",
" ('िजवंत', 77),\n",
" ('पाठीवर', 77),\n",
" ('समोर', 77),\n",
" ('असंच', 77),\n",
" ('ते\\U0010286dहा', 77),\n",
" ('िनमा\\U00102712ण', 77),\n",
" ('व', 77),\n",
" ('वृ\\U0010289d', 77),\n",
" ('जाऊ', 76),\n",
" ('दादा', 76),\n",
" ('काढून', 76),\n",
" ('हेच', 76),\n",
" ('मीही', 76),\n",
" ('कुणालाही', 76),\n",
" ('त\\U00102920ड', 76),\n",
" ('दुया\\U00102707धनानं', 76),\n",
" ('करणारा', 75),\n",
" ('खरोखरच', 74),\n",
" ('जवळ', 74),\n",
" ('भ\\U001028f6न', 74),\n",
" ('िकतीतरी', 74),\n",
" ('\\U00102861यासाठी', 74),\n",
" ('भ\\U00102858कम', 74),\n",
" ('दुवास\\U00102712', 74),\n",
" ('जाणार', 73),\n",
" ('वाटू', 73),\n",
" ('याचं', 73),\n",
" ('टाकून', 73),\n",
" ('िदि\\U0010285aवजयी', 73),\n",
" ('\\U00102861यांची', 72),\n",
" ('सांग', 72),\n",
" ('बाबा', 72),\n",
" ('नको', 72),\n",
" ('नकोस', 72),\n",
" ('वाता\\U00102712', 72),\n",
" ('म\\U00102870तकावर', 72),\n",
" ('काल', 72),\n",
" ('िद\\U0010286dय', 71),\n",
" ('पवू', 71),\n",
" ('धावत', 71),\n",
" ('सगळे', 71),\n",
" ('होईल', 71),\n",
" ('कणा\\U00102712\\U0010285cया', 71),\n",
" ('सहा', 70),\n",
" ('अर\\U00102860यात', 70),\n",
" ('गेलो', 70),\n",
" ('घेतली', 70),\n",
" ('माता', 70),\n",
" ('ठेवून', 70),\n",
" ('खड्ग', 70),\n",
" ('पाहताना', 70),\n",
" ('जग', 69),\n",
" ('दुस\\U00102876या', 69),\n",
" ('येऊ', 69),\n",
" ('लाग\\U0010286bया', 69),\n",
" ('पडली', 69),\n",
" ('नगरात', 69),\n",
" ('कु\\U001028f6ं\\U0010285cया', 69),\n",
" ('तुझं', 68),\n",
" ('उ\\U00102898रीय', 68),\n",
" ('झटकन', 68),\n",
" ('नाहीतर', 68),\n",
" ('राहून', 68),\n",
" ('तीन', 67),\n",
" ('महान', 67),\n",
" ('घटका', 67),\n",
" ('बसला', 67),\n",
" ('मुख', 67),\n",
" ('प्रचंड', 66),\n",
" ('येईल', 66),\n",
" ('आपणाला', 66),\n",
" ('िकतीही', 66),\n",
" ('हि\\U00102870तनापूर', 66),\n",
" ('खां\\U001028a2ावर', 66),\n",
" ('\\U0010285dये\\U0010286fठ', 66),\n",
" ('फारच', 65),\n",
" ('आकाशात', 65),\n",
" ('येताच', 65),\n",
" ('वेळा', 64),\n",
" ('आणखी', 64),\n",
" ('म\\U00102863येच', 64),\n",
" ('तशी', 64),\n",
" ('राजवाड\\U00102710ा\\U0010285cया', 64),\n",
" ('राजदंड', 64),\n",
" ('युिधि\\U0010286fठर', 64),\n",
" ('\\U00102861यांचं', 63),\n",
" ('अखंड', 63),\n",
" ('करणा\\U00102876या', 63),\n",
" ('फे', 63),\n",
" ('होय', 63),\n",
" ('ल\\U00102855', 63),\n",
" ('पाहू', 63),\n",
" ('इ\\U0010285cछा', 63),\n",
" ('व\\U00102870त्रं', 63),\n",
" ('सवाचं\\U00102712', 63),\n",
" ('कुणाला', 63),\n",
" ('अगदी', 62),\n",
" ('करतो', 62),\n",
" ('सांगता', 62),\n",
" ('वाटे', 62),\n",
" ('दृ\\U0010286fटी', 62),\n",
" ('असतील', 62),\n",
" ('सहज', 62),\n",
" ('चंदर्', 62),\n",
" ('लं', 62),\n",
" ('उज\\U0010286dया', 62),\n",
" ('कशाला', 62),\n",
" ('यु\\U0010289d', 62),\n",
" ('जीवनभर', 62),\n",
" ('साम\\U00102862य\\U00102712', 61),\n",
" ('पार', 61),\n",
" ('कुणीतरी', 61),\n",
" ('िवजय', 61),\n",
" ('भाग', 61),\n",
" ('आशीवाद\\U00102712', 61),\n",
" ('दुया\\U00102707धनाला', 61),\n",
" ('पांडवां\\U0010285cया', 61),\n",
" ('श्रीकृ\\U0010286fण', 60),\n",
" ('सैिनक', 60),\n",
" ('बोलत', 60),\n",
" ('गु\\U001028f5देव', 60),\n",
" ('गरगर', 60),\n",
" ('ल\\U00102855ात', 60),\n",
" ('गदा', 60),\n",
" ('ितचं', 60),\n",
" ('वाटतं', 59),\n",
" ('पा\\U00102860यात', 59),\n",
" ('हातानं', 59),\n",
" ('आहोत', 59),\n",
" ('तोच', 59),\n",
" ('एकही', 59),\n",
" ('अजु\\U00102712ना\\U0010285cया', 59),\n",
" ('भयानक', 59),\n",
" ('सै\\U00102864य', 59),\n",
" ('मलाही', 58),\n",
" ('झाले\\U0010286bया', 58),\n",
" ('लागणार', 58),\n",
" ('\\U0010286eवास', 58),\n",
" ('टाकलं', 58),\n",
" ('कणा\\U00102712ला', 58),\n",
" ('जगात', 58),\n",
" ('करता', 58),\n",
" ('स\\U00102861यसेन', 58),\n",
" ('दुया\\U00102707धना\\U0010285cया', 58),\n",
" ('मादर्', 58),\n",
" ('दे', 57),\n",
" ('कुठंतरी', 57),\n",
" ('मुळीच', 57),\n",
" ('आनंद', 57),\n",
" ('मा\\U0010285eयासमोर', 57),\n",
" ('क्\\U001028f6र', 57),\n",
" ('असताना', 56),\n",
" ('कशी', 56),\n",
" ('ऐकून', 56),\n",
" ('प\\U00102855ी', 56),\n",
" ('अचूक', 56),\n",
" ('चालू', 56),\n",
" ('पडलं', 56),\n",
" ('नाही.', 56),\n",
" ('नसेल', 56),\n",
" ('प्रय\\U00102861न', 55),\n",
" ('वाट', 55),\n",
" ('ितचा', 55),\n",
" ('ज\\U00102864म', 55),\n",
" ('असती', 55),\n",
" ('चल', 55),\n",
" ('आनंदानं', 55),\n",
" ('अमा\\U00102861य', 55),\n",
" ('िदसू', 55),\n",
" ('भी\\U0010286fम', 55),\n",
" ('मातर्', 55),\n",
" ('तु\\U00102869हाला', 55),\n",
" ('पूण\\U00102712', 55),\n",
" ('माला', 55),\n",
" ('दहा', 54),\n",
" ('डो\\U0010286cयांत', 54),\n",
" ('एवढा', 54),\n",
" ('\\U00102870त्री', 54),\n",
" ('असाच', 54),\n",
" ('कोण\\U00102861याही', 54),\n",
" ('इतर', 54),\n",
" ('तेज\\U00102870वी', 54),\n",
" ('घेऊ', 54),\n",
" ('िविचत्र', 54),\n",
" ('न\\U0010286dहतो', 54),\n",
" ('गोल', 54),\n",
" ('प\\U00102861नी', 54),\n",
" ('मनाचा', 54),\n",
" ('कृ', 54),\n",
" ('महाराणी', 54),\n",
" ('पाचं', 54),\n",
" ('पांडवांना', 54),\n",
" ('देऊ', 53),\n",
" ('एकटक', 53),\n",
" ('राजे', 53),\n",
" ('इतका', 53),\n",
" ('रात्री', 53),\n",
" ('बसले', 53),\n",
" ('सु\\U001028f5वात', 53),\n",
" ('लहान', 53),\n",
" ('नगरजन', 53),\n",
" ('चरणांवर', 53),\n",
" ('तुम\\U0010285cया', 53),\n",
" ('-', 53),\n",
" ('िविवध', 52),\n",
" ('ऐकू', 52),\n",
" ('सुंदर', 52),\n",
" ('धावू', 52),\n",
" ('हो', 52),\n",
" ('असावा', 52),\n",
" ('रा\\U0010285dयात', 52),\n",
" ('प्रेम', 52),\n",
" ('\\U00102861याच', 52),\n",
" ('शुभ्र', 52),\n",
" ('मा\\U0010285eयासाठी', 52),\n",
" ('बंधू', 52),\n",
" ('टाकला', 52),\n",
" ('जणू', 52),\n",
" ('आखाड\\U00102710ात', 52),\n",
" ('कुंती', 51),\n",
" ('कमी', 51),\n",
" ('नाहीस', 51),\n",
" ('वाकून', 51),\n",
" ('पडले', 51),\n",
" ('सा\\U00102855ात', 51),\n",
" ('उ\\U001028a2ा', 51),\n",
" ('असलेला', 51),\n",
" ('हसत', 51),\n",
" ('यो\\U0010289dा', 51),\n",
" ('असूनही', 51),\n",
" ('सभागृहात', 51),\n",
" ('नीट', 50),\n",
" ('जाणीव', 50),\n",
" ('पडत', 50),\n",
" ('दुसरा', 50),\n",
" ('यो\\U0010285aय', 50),\n",
" ('होताच', 50),\n",
" ('त\\U00102865त', 50),\n",
" ('अजूनही', 49),\n",
" ('मलाच', 49),\n",
" ('मोठ\\U00102710ानं', 49),\n",
" ('नसे', 49),\n",
" ('खरं', 49),\n",
" ('आ\\U0010286eचयान\\U00102712', 49),\n",
" ('िवदुर', 49),\n",
" ('\\U00102855ित्रय', 49),\n",
" ('अस\\U001028bc', 49),\n",
" ('कणान\\U00102712', 49),\n",
" ('दुयोध\\U00102712', 48),\n",
" ('िवस\\U001028f6न', 48),\n",
" ('कधीतरी', 48),\n",
" ('\\U00102869हणाली', 48),\n",
" ('तशीच', 48),\n",
" ('का\\U0010286cया', 48),\n",
" ('वषं', 48),\n",
" ('करताना', 48),\n",
" ('क\\U00102855ात', 48),\n",
" ('दगडी', 48),\n",
" ('अजु\\U00102712नानं', 48),\n",
" ('अजु\\U00102712नाला', 48),\n",
" ('जर', 48),\n",
" ('सगळीकडे', 48),\n",
" ('दतू', 48),\n",
" ('\\U001028f6ं', 48),\n",
" ('ीकृ', 48),\n",
" ('पिवत्र', 47),\n",
" ('लावून', 47),\n",
" ('दश\\U00102712न', 47),\n",
" ('जा\\U00102860यासाठी', 47),\n",
" ('फार', 47),\n",
" ('ली', 47),\n",
" ('दुसरं', 47),\n",
" ('प्र\\U00102861य\\U00102855', 47),\n",
" ('प्रसंग', 47),\n",
" ('श\\U0010286bय', 47),\n",
" ('यु\\U0010289dात', 47),\n",
" ('मु\\U00102858त', 46),\n",
" ('\\U00102869हणे', 46),\n",
" ('उगाच', 46),\n",
" ('लागत', 46),\n",
" ('फेकून', 46),\n",
" ('करावं', 46),\n",
" ('काळ', 46),\n",
" ('अभे\\U001028a2', 46),\n",
" ('कोण\\U00102861या', 46),\n",
" ('पडू', 46),\n",
" ('िनदर्', 46),\n",
" ('\\U00102861यावर', 46),\n",
" ('कु\\U001028f6ंचा', 46),\n",
" ('फु', 46),\n",
" ('\\U00102870वागत', 46),\n",
" ('वा', 45),\n",
" ('मनाला', 45),\n",
" ('अफाट', 45),\n",
" ('शोणाला', 45),\n",
" ('असणार', 45),\n",
" ('जागा', 45),\n",
" ('मीच', 45),\n",
" ('प्रथम', 45),\n",
" ('इत\\U00102858यात', 45),\n",
" ('वेध', 45),\n",
" ('असो', 45),\n",
" ('उ\\U00102868या', 45),\n",
" ('धवल', 45),\n",
" ('डा\\U0010286dया', 45),\n",
" ('ितची', 45),\n",
" ('माणूस', 45),\n",
" ('मोठा', 44),\n",
" ('असावी', 44),\n",
" ('समजत', 44),\n",
" ('िमटून', 44),\n",
" ('एखादा', 44),\n",
" ('घातली', 44),\n",
" ('बरोबर', 44),\n",
" ('राहणार', 44),\n",
" ('भेट', 44),\n",
" ('सेवक', 44),\n",
" ('यां\\U0010285cया', 44),\n",
" ('व\\U00102870त्र', 44),\n",
" ('िश\\U0010286fय', 44),\n",
" ('कोणता', 44),\n",
" ('ीच\\U00102712', 44),\n",
" ('रािहलं', 43),\n",
" ('पुन', 43),\n",
" ('झा\\U0010286bया', 43),\n",
" ('िव\\U0010286eवास', 43),\n",
" ('िन\\U0010286cया', 43),\n",
" ('होई', 43),\n",
" ('सु\\U001028f6', 43),\n",
" ('याचा', 43),\n",
" ('\\U00102861यामुळे', 43),\n",
" ('सवां\\U00102712\\U0010285cया', 43),\n",
" ('घेतले', 43),\n",
" ('एक-एक', 43),\n",
" ('यो\\U0010289dे', 43),\n",
" ('आमचा', 43),\n",
" ('पायदंड\\U00102710ा', 43),\n",
" ('बोल', 43),\n",
" ('\\U00102870मृती', 43),\n",
" ('रोज', 43),\n",
" ('उलट', 43),\n",
" ('कळलं', 43),\n",
" ('\\U00102870तर्', 43),\n",
" ('सव\\U00102712च', 43),\n",
" ('दासी', 43),\n",
" ('मह\\U00102709दर्', 43),\n",
" ('पु\\U0010286fट', 42),\n",
" ('जाताना', 42),\n",
" ('देणार', 42),\n",
" ('कवच', 42),\n",
" ('पराक्रमी', 42),\n",
" ('मुकुट', 42),\n",
" ('पसरली', 42),\n",
" ('झालेला', 42),\n",
" ('ठेवलं', 42),\n",
" ('ह\\U00102898ी', 42),\n",
" ('अ\\U00102864य', 42),\n",
" ('तेही', 41),\n",
" ('एवढं', 41),\n",
" ('अंत', 41),\n",
" ('\\U00102869हटलं', 41),\n",
" ('बसून', 41),\n",
" ('रथातून', 41),\n",
" ('\\U00102861यांचे', 41),\n",
" ('पराक्रम', 41),\n",
" ('चा', 41),\n",
" ('कुणालाच', 41),\n",
" ('अ\\U0010286eव\\U00102861था\\U00102869यानं', 41),\n",
" ('अिधकच', 41),\n",
" ('मिू', 41),\n",
" ('घे\\U00102860यासाठी', 40),\n",
" ('रथात', 40),\n",
" ('प्रवेश', 40),\n",
" ('करायला', 40),\n",
" ('सावरीत', 40),\n",
" ('कदािचत', 40),\n",
" ('उत\\U001028f6न', 40),\n",
" ('ोणानं', 40),\n",
" ('उठला', 40),\n",
" ('िनण\\U00102712य', 40),\n",
" ('वाटला', 40),\n",
" ('श्रे\\U0010286fठ', 40),\n",
" ('दात', 40),\n",
" ('नसतो', 40),\n",
" ('कुणीही', 40),\n",
" ('संधी', 40),\n",
" ('आहात', 40),\n",
" ('ठेवला', 40),\n",
" ('लागतं', 39),\n",
" ('हातांनी', 39),\n",
" ('जीवना\\U0010285cया', 39),\n",
" ('कानात', 39),\n",
" ('नगर', 39),\n",
" ('ये', 39),\n",
" ('पण\\U00102712कुटीत', 39),\n",
" ('अ\\U00102870व\\U00102870थ', 39),\n",
" ('पात्र', 39),\n",
" ('रािहलो', 39),\n",
" ('मुदर्', 39),\n",
" ('आसनावर', 39),\n",
" ('चौथ\\U00102876यावर', 39),\n",
" ('कर', 39),\n",
" ('राहत', 39),\n",
" ('कीती', 39),\n",
" ('अप\\U00102712ण', 39),\n",
" ('थाबं', 39),\n",
" ('मदर्', 39),\n",
" ('\\U00102855ुदर्', 39),\n",
" ('जयतु', 39),\n",
" ('िशिबरात', 39),\n",
" ('आठ', 38),\n",
" ('वारंवार', 38),\n",
" ('तलम', 38),\n",
" ('कान', 38),\n",
" ('जीवनाची', 38),\n",
" ('रे', 38),\n",
" ('समाधान', 38),\n",
" ('मारली', 38),\n",
" ('श\\U00102870त्र', 38),\n",
" ('यांनी', 38),\n",
" ('कु\\U001028f6', 38),\n",
" ('धनुध\\U00102712र', 38),\n",
" ('मनाची', 38),\n",
" ('अपमान', 38),\n",
" ('वचन', 38),\n",
" ('राजकुमारी', 38),\n",
" ('श्रीकृ\\U0010286fणानं', 38),\n",
" ('पांचाली', 38),\n",
" ('तशा', 37),\n",
" ('गदागदा', 37),\n",
" ('घटना', 37),\n",
" ('पावलं', 37),\n",
" ('मा\\U001028f6न', 37),\n",
" ('ऐटदार', 37),\n",
" ('बसलो', 37),\n",
" ('पात्रात', 37),\n",
" ('आ\\U00102869हा', 37),\n",
" ('मा\\U0010285eयावर', 37),\n",
" ('सदैव', 37),\n",
" ('काहीही', 37),\n",
" ('\\U00102870वरात', 37),\n",
" ('गंगा', 37),\n",
" ('एकाएकी', 37),\n",
" ('लाकडी', 37),\n",
" ('भगवान', 37),\n",
" ('शांतता', 37),\n",
" ('ाचं', 37),\n",
" ('जावं', 37),\n",
" ('घोर', 37),\n",
" ('उचलून', 37),\n",
" ('घायाळ', 37),\n",
" ('\\U00102855ण', 37),\n",
" ('जीवनात', 37),\n",
" ('धात्री', 37),\n",
" ('बोलू', 37),\n",
" ('द\\U001028f5्', 37),\n",
" ('सात', 36),\n",
" ('कोणी', 36),\n",
" ('जीव', 36),\n",
" ('करायचं', 36),\n",
" ('िमळेल', 36),\n",
" ('दाट', 36),\n",
" ('आमची', 36),\n",
" ('अरे', 36),\n",
" ('इतकं', 36),\n",
" ('दंड', 36),\n",
" ('जाई', 36),\n",
" ('पव\\U00102712त', 36),\n",
" ('कसे', 36),\n",
" ('िनरोप', 36),\n",
" ('नुसतं', 36),\n",
" ('घुसला', 36),\n",
" ('दृ\\U0010286fटीनं', 36),\n",
" ('गो\\U0010286fट', 36),\n",
" ('अ\\U00102870प\\U0010286fट', 36),\n",
" ('हळूहळू', 36),\n",
" ('एकटा', 36),\n",
" ('रािहली', 36),\n",
" ('तयारी', 36),\n",
" ('आणून', 36),\n",
" ('\\U00102855णा\\U00102855णाला', 36),\n",
" ('भुवया', 36),\n",
" ('धृतरा\\U0010286fट्र', 36),\n",
" ('\\U00102870पश\\U00102712', 36),\n",
" ('आ\\U0010286eचय\\U00102712', 36),\n",
" ('डे', 36),\n",
" ('िमळणार', 36),\n",
" ('करीन', 36),\n",
" ('अिजं\\U00102858य', 36),\n",
" ('केले\\U0010286bया', 36),\n",
" ('दे\\U00102860यासाठी', 36),\n",
" ('सहन', 36),\n",
" ('माणसं', 35),\n",
" ('जगाला', 35),\n",
" ('माणसाला', 35),\n",
" ('ितकडे', 35),\n",
" ('पातर्', 35),\n",
" ('\\U00102861या\\U0010285cयाकडे', 35),\n",
" ('वृ\\U00102855', 35),\n",
" ('समोर\\U0010285cया', 35),\n",
" ('शांतपणे', 35),\n",
" ('शंका', 35),\n",
" ('चालत', 35),\n",
" ('िफरत', 35),\n",
" ('समजलं', 35),\n",
" ('अंग', 35),\n",
" ('लागेल', 35),\n",
" ('के\\U0010286dहा', 35),\n",
" ('पडलो', 35),\n",
" ('य', 35),\n",
" ('व\\U00102870तर्', 35),\n",
" ('भीमानं', 35),\n",
" ('घट्ट', 35),\n",
" ('मामा', 35),\n",
" ('मा\\U00102864य', 35),\n",
" ('\\U0010285cछ\\U00102712त', 35),\n",
" ('ध\\U00102864य', 35),\n",
" ('जयदर्', 35),\n",
" ('दादानं', 35),\n",
" ('लोक', 34),\n",
" ('घोड\\U00102710ां\\U0010285cया', 34),\n",
" ('\\U00102869हणतात', 34),\n",
" ('आई', 34),\n",
" ('लागलीच', 34),\n",
" ('वसू', 34),\n",
" ('जे\\U0010286dहा', 34),\n",
" ('झालेली', 34),\n",
" ('सव\\U00102712श्रे\\U0010286fठ', 34),\n",
" ('बर्', 34),\n",
" ('असलं', 34),\n",
" ('िप्रय', 34),\n",
" ('दीघ\\U00102712', 34),\n",
" ('िफ\\U001028f6', 34),\n",
" ('जो', 34),\n",
" ('िववाह', 34),\n",
" ('पिहला', 34),\n",
" ('\\U00102855णातच', 34),\n",
" ('दोष', 34),\n",
" ('लाल', 34),\n",
" ('घनदाट', 34),\n",
" ('यानं', 34),\n",
" ('अथान\\U00102712', 34),\n",
" ('प्रसंगी', 34),\n",
" ('शेकडो', 34),\n",
" ('मागून', 34),\n",
" ('गाठ', 34),\n",
" ('धारण', 34),\n",
" ('मा\\U00102862यावर', 34),\n",
" ('कवड\\U00102710ा', 34),\n",
" ('बारा', 33),\n",
" ('पंधरा', 33),\n",
" ('सांगतो', 33),\n",
" ('अशीच', 33),\n",
" ('अंितम', 33),\n",
" ('असावेत', 33),\n",
" ('गंगेचं', 33),\n",
" ('आमचं', 33),\n",
" ('अ\\U00102861यंत', 33),\n",
" ('मोठ\\U00102710ा', 33),\n",
" ('ऐकताच', 33),\n",
" ('भयाण', 33),\n",
" ('समुदर्', 33),\n",
" ('हाच', 33),\n",
" ('िवचारला', 33),\n",
" ('तीच', 33),\n",
" ('आकाश', 33),\n",
" ('कसला', 33),\n",
" ('नसतात', 33),\n",
" ('ाच', 33),\n",
" ('सांगू', 33),\n",
" ('घोडा', 33),\n",
" ('शकणार', 33),\n",
" ('पाठ', 33),\n",
" ('ख\\U00102876या', 33),\n",
" ('अवजड', 33),\n",
" ('तूच', 33),\n",
" ('ा\\U0010285cया', 33),\n",
" ('कं', 33),\n",
" ('गंगेवर', 33),\n",
" ('संप\\U00102898ी', 33),\n",
" ('देश', 33),\n",
" ('पडून', 32),\n",
" ('याच', 32),\n",
" ('येई', 32),\n",
" ('ाठी', 32),\n",
" ('सतत', 32),\n",
" ('िध\\U00102865पाड', 32),\n",
" ('एवढ\\U00102710ा', 32),\n",
" ('टाकीत', 32),\n",
" ('प्रकाश', 32),\n",
" ('ताडकन', 32),\n",
" ('हळूच', 32),\n",
" ('ओंजळीत', 32),\n",
" ('चाललो', 32),\n",
" ('बोलावून', 32),\n",
" ('र\\U00102858त', 32),\n",
" ('धारदार', 32),\n",
" ('कुठंच', 32),\n",
" ('उंचावीत', 32),\n",
" ('े\\U0010286fठ', 32),\n",
" ('राजवाड\\U00102710ावर', 32),\n",
" ('कपाळावर', 32),\n",
" ('वळून', 32),\n",
" ('काम', 32),\n",
" ('हजारो', 32),\n",
" ('सतू', 32),\n",
" ('िफरवीत', 32),\n",
" ('इत\\U00102858या', 32),\n",
" ('\\U001028a2ूत', 32),\n",
" ('राजसभेत', 32),\n",
" ('मनानं', 31),\n",
" ('अिभमान', 31),\n",
" ('मृ\\U00102861यू', 31),\n",
" ('अश्\\U001028f6', 31),\n",
" ('अिवरत', 31),\n",
" ('जरी', 31),\n",
" ('बघ', 31),\n",
" ('आवाजात', 31),\n",
" ('थोड\\U00102710ा', 31),\n",
" ('उदग्', 31),\n",
" ('िठकाणी', 31),\n",
" ('एकत्र', 31),\n",
" ('क\\U0010286bपनेनं', 31),\n",
" ('\\U001028bcा', 31),\n",
" ('ठीक', 31),\n",
" ('आजपयं\\U00102712त', 31),\n",
" ('कवच-कुंडलं', 31),\n",
" ('िमळालं', 31),\n",
" ('असला', 31),\n",
" ('सूचना', 31),\n",
" ('कोणीच', 31),\n",
" ('हवं', 31),\n",
" ('कशासाठी', 31),\n",
" ('राजपुत्र', 31),\n",
" ('प्राण', 31),\n",
" ('येणं', 31),\n",
" ('सहदेव', 31),\n",
" ('भीमाला', 31),\n",
" ('घोषणा', 31),\n",
" ('राहील', 31),\n",
" ('अथ\\U00102712', 31),\n",
" ('\\U00102857दय', 31),\n",
" ('संदेश', 31),\n",
" ('िशशुपाल', 31),\n",
" ('दानवीर', 31),\n",
" ('सखोल', 30),\n",
" ('हीच', 30),\n",
" ('डो\\U0010286cयांसमोर', 30),\n",
" ('काठी', 30),\n",
" ('आमचे', 30),\n",
" ('िबंद', 30),\n",
" ('नसते', 30),\n",
" ...]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_x"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"268084"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(words)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"184"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(' ', 201446),\n",
" ('ा', 164571),\n",
" ('त', 60872),\n",
" ('ी', 48952),\n",
" ('र', 48444),\n",
" ('य', 46521),\n",
" ('ं', 45440),\n",
" ('न', 40441),\n",
" ('ल', 39702),\n",
" ('े', 38806),\n",
" ('क', 37535),\n",
" ('व', 34470),\n",
" ('ह', 33721),\n",
" ('स', 28661),\n",
" ('म', 28350),\n",
" ('प', 25572),\n",
" ('च', 24455),\n",
" ('ि', 23890),\n",
" ('ो', 22533),\n",
" ('\\n', 22402),\n",
" ('ण', 20496),\n",
" ('ु', 17370),\n",
" ('द', 14891),\n",
" ('ग', 14421),\n",
" ('आ', 14303),\n",
" ('ू', 13589),\n",
" ('ज', 13127),\n",
" ('.', 12853),\n",
" ('ड', 11149),\n",
" ('्', 10969),\n",
" ('अ', 10633),\n",
" ('\\U00102861', 10440),\n",
" ('श', 10219),\n",
" ('!', 9384),\n",
" ('\\U00102712', 8868),\n",
" (',', 8637),\n",
" ('\\U0010285c', 8124),\n",
" ('ळ', 8060),\n",
" ('ट', 7814),\n",
" ('ध', 7269),\n",
" ('भ', 6634),\n",
" ('ब', 6411),\n",
" ('ख', 5381),\n",
" ('ठ', 5329),\n",
" ('‘', 5324),\n",
" ('\\U00102870', 4944),\n",
" ('घ', 4665),\n",
" ('थ', 4230),\n",
" ('\\U0010286b', 4132),\n",
" ('झ', 3839),\n",
" ('उ', 3820),\n",
" ('ए', 3779),\n",
" ('\\U00102869', 3496),\n",
" ('’', 3439),\n",
" ('\\U001028f6', 2886),\n",
" ('\\U00102855', 2842),\n",
" ('?', 2645),\n",
" ('\\U0010286d', 2547),\n",
" ('ृ', 2540),\n",
" ('\\U0010286f', 2383),\n",
" ('ढ', 2216),\n",
" ('\\U0010285e', 2153),\n",
" ('\\U00102860', 2060),\n",
" ('फ', 1826),\n",
" ('\\U00102864', 1760),\n",
" ('ष', 1737),\n",
" ('ऊ', 1570),\n",
" ('\\U00102710', 1432),\n",
" ('\\U00102858', 1390),\n",
" ('\\U0010286e', 1375),\n",
" ('\\U00102876', 1354),\n",
" ('\\U0010289d', 1317),\n",
" ('-', 1159),\n",
" ('ै', 1071),\n",
" ('”', 1028),\n",
" ('–', 1006),\n",
" ('\\U001028f5', 949),\n",
" ('\\U0010285d', 938),\n",
" ('\\U0010286c', 909),\n",
" ('इ', 897),\n",
" ('ौ', 843),\n",
" ('\\U001028a2', 843),\n",
" (':', 841),\n",
" ('\\U00102898', 828),\n",
" ('\\U00102707', 725),\n",
" ('ओ', 713),\n",
" ('ई', 697),\n",
" ('छ', 619),\n",
" ('\\U0010285a', 604),\n",
" ('ऽ', 604),\n",
" ('\\U00102856', 565),\n",
" ('\\U00102863', 546),\n",
" ('\\U001028a3', 527),\n",
" ('ऐ', 447),\n",
" ('*', 440),\n",
" ('\\U00102859', 430),\n",
" ('\\U00102920', 413),\n",
" ('\\U00102865', 411),\n",
" ('\\U00102862', 404),\n",
" ('\\U00102867', 301),\n",
" (';', 264),\n",
" ('\\U00102709', 182),\n",
" ('ॠ', 168),\n",
" ('\\U0010285b', 163),\n",
" ('\\U001028bb', 154),\n",
" ('\\U0010289c', 149),\n",
" ('“', 148),\n",
" ('\\U0010286a', 114),\n",
" ('\\U001028bc', 105),\n",
" ('\\U00102857', 91),\n",
" ('\\U00102868', 81),\n",
" ('\\U00102921', 73),\n",
" ('९', 55),\n",
" ('\\U001028bf', 52),\n",
" ('औ', 51),\n",
" ('१', 41),\n",
" ('\\U001028c5', 36),\n",
" ('ः', 33),\n",
" ('०', 27),\n",
" ('२', 20),\n",
" ('\\U00102708', 18),\n",
" ('(', 17),\n",
" (')', 17),\n",
" ('८', 16),\n",
" ('६', 14),\n",
" ('•', 14),\n",
" ('ॐ', 12),\n",
" ('५', 12),\n",
" ('४', 10),\n",
" ('e', 10),\n",
" ('ँ', 10),\n",
" ('ऋ', 9),\n",
" ('३', 8),\n",
" ('७', 8),\n",
" ('\\U001011bc', 8),\n",
" ('\\U00101227', 8),\n",
" ('ॉ', 7),\n",
" ('i', 7),\n",
" ('o', 7),\n",
" ('\\U0010270a', 7),\n",
" ('h', 6),\n",
" ('s', 6),\n",
" ('\\U00102854', 6),\n",
" ('\\U001011fc', 6),\n",
" ('m', 5),\n",
" ('\\U001011ff', 5),\n",
" ('ङ', 5),\n",
" ('\\U001011f0', 4),\n",
" ('n', 4),\n",
" ('u', 4),\n",
" ('a', 3),\n",
" ('l', 3),\n",
" ('t', 3),\n",
" ('b', 3),\n",
" ('c', 3),\n",
" ('w', 3),\n",
" ('\\U0010289f', 3),\n",
" ('—', 3),\n",
" ('f', 2),\n",
" ('p', 2),\n",
" ('g', 2),\n",
" ('B', 2),\n",
" ('7', 2),\n",
" ('6', 2),\n",
" ('\\U001011f6', 2),\n",
" ('©', 1),\n",
" ('E', 1),\n",
" ('G', 1),\n",
" ('W', 1),\n",
" ('I', 1),\n",
" ('S', 1),\n",
" ('N', 1),\n",
" ('8', 1),\n",
" ('1', 1),\n",
" ('\\U001011e7', 1),\n",
" ('\\U00102866', 1),\n",
" ('॓', 1),\n",
" ('\\U0010270f', 1),\n",
" ('\\U001011fe', 1),\n",
" ('R', 1),\n",
" ('r', 1),\n",
" ('k', 1),\n",
" ('\\U001011f3', 1),\n",
" ('/', 1)]"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1 = Counter(text)\n",
"sorted_x1 = sorted(x1.items(), key=operator.itemgetter(1), reverse=True)\n",
"sorted_x1"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'म': 28350,\n",
" 'ृ': 2540,\n",
" '\\U00102861': 10440,\n",
" 'य': 46521,\n",
" 'ु': 17370,\n",
" 'ं': 45440,\n",
" 'ज': 13127,\n",
" '\\n': 22402,\n",
" '\\U001011f0': 4,\n",
" 'ि': 23890,\n",
" 'श': 10219,\n",
" 'व': 34470,\n",
" 'ा': 164571,\n",
" 'ी': 48952,\n",
" ' ': 201446,\n",
" 'स': 28661,\n",
" 'त': 60872,\n",
" '©': 1,\n",
" 'प': 25572,\n",
" 'र': 48444,\n",
" '्': 10969,\n",
" 'क': 37535,\n",
" 'न': 40441,\n",
" 'ल': 39702,\n",
" 'अ': 10633,\n",
" 'े': 38806,\n",
" 'ह': 33721,\n",
" ',': 8637,\n",
" '\\U00102867': 301,\n",
" 'ग': 14421,\n",
" 'ऊ': 1570,\n",
" '१': 41,\n",
" '९': 55,\n",
" '४': 10,\n",
" 'द': 14891,\n",
" 'ठ': 5329,\n",
" 'ड': 11149,\n",
" 'ॉ': 7,\n",
" 'ण': 20496,\n",
" '३': 8,\n",
" '०': 27,\n",
" '.': 12853,\n",
" '‘': 5324,\n",
" '२': 20,\n",
" '-': 1159,\n",
" '७': 8,\n",
" '६': 14,\n",
" 'E': 1,\n",
" 'm': 5,\n",
" 'a': 3,\n",
" 'i': 7,\n",
" 'l': 3,\n",
" ':': 841,\n",
" 'n': 4,\n",
" 'f': 2,\n",
" 'o': 7,\n",
" 'G': 1,\n",
" 'e': 10,\n",
" 'h': 6,\n",
" 't': 3,\n",
" 'p': 2,\n",
" 'u': 4,\n",
" 'b': 3,\n",
" 's': 6,\n",
" 'g': 2,\n",
" 'c': 3,\n",
" 'W': 1,\n",
" 'w': 3,\n",
" 'आ': 14303,\n",
" '\\U00102898': 828,\n",
" 'I': 1,\n",
" 'S': 1,\n",
" 'B': 2,\n",
" 'N': 1,\n",
" '8': 1,\n",
" '1': 1,\n",
" '7': 2,\n",
" '6': 2,\n",
" 'भ': 6634,\n",
" 'ू': 13589,\n",
" '\\U0010285c': 8124,\n",
" '\\U00102855': 2842,\n",
" 'थ': 4230,\n",
" '\\U00102712': 8868,\n",
" 'ध': 7269,\n",
" 'ए': 3779,\n",
" 'ो': 22533,\n",
" 'च': 24455,\n",
" 'ब': 6411,\n",
" 'ौ': 843,\n",
" 'ळ': 8060,\n",
" '\\U0010286d': 2547,\n",
" '\\U00102870': 4944,\n",
" 'छ': 619,\n",
" '\\U00102858': 1390,\n",
" 'ष': 1737,\n",
" '\\U0010286f': 2383,\n",
" '\\U001011bc': 8,\n",
" '!': 9384,\n",
" '\\U00102869': 3496,\n",
" '?': 2645,\n",
" 'ख': 5381,\n",
" 'इ': 897,\n",
" ';': 264,\n",
" '\\U0010285e': 2153,\n",
" 'ढ': 2216,\n",
" 'उ': 3820,\n",
" 'झ': 3839,\n",
" 'घ': 4665,\n",
" 'ट': 7814,\n",
" '\\U0010285d': 938,\n",
" '–': 1006,\n",
" '\\U0010286c': 909,\n",
" '\\U0010286b': 4132,\n",
" '\\U0010285b': 163,\n",
" '\\U00102862': 404,\n",
" '\\U001028f5': 949,\n",
" '\\U00102876': 1354,\n",
" 'ओ': 713,\n",
" 'ऐ': 447,\n",
" '\\U00102860': 2060,\n",
" '\\U0010285a': 604,\n",
" '\\U00102864': 1760,\n",
" '’': 3439,\n",
" '\\U00102863': 546,\n",
" 'फ': 1826,\n",
" '\\U001028f6': 2886,\n",
" '\\U00102710': 1432,\n",
" '\\U0010286e': 1375,\n",
" '\\U001028a2': 843,\n",
" 'ै': 1071,\n",
" '\\U00102859': 430,\n",
" '*': 440,\n",
" '\\U001028a3': 527,\n",
" '\\U0010289d': 1317,\n",
" '\\U00102709': 182,\n",
" 'ई': 697,\n",
" '\\U00102920': 413,\n",
" '\\U00102707': 725,\n",
" '\\U00102856': 565,\n",
" '\\U0010289c': 149,\n",
" 'ऽ': 604,\n",
" '\\U00102865': 411,\n",
" '\\U00102921': 73,\n",
" '”': 1028,\n",
" '“': 148,\n",
" 'ॠ': 168,\n",
" '\\U001028bb': 154,\n",
" 'औ': 51,\n",
" '\\U001028bc': 105,\n",
" '\\U00102868': 81,\n",
" '\\U00102854': 6,\n",
" 'ः': 33,\n",
" '\\U001028c5': 36,\n",
" 'ॐ': 12,\n",
" '\\U0010286a': 114,\n",
" '\\U001028bf': 52,\n",
" '\\U001011fc': 6,\n",
" '\\U00101227': 8,\n",
" '\\U0010270a': 7,\n",
" '\\U001011ff': 5,\n",
" '\\U00102857': 91,\n",
" '\\U00102708': 18,\n",
" '\\U001011e7': 1,\n",
" '\\U00102866': 1,\n",
" '\\U0010289f': 3,\n",
" 'ऋ': 9,\n",
" 'ङ': 5,\n",
" '॓': 1,\n",
" 'ँ': 10,\n",
" '—': 3,\n",
" '\\U0010270f': 1,\n",
" '\\U001011fe': 1,\n",
" '\\U001011f6': 2,\n",
" 'R': 1,\n",
" 'r': 1,\n",
" 'k': 1,\n",
" '\\U001011f3': 1,\n",
" '•': 14,\n",
" '(': 17,\n",
" '८': 16,\n",
" ')': 17,\n",
" '५': 12,\n",
" '/': 1})"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(text)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Batch size (assigned twice below; the later value, 16, is the one in effect)\n",
"BATCH_SIZE = 64\n",
"\n",
"BATCH_SIZE = 16\n",
"\n",
"\n",
"# Buffer size to shuffle the dataset\n",
"# (TF data is designed to work with possibly infinite sequences,\n",
"# so it doesn't attempt to shuffle the entire sequence in memory. Instead,\n",
"# it maintains a buffer in which it shuffles elements).\n",
"BUFFER_SIZE = 10000\n",
"\n",
"dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)\n",
"# Length of the vocabulary in chars\n",
"vocab_size = len(vocab)\n",
"\n",
"# The embedding dimension\n",
"embedding_dim = 256\n",
"\n",
"# Number of RNN units\n",
"rnn_units = 1024\n",
"\n",
"# Alternative, smaller embedding dimension (currently disabled)\n",
"#embedding_dim = 16\n",
"\n",
"# Alternative, smaller number of RNN units (currently disabled)\n",
"#rnn_units = 32\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"sequential_5\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_5 (Embedding) (16, None, 256) 47104 \n",
"_________________________________________________________________\n",
"gru_5 (GRU) (16, None, 1024) 3938304 \n",
"_________________________________________________________________\n",
"dense_5 (Dense) (16, None, 184) 188600 \n",
"=================================================================\n",
"Total params: 4,174,008\n",
"Trainable params: 4,174,008\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"model = build_model(\n",
" vocab_size = len(vocab),\n",
" embedding_dim=embedding_dim,\n",
" rnn_units=rnn_units,\n",
" batch_size=BATCH_SIZE)\n",
"model.summary()\n",
"model.compile(optimizer='adam', loss=loss)\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16, 30, 184) # (batch_size, sequence_length, vocab_size)\n"
]
}
],
"source": [
"for input_example_batch, target_example_batch in dataset.take(1):\n",
" example_batch_predictions = model(input_example_batch)\n",
" print(example_batch_predictions.shape, \"# (batch_size, sequence_length, vocab_size)\")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)\n",
"sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()\n",
"example_batch_loss = loss(target_example_batch, example_batch_predictions)\n",
"# Directory where the checkpoints will be saved\n",
"checkpoint_dir = './training_checkpoints_mar'\n",
"# Name of the checkpoint files\n",
"checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_{epoch}\")\n",
"\n",
"checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(\n",
" filepath=checkpoint_prefix,\n",
" save_weights_only=True)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train for 2768 steps\n",
"Epoch 1/10\n",
"2768/2768 [==============================] - 1205s 435ms/step - loss: 2.0953\n",
"Epoch 2/10\n",
"2768/2768 [==============================] - 1082s 391ms/step - loss: 1.7233\n",
"Epoch 3/10\n",
"2768/2768 [==============================] - 1008s 364ms/step - loss: 1.6323\n",
"Epoch 4/10\n",
"2768/2768 [==============================] - 9572s 3s/step - loss: 1.5810\n",
"Epoch 5/10\n",
"2768/2768 [==============================] - 929s 336ms/step - loss: 1.5473\n",
"Epoch 6/10\n",
"2768/2768 [==============================] - 920s 332ms/step - loss: 1.5256\n",
"Epoch 7/10\n",
"2768/2768 [==============================] - 921s 333ms/step - loss: 1.5119\n",
"Epoch 8/10\n",
"2768/2768 [==============================] - 924s 334ms/step - loss: 1.5063\n",
"Epoch 9/10\n",
"2768/2768 [==============================] - 928s 335ms/step - loss: 1.5060\n",
"Epoch 10/10\n",
"2768/2768 [==============================] - 895s 323ms/step - loss: 1.5094\n"
]
}
],
"source": [
"EPOCHS=10\n",
"#EPOCHS=5\n",
"history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'./training_checkpoints_mar/ckpt_10'"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf.train.latest_checkpoint(checkpoint_dir)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"sequential_6\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_6 (Embedding) (1, None, 256) 47104 \n",
"_________________________________________________________________\n",
"gru_6 (GRU) (1, None, 1024) 3938304 \n",
"_________________________________________________________________\n",
"dense_6 (Dense) (1, None, 184) 188600 \n",
"=================================================================\n",
"Total params: 4,174,008\n",
"Trainable params: 4,174,008\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)\n",
"\n",
"model2.load_weights(tf.train.latest_checkpoint(checkpoint_dir))\n",
"\n",
"model2.build(tf.TensorShape([1, None]))\n",
"model2.summary()\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"जाणार! ते\n",
"छायेचाही अव􂡰थेत दानवमा􂜒ल, अिवशे पटलून िमळाले न􂡭हते. शु􂢝ीवर\n",
"बसले होते. 􂡡यांतील पी 􂡡या􂡜या दीन एका िन:श􂡧द भुवया\n",
"उडिवला! तो सुखदा आत ज􂡤मला!\n",
"‘‘या सताना मला जीवनातील गो􂡫प 􂡩हणजे ब्र􂢻से! गु􂣶 दर् ोणाचं ी अनुथ􂣶 ू कुंतीदेवी – दानशू􂡤य तूरासह अं􂡡कार मराठी! आडवानं􂜒 ा चंदार\n",
"िदसले􂡫या... धातर् ी एकिच􂡯णाचं 􂜒\n",
"कणाप􂜒 ाठी आणलेलं दु:ख िलघून मला 􂡩हून\n",
"गेला. 􂡡याचे तो मराठी तयारा होता! मुख\n",
"बरोबर\n",
"घेणार होता! तो अमृताश झालेला कुणाचा सैिनक मना􂡜या शुभ्र फुगवून तो वाड􂜐ावर प्रव शोधून के ले. काना􂡜या फु रस\n",
"िनराझे उडीिकत आपला\n",
"िदि􂡚वजयी मान उंच? छे, कुठंही पांडवानं ा पडला आहे अंगराज कण􂜒ऽऽ आवाजा􂡜या भा􂡭यासाठी – का􂣶न द􂡚ध वष􂜒वू राजदंड आपोआप उंच उडाले. तलवती गंगे परत आलाच तसं जवून पुन:पु􂡤हा 􂡡याचीच\n",
"नाही कृ तवत:च, सािमश􂡰तत्राचे पट्टे आलं!\n",
"‘‘आ􂡖ाधार􂜒 ानं ी मला अनुलवलं जातं! जीव􂡮यक होते. सवांत􂜒 वायला आला आहे.! 􂡡याचा पांडव\n",
"खांदेवढी िफरवत\n",
"गदा! रथा􂢘ी सलो􂡫होयु􂢝 तेच कळे.\n",
"‘‘तु􂡞याशी थांबिवला होता मी 􂡡या􂡜याशी लढलो. 􂡡याची-\n",
"पवाऐका एकमेकांवर घामाचे बथक रथिव􂣶न फळं चतु􂣶्रदेन घेऊन परत मोकळी वळणं घायाळ\n",
"होऊ लागला. भीम-गदे􂡜या स􂡭हांनाहीस􂡤मळतो के\n"
]
}
],
"source": [
"#print(generate_text(model, start_string=u\"ROMEO: \"))\n",
"\n",
"print(generate_text(model2, start_string=u\"जाणार\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"काही पडदल तोडणारी एका शु􂢝ाचा डतू धनु􂡯य\n",
"फेकून िदली! संतोष 􂢢ाव मलालाच असं 􂡡यां􂡜या\n",
"िश􂡫पातुमांनी तो गोल अ􂡛यां􂡜च सूय􂜒पुत्राची िचव􂣵िवतभर कु􂣵􂡕ेत्र आिण दया शत करीत बपु􂡤हा पण􂜒कु टीसमोर\n",
"के वळ आप􂡫याकडे हळूहळू पात्रं घेऊन कसंतरीच\n",
"थरथ􂣶 लागला! ‘‘मातािम􂜒पून तळपू फुगला! हरीत आणून राहणार न􂡭हता! 􂡡याला मग तो\n",
"नेहमी आपलं जीवनाला जगून ियशेवीट􂜐ा सवां􂜒वर एक\n",
"स􂡥तमानही आहे, हे माझे हात होते!\n",
"िदवस􂜉िदवस असेल तं ते 􂢢ूतानं?\n",
"पिहला प्रय􂡡न 􂡡यां􂡜या अंगाल􂜒 की तु􂡞या दाट\n",
"भावनांप􂢘ी भीमपुतर् ां􂡜या संकेत ठाकलेला िदि􂡚वजयमानं मावळ􂡡या 􂡰त􂡧धा􂡜या\n",
"समवाराचा.’’\n",
"‘‘दुया􂜇धनासाठी काय-􂡰फुंदताच होणार नाही. तो कुणाचाच बोल सोनेरी बुंज-\n",
"पायदंडया बंधू􂡜या मुखावर चढून,\n",
"अपाप, यमुना सहज तडका मा􂡞या डो􂡬यां􂡜या\n",
"पा􂡬याकडे धावू लागलो.\n",
"पांडवां􂡜या 􂢣ारात कास मी\n",
"􂡕ितृहयेत काय ते समज करायचं न􂡭हतं का?\n",
"􂡝या वेळी 􂡡याचे चमकारे\n",
"वेधनापुढं\n",
"सरळ िन􂡬याभ्रमाला घेऊन मी तजागला!\n",
"वाकावत हे अ􂡕रश: जाणार न􂡭ह􂡡या.\n",
"‘अजु􂜒नांनी पाठवून ठेवून! मला एक-उलदा􂡜या मरासमुद􂜒 यातना यांचा डताचे सव􂜒 सामा􂡤य आहे! तु􂡩हाला आ􂡖ाधारक जण आता\n",
"केवळ जरदैय खड्गं दरू देतो मीही\n",
"􂡡या􂡜या आवाजा􂡜या मना􂡜या हळे फुटला!! हे\n"
]
}
],
"source": [
"print(generate_text(model2, start_string=u\"काही\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment