Skip to content

Instantly share code, notes, and snippets.

@blabadi
Created October 5, 2018 12:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blabadi/d5bfa27f98068ad1d6ae27f9325096ba to your computer and use it in GitHub Desktop.
Save blabadi/d5bfa27f98068ad1d6ae27f9325096ba to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Dev\\Anaconda3\\envs\\gensim_env\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
" warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
]
}
],
"source": [
"import logging\n",
"from six import iteritems\n",
"from smart_open import smart_open\n",
"import gensim\n",
"from gensim import corpora, models, similarities\n",
"from gensim.models import TfidfModel\n",
"import gensim.downloader as api\n",
"from multiprocessing import cpu_count\n",
"from gensim.similarities import MatrixSimilarity, WmdSimilarity, SoftCosineSimilarity\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from nltk import download"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"documents = []\n",
"dictionary = None\n",
"tfidf = None\n",
"corpus_tfidf = None\n",
"corpus = None\n",
"\n",
"def load_documents(path='booksummaries_cleaned.txt'):\n",
"    \"\"\"Read the summaries file and return its lines, lower-cased.\n",
"\n",
"    The file is opened in binary mode, so every returned item is a ``bytes``\n",
"    object (one per input line).\n",
"\n",
"    :param path: file to read; defaults to the book summaries file.\n",
"    :return: list with one lower-cased ``bytes`` line per line of the file.\n",
"    \"\"\"\n",
"    # Use a context manager so the file handle is closed as soon as all the\n",
"    # lines have been read instead of being left open.\n",
"    with smart_open(path, 'rb') as lines:\n",
"        return [line.lower() for line in lines]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_stopwords():\n",
"    \"\"\"Return NLTK's English stop-word list (is, he, she, the, in, ...).\n",
"\n",
"    ``download('stopwords')`` is called first so that the word list is\n",
"    available before it is read.\n",
"    \"\"\"\n",
"    download('stopwords')\n",
"    return stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def build_dict():\n",
"    \"\"\"Build, save and return the vocabulary of the global ``documents``.\n",
"\n",
"    Every document is tokenised with ``gensim.utils.simple_preprocess``. English\n",
"    stop words are then removed from the resulting dictionary, which is saved\n",
"    to ``./dictionary``. Words that appear only once are kept on purpose,\n",
"    because the data are book titles.\n",
"    \"\"\"\n",
"    stop_words = get_stopwords()\n",
"    tokenised = (gensim.utils.simple_preprocess(doc) for doc in documents)\n",
"    vocab = corpora.Dictionary(tokenised)\n",
"\n",
"    # Ids of the stop words that actually occur in the vocabulary.\n",
"    known = vocab.token2id\n",
"    stop_ids = [known[word] for word in stop_words if word in known]\n",
"\n",
"    vocab.filter_tokens(stop_ids)\n",
"    vocab.save('./dictionary')\n",
"    return vocab\n",
"def load_dict():\n",
"    \"\"\"Load and return the dictionary stored at ``./dictionary``.\"\"\"\n",
"    saved_path = './dictionary'\n",
"    return corpora.Dictionary.load(saved_path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class MySentences(object):\n",
"    \"\"\"Re-iterable stream of tokenised lines read from a text file.\n",
"\n",
"    Each pass over the object re-opens ``filename`` (decoded as ISO-8859-1)\n",
"    and yields one token list per line, produced by\n",
"    ``gensim.utils.simple_preprocess``.\n",
"\n",
"    NOTE(review): the documents printed later in this notebook look UTF-8\n",
"    encoded; confirm that ISO-8859-1 is the intended decoding.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, filename):\n",
"        # Only the path is stored; the file is opened lazily in __iter__.\n",
"        self.filename = filename\n",
"\n",
"    def __iter__(self):\n",
"        # The with-block closes the file when iteration ends (or the generator\n",
"        # is discarded); a bare open() would leak one handle per pass.\n",
"        with open(self.filename, encoding=\"iso-8859-1\") as handle:\n",
"            for line in handle:\n",
"                yield gensim.utils.simple_preprocess(line)\n",
" \n",
"def read_sentences(fname):\n",
"    \"\"\"Yield the token list of every line in ``fname``.\n",
"\n",
"    The file is opened through ``smart_open`` with ISO-8859-1 decoding and each\n",
"    line is tokenised with ``gensim.utils.simple_preprocess``.\n",
"    \"\"\"\n",
"    with smart_open(fname, encoding=\"iso-8859-1\") as source:\n",
"        yield from map(gensim.utils.simple_preprocess, source)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def build_corpus():\n",
"    \"\"\"Return the bag-of-words vector of every entry in the global ``documents``.\n",
"\n",
"    Each document is tokenised with ``gensim.utils.simple_preprocess`` and then\n",
"    converted by the global ``dictionary`` via ``doc2bow``.\n",
"    \"\"\"\n",
"    bows = []\n",
"    for document in documents:\n",
"        tokens = gensim.utils.simple_preprocess(document)\n",
"        bows.append(dictionary.doc2bow(tokens))\n",
"    return bows"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def build_tfidf(corpus, dictionary):\n",
"    \"\"\"Create a TF-IDF model, save it to ``./tfidf`` and return it.\n",
"\n",
"    :param corpus: corpus handed to ``TfidfModel`` as its first argument.\n",
"    :param dictionary: dictionary handed to ``TfidfModel`` as ``dictionary=``.\n",
"    :return: the newly created ``TfidfModel``.\n",
"    \"\"\"\n",
"    model = TfidfModel(corpus, dictionary=dictionary)\n",
"    model.save('./tfidf')\n",
"    return model\n",
"def load_tfidf():\n",
"    \"\"\"Load and return the TF-IDF model stored at ``./tfidf``.\"\"\"\n",
"    model_path = './tfidf'\n",
"    return TfidfModel.load(model_path)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"#tfidf = build_tfidf(corpus, dictionary)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>Load Existing Metadata</h1>"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16559"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"documents = load_documents()\n",
"len(documents)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"sentences = MySentences('booksummaries_cleaned.txt')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"dictionary = load_dict() "
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# build_corpus()\n",
"# build_tfidf()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Build the bag-of-words corpus here: the TF-IDF projection further down\n",
"# evaluates tfidf[corpus], and `corpus` is still None (its initial value)\n",
"# unless it is assigned in this cell.\n",
"corpus = build_corpus()\n",
"tfidf = load_tfidf()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Corpus tfidf</h2>"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'NoneType' object is not iterable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-15-e0c708a03a75>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcorpus_tfidf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtfidf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mC:\\Dev\\Anaconda3\\envs\\gensim_env\\lib\\site-packages\\gensim\\models\\tfidfmodel.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, bow, eps)\u001b[0m\n\u001b[0;32m 370\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 371\u001b[0m \u001b[0mtermid_array\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtf_array\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 372\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mtermid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtf\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mbow\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 373\u001b[0m \u001b[0mtermid_array\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtermid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 374\u001b[0m \u001b[0mtf_array\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
]
}
],
"source": [
"corpus_tfidf = tfidf[corpus]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1> Cosine Similarity </h1>"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def cossim():\n",
"    \"\"\"Build, save and return the cosine-similarity index.\n",
"\n",
"    Indexes the global ``corpus_tfidf`` with one feature per entry of the global\n",
"    ``dictionary`` and writes the index to ``cossim_index``.\n",
"    \"\"\"\n",
"    # NOTE(review): MatrixSimilarity keeps the whole index as a dense matrix,\n",
"    # which presumably explains the very large index file noted at the end of\n",
"    # this notebook - confirm before switching to a sparse index.\n",
"    n_features = len(dictionary)\n",
"    similarity_index = MatrixSimilarity(corpus_tfidf, num_features=n_features)\n",
"    similarity_index.save('cossim_index')\n",
"    return similarity_index"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"cossim_index = MatrixSimilarity.load('cossim_index') #cossim()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1> Model evaluation </h1>"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"query = \"Animal Farm\"\n",
"# Tokenise the query with the same function used to build the dictionary and\n",
"# the corpus (simple_preprocess). A plain lower().split() keeps punctuation\n",
"# attached to words, so such tokens would not match any dictionary entry.\n",
"query_vec = tfidf[dictionary.doc2bow(gensim.utils.simple_preprocess(query))]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# cosine similarity of the query vector against the indexed documents\n",
"sim = cossim_index[query_vec]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"maxSimilarIndx : 9989 similarties_sc[maxSimilarIndx]:0.4097048\n"
]
}
],
"source": [
"# Position and score of the single best-matching document.\n",
"maxSimilarIndx = np.argmax(sim)\n",
"print(\"maxSimilarIndx : \", maxSimilarIndx, \" similarties_sc[maxSimilarIndx]:\", sim[maxSimilarIndx], sep=\"\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# (document index, score) pairs ordered from the highest score to the lowest.\n",
"sorted_sim = sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #1 - doc# 9989 - (40.97048044204712) - Doc:\n",
" b'11541759\\t3t\\tsnowball\\'s chance\\tjohn reed\\t\\t{ \"parody\"}\\t the story begins with the death of napoleon, the original antagonist of animal farm. the animals of the farm, fearing what will become of them, learn that snowball is alive and well, and snowball returns to the farm to encourage capitalism. a second windmill is soon built alongside the first, and the two are thenceforth known as the twin mills (allegorical of the twin towers of the world trade center), and the animals all learn to walk on their hind legs, something hitherto forbidden by old major shortly before the expulsion of mr. jones from the farm. also, in place of the original seven commandments, snowball adopts a single slogan for the animals to live by: all animals are born equal - what they become is their own affair. as time passes, the animals, under the leadership of snowball, realise the properties of monetary gain, and begin to file lawsuits against neighbouring farms, allowing animal farm to gain land and wealth. the revitalised farm also attracts animals from elsewhere in england, who are segregated from the farm animals (a possible allegory for american racial segregation). in an effort to increase their wealth, snowball proposes to transform the farm into a large fairground named animal fair (similar to the development of coney island in the 19th century), and in order to provide power for the fair, the animals drive off a group of beavers and other woodland creatures living by a nearby river, and the beaver-dams are destroyed in order for the farm to exploit the water-supply of the river. despite the success of animal fair, the excessive littering and pollution leaves the farm in a deplorable state, and matters worsen when the twin mills are destroyed by the woodland creatures in retaliation for their expulsion from the riverbank (in a manner similar to the 9/11 attack on the world trade center). 
snowball counteracts this by declaring war on the now-fanatical woodland creatures, even though animal farm is in no position to win the war.\\n'\n",
" ==================================================== \n",
" #2 - doc# 0 - (31.09918236732483) - Doc:\n",
" b'620\\t\\tanimal farm\\tgeorge orwell\\t1945-08-17\\t{ \"roman \\\\u00e0 clef\", \"satire\", \"children\\'s literature\",\"speculative fiction\", \"fiction\"}\\t old major, the old boar on the manor farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, \\'beasts of england\\'. when major dies, two young pigs, snowball and napoleon, assume command and turn his dream into a philosophy. the animals revolt and drive the drunken and irresponsible mr jones from the farm, renaming it \"animal farm\". they adopt seven commandments of animal-ism, the most important of which is, \"all animals are equal\". snowball attempts to teach the animals reading and writing; food is plentiful, and the farm runs smoothly. the pigs elevate themselves to positions of leadership and set aside special food items, ostensibly for their personal health. napoleon takes the pups from the farm dogs and trains them privately. napoleon and snowball struggle for leadership. when snowball announces his plans to build a windmill, napoleon has his dogs chase snowball away and declares himself leader. napoleon enacts changes to the governance structure of the farm, replacing meetings with a committee of pigs, who will run the farm. using a young pig named squealer as a \"mouthpiece\", napoleon claims credit for the windmill idea. the animals work harder with the promise of easier lives with the windmill. after a violent storm, the animals find the windmill annihilated. napoleon and squealer convince the animals that snowball destroyed it, although the scorn of the neighbouring farmers suggests that its walls were too thin. once snowball becomes a scapegoat, napoleon begins purging the farm with his dogs, killing animals he accuses of consorting with his old rival. 
he and the pigs abuse their power, imposing more control while reserving privileges for themselves and rewriting history, villainising snowball and glorifying napoleon. squealer justifies every statement napoleon makes, even the pigs\\' alteration of the seven commandments of animalism to benefit themselves. \\'beasts of england\\' is replaced by an anthem glorifying napoleon, who appears to be adopting the lifestyle of a man. the animals remain convinced that they are better off than they were when under mr jones. squealer abuses the animals\\' poor memories and invents numbers to show their improvement. mr frederick, one of the neighbouring farmers, attacks the farm, using blasting powder to blow up the restored windmill. though the animals win the battle, they do so at great cost, as many, including boxer the workhorse, are wounded. despite his injuries, boxer continues working harder and harder, until he collapses while working on the windmill. napoleon sends for a van to take boxer to the veterinary surgeon\\'s, explaining that better care can be given there. benjamin, the cynical donkey, who \"could read as well as any pig\", notices that the van belongs to a knacker, and attempts to mount a rescue; but the animals\\' attempts are futile. squealer reports that the van was purchased by the hospital and the writing from the previous owner had not been repainted. he recounts a tale of boxer\\'s death in the hands of the best medical care. years pass, and the pigs learn to walk upright, carry whips and wear clothes. the seven commandments are reduced to a single phrase: \"all animals are equal, but some animals are more equal than others\". napoleon holds a dinner party for the pigs and the humans of the area, who congratulate napoleon on having the hardest-working but least fed animals in the country. napoleon announces an alliance with the humans, against the labouring classes of both \"worlds\". 
he abolishes practices and traditions related to the revolution, and changes the name of the farm to \"the manor farm\". the animals, overhearing the conversation, notice that the faces of the pigs have begun changing. during a poker match, an argument breaks out between napoleon and mr pilkington, and the animals realise that the faces of the pigs look like the faces of humans, and no one can tell the difference between them. the pigs snowball, napoleon, and squealer adapt old major\\'s ideas into an actual philosophy, which they formally name animalism. soon after, napoleon and squealer indulge in the vices of humans (drinking alcohol, sleeping in beds, trading). squealer is employed to alter the seven commandments to account for this humanisation, an allusion to the soviet government\\'s revising of history in order to exercise control of the people\\'s beliefs about themselves and their society. the original commandments are: # whatever goes upon two legs is an enemy. # whatever goes upon four legs, or has wings, is a friend. # no animal shall wear clothes. # no animal shall sleep in a bed. # no animal shall drink alcohol. # no animal shall kill any other animal. # all animals are equal. later, napoleon and his pigs secretly revise some commandments to clear them of accusations of law-breaking (such as \"no animal shall drink alcohol\" having \"to excess\" appended to it and \"no animal shall sleep in a bed\" with \"with sheets\" added to it). the changed commandments are as follows, with the changes bolded: * 4 no animal shall sleep in a bed with sheets. * 5 no animal shall drink alcohol to excess. * 6 no animal shall kill any other animal without cause. eventually these are replaced with the maxims, \"all animals are equal, but some animals are more equal than others\", and \"four legs good, two legs better!\" as the pigs become more human. 
this is an ironic twist to the original purpose of the seven commandments, which were supposed to keep order within animal farm by uniting the animals together against the humans, and prevent animals from following the humans\\' evil habits. through the revision of the commandments, orwell demonstrates how simply political dogma can be turned into malleable propaganda.\\n'\n",
" ==================================================== \n",
" #3 - doc# 16262 - (27.7534157037735) - Doc:\n",
" b'32733479\\tx5\\trumours of rain\\t\\t\\t{ \"novel\"}\\t martin, the narrator, a rich south african businessman, recalls the events of a weekend which settled the future of his family farm: he wants to sell it although he has promised his father on his death-bed that he will never do so, and although his brother wants to take it over. so he visits the farm for a weekend to tell his mother she will be \"evicted\". for the trip, he calls off a meeting with his lover bea, who leaves him for that because she wanted to tell him that she is pregnant. it is a long drive to the farm and he does not often go there. he takes his son louis along to get closer to him. since louis has come back from the angolan war of independence, he is traumatized and silent. the imagined good talk between them makes matters worse, though - the father begins by asking his son casually about his \"trip to angola\", and when louis finally opens up and talks about the atrocities of the war, they end up arguing about politics. the farm is extremely drought-stricken, and martin\\'s mother has invited a water diviner to look for an underground stream. martin thinks this is ridiculous. when he then visits the family graveyard for one last time, loses his glasses, so that from then on the egocentric man is in every sense of the word \"blind\\xe2\\x80\\x9d to the events happening around him. at night, a black worker murders his wife and is taken to prison, leaving a baby and older children behind. this is regarded by the whites as \"typical\\xe2\\x80\\x9d tribal behavior and helps martin to underline his opinion that his mother should not run the farm on her own because it is too dangerous - although the real reason is that he will get a good price for the farm. the visit of a few neighbours, who are also farmers, shows that most people in the area are selling their land, which is regarded by those who stay on their family farms as treason. 
when martin \\xe2\\x80\\x93 still almost blind without his glasses \\xe2\\x80\\x93 gets lost in the jungle on a short walk, almost not making it back to the farm where he was born and raised, it is obvious that he does not belong here. father and son drive home. the farm will be sold and martin\\'s mother will live with their family although she and martin\\'s wife do not get along. louis gets an ultimatum to find a job, after which he disappears and never comes back.\\n'\n",
" ==================================================== \n",
" #4 - doc# 13697 - (26.066428422927856) - Doc:\n",
" b\"21760308\\t8r\\tanimal spirits: how human psychology drives the economy, and why it matters for global capitalism\\trobert j. shiller\\t2009\\t\\t the preface recalls keynes' use of the phrase animal spirits, which he used to describe the psychological forces that partly explain why the economy doesn't behave in the manner predicted by classical economics\\xe2\\x80\\x94a system of thought that expects economic actors to behave as unemotional rational beings. the authors assert that the keynesian revolution was emasculated as keynesians progressively relegated the importance of animal spirits to accommodate the views of economists who preferred the simpler classical or neo-classical system. the preface goes on to describe how keynes' ideas suggest the economy will function best with a moderately high level of government intervention, which they compare to a happy home where children thrive with parents that are neither too authoritarian (as in a marxist economy) nor too permissive (as in a neoliberal economy). the authors state that recent research now supports the concept of animal spirits much more robustly than keynes was able to, and they express the hope that fellow economists can be convinced of this, thus reducing the internecine disputes that prevent their discipline from providing the clear support that politicians need for the aggressive action required to fix the 2008\\xe2\\x80\\x932009 economic crises. the five key animal spirits are treated here, each assigned their own chapter. chapter 1 the authors discuss confidence, which they say is the most important animal spirit to know about if one wishes to understand the economy. chapter 2 is about the desire for fairness, an emotional drive that can cause people to make decisions that aren\\xe2\\x80\\x99t in their economic best interests. 
chapter 3 discusses corruption and bad faith, and how growing awareness of these practices can contribute to a recession, in addition to the direct harm the practices cause themselves. chapter 4 presents evidence that, in contrast to monetarist theory, many people are at least partially under the money illusion, the tendency for people to ignore the effects of inflation. workers for example will forgo a pay rise even when prices are rising, if they know that their firm is facing challenging conditions\\xe2\\x80\\x94but they are much less willing to accept a pay cut even when prices are falling. chapter 5 is about the importance of stories in determining behaviour. such as the repeatedly told story that house prices will always rise, which caused many additional people to invest in housing following the dot com bust of 2000. here the authors discuss eight important questions about the economy, which they assert can only be satisfactorily answered by a theory that takes animal spirits into account. each question has its own chapter. chapter 6 is about why recessions happen. the authors assert that the business cycle can be explained by rising confidence in the upswing eventually leading investors to make rash decisions and ultimately encouraging corruption, until eventually panic appears and confidence evaporates, triggering a recession. there is a discussion about feedback loops between animal spirits and real returns available, which help explain the intensity of both the up and down swing of the cycle. chapter 7 discusses why animal spirits make central banks a necessity, and there is a post script about how they can intervene to help with the current crises. chapter 8 tackles the reasons for unemployment, which the authors say is partly due to animal spirits such as concerns for fairness and the money illusion. chapter 9 is about why there is a trade off between unemployment and inflation. 
the authors show how effects of animal spirits refutes the monetarist theory that there is a natural rate of employment which it is not desirable to exceed. chapter 10 is about why people don't consider the future rationally in their decisions about savings. chapter 11 presents an explanation for why asset prices and investment flows are so volatile. chapter 12 discusses why real estate markets go through cycles, with periods of often rapid price increase interspaced by falls. chapter 13 suggests that animal spirits can be used to explain the persistence of poverty among ethnic minorities, describing how working class minorities have different stories about how the world works and their place in it, compared to working class white people. the authors argue that the effects of animal spirits make a strong case for affirmative action. chapter 14 is a conclusion where the authors state that the cumulative evidence they have presented in the preceding chapters overwhelming shows that the neo classical view of the economy, which allows little or no role for animal spirits, is unreliable. they state that an effective response to the current economic crises must take into account the effects of animal spirits.\\n\"\n",
" ==================================================== \n",
" #5 - doc# 1408 - (25.386333465576172) - Doc:\n",
" b'632999\\tt\\tanimal world\\tantonio di benedetto\\t\\t{ \"short story\"}\\t the animal theme is probably the oldest in literature. cavemen told stories of hunts, of talking animals and probably of animal-like gods. the first book of the bible places the serpent in paradise, speaking wisely to the first man and woman. classical authors like lucian and apuleius wrote satires in which pretentious people turned into lowly animals, like a jackass. william shakespeare created his own memorable jackass, and miguel de cervantes had his witty talking dogs. in our century, franz kafka presented the learned address of a highly refined ape to a scientific academy. probably no writer worth his salt has not at one time or other picked up the theme. a little-known, but fascinating contribution to this tradition is [mundo animal, or animal world, by the argentine author, antonio di benedetto. it is strangely different from its celebrated predecessors. kafka, for example, impresses the reader with a striking artistic conception, ingenious logic and magnificent language in each story. di benedetto\\'s stories do not impress in this way. written in conversational and even intentionally awkward language, they present a confused and troubled narrator, who, tormented by mysterious gnawings of guilt, becomes involved in some obscure way with an animal or whole group of animals. they invade his soul, drive him to rage or deliver him from his obsession. often the story hinges on a pun, a distorted folktale, or an illogical association. while not spectacular in itself, each story adds to the preceding to create a growing sense of doom. thus story by story the reader becomes ensnared in a horrifying, hallucinatory realm of associations; the world he thought was human is transformed into animal world.\\n'\n",
" ==================================================== \n"
]
}
],
"source": [
"# Print the five best matches, best first: rank, document index, score\n",
"# multiplied by 100, and the stored document line.\n",
"counter = 1\n",
"for idx, val in sorted_sim[:5]:\n",
"    print(\" #\", counter, \" - doc# \", idx, \" - (\", val * 100, \") - Doc:\", sep=\"\")\n",
"    print(\" \", documents[idx], sep=\"\")\n",
"    print(\" ==================================================== \")\n",
"    counter += 1\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1> To continue </h1>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" - The saved cosine-similarity index file is 7 GB; reduce its size.\n",
" - Give more importance to titles / authors when matching."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment