Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Created October 23, 2014 20:51
Show Gist options
  • Save andreasvc/66fe7547b05569c9a273 to your computer and use it in GitHub Desktop.
Save andreasvc/66fe7547b05569c9a273 to your computer and use it in GitHub Desktop.
Topic Modeling with gensim. Load in ipython notebook or view online: http://nbviewer.ipython.org/gist/andreasvc/66fe7547b05569c9a273
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:d22eba07a2535f6df297c29dc9fb8299158b970c91b44ceed876cb68b8cde502"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Topic Modeling experiments\n",
"--------------------------\n",
"\n",
"This notebook reads a set of plain text files and\n",
"builds topic models with LSI (a.k.a. LSA) and LDA.\n",
"\n",
"The texts are specified in the variable ``TEXTS``\n",
"below.\n",
"\n",
"The models are written to the current\n",
"directory in several files name ``model.*``."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\"\"\"Construct LSI and LDA topic models of texts.\"\"\"\n",
"from __future__ import print_function\n",
"import io, os, re, glob, logging, operator\n",
"from itertools import islice\n",
"from collections import Counter\n",
"import pandas, gensim\n",
"\n",
"# Specify one or more patterns of filenames to use as the corpus.\n",
"# Each file will be treated as a single document.\n",
"TEXTS = [\n",
" 'corpus/Adventure/FOLD1/*.txt',\n",
" 'corpus/Fiction/FOLD1/*.txt',\n",
" ]\n",
"ENCODING = 'utf8' # when this gives issues, try 'latin1'\n",
"\n",
"# A set of words that will be ignored.\n",
"# Either specify a file with one word per line:\n",
"# STOPWORDS = frozenset(\n",
"# open('data/dutch-stop-words.txt').read().splitlines())\n",
"# Or give a space-separated list here:\n",
"STOPWORDS = frozenset(\n",
" 'andere deze over zei zal ge niet als daar moet had wel te toch bij '\n",
" 'niets dan nog maar dat doch geen worden die een dit der en altijd '\n",
" 'haar ze mijn kunnen zonder naar er doen omdat we iemand wezen men '\n",
" 'met ja toen om tegen of kon voor iets hier geweest veel op wie zelf '\n",
" 'wil wij zo zijn ons het heeft van eens tot heb hem wat was door hun '\n",
" 'ook me dus ben zij uw aan hij je werd meer alles reeds af is al ik '\n",
" 'uit want in hoe na zou waren nu de kan mij zich hebben u'.split())\n",
"\n",
"# Tokens must contain at least three letters; no numbers or punctuation\n",
"# (also ignores words with accents)\n",
"TOKENRE = re.compile(r'\\b[-A-Za-z]{3,}\\b')\n",
"# A more inclusive alternative--any sequence of alphanumeric characters:\n",
"# TOKENRE = re.compile(r'\\b\\w+\\b')\n",
"# or, same for non-whitespace characters (includes punctuation):\n",
"# TOKENRE = re.compile(r'\\b\\S+\\b')\n",
"\n",
"logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\"\"\"Build vector space model.\"\"\"\n",
"filenames = [a for pattern in TEXTS\n",
" for a in glob.glob(pattern)]\n",
"\n",
"# Collect statistics about all tokens;\n",
"# extract lowercased tokens from plain text files.\n",
"dictionary = gensim.corpora.Dictionary(\n",
" TOKENRE.findall(io.open(filename, encoding=ENCODING).read().lower())\n",
" for filename in filenames)\n",
"\n",
"# remove stop words and words that appear only once / too many times\n",
"stop_ids = [dictionary.token2id[stopword]\n",
" for stopword in STOPWORDS\n",
" if stopword in dictionary.token2id]\n",
"dictionary.filter_tokens(stop_ids)\n",
"dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=40000)\n",
"\n",
"# remove gaps in id sequence after words that were removed\n",
"dictionary.compactify()\n",
"dictionary.save('model.dict')\n",
"corpus = (dictionary.doc2bow(\n",
" TOKENRE.findall(io.open(filename, encoding=ENCODING).read().lower()))\n",
" for filename in filenames)\n",
"gensim.corpora.MmCorpus.serialize('model.mm', corpus)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# load corpus\n",
"dictionary = gensim.corpora.Dictionary.load('model.dict')\n",
"corpus = gensim.corpora.MmCorpus('model.mm')\n",
"\n",
"# make topic model with online LDA\n",
"lda = gensim.models.ldamodel.LdaModel(\n",
" corpus=corpus, id2word=dictionary, num_topics=50,\n",
" update_every=0, chunksize=50, passes=10, eval_every=1, alpha='auto')\n",
"lda.save('model.lda')\n",
"\n",
"# apply tf-idf to BOW counts\n",
"tfidf = gensim.models.TfidfModel(corpus)\n",
"corpus = tfidf[corpus]\n",
"\n",
"# transform corpus to LSI space and index it\n",
"lsi = gensim.models.LsiModel(\n",
" corpus, id2word=dictionary, num_topics=50, chunksize=50)\n",
"index = gensim.similarities.Similarity(\n",
" None, lsi[corpus], num_features=lsi.num_terms)\n",
"tfidf.save('model.tfidf')\n",
"lsi.save('model.lsi')\n",
"index.save('model.index')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"filenames = [a for pattern in TEXTS\n",
" for a in glob.glob(pattern)]\n",
"\n",
"# load models\n",
"dictionary = gensim.corpora.Dictionary.load('model.dict')\n",
"index = gensim.similarities.MatrixSimilarity.load('model.index')\n",
"lsi = gensim.models.LsiModel.load('model.lsi')\n",
"lda = gensim.models.LdaModel.load('model.lda')\n",
"tfidf = gensim.models.TfidfModel.load('model.tfidf')\n",
"corpus = gensim.corpora.MmCorpus('model.mm')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 61
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def summary(vec_bow):\n",
" \"\"\"Helper function that compares a vector in both models.\"\"\"\n",
" doc_lda = lda[vec_bow]\n",
"\n",
" # convert the query to LSI space\n",
" vec_lsi = lsi[tfidf[vec_bow]]\n",
" sims = index[vec_lsi]\n",
" sims = sorted(enumerate(sims),\n",
" key=operator.itemgetter(1), reverse=True)\n",
"\n",
" print('LSI: most similar texts')\n",
" for doc, score in sims[:5]:\n",
" print(score, filenames[doc])\n",
" print()\n",
"\n",
" print('LSI: topics of this text (highest scoring first)')\n",
" for topicno, score in sorted(\n",
" vec_lsi, key=operator.itemgetter(1), reverse=True)[:5]:\n",
" print('%g: %s' % (score, lsi.print_topic(topicno)))\n",
" print()\n",
"\n",
" print('LDA: topics of this text (highest scoring first)')\n",
" for topicno, p in sorted(\n",
" doc_lda, key=operator.itemgetter(1), reverse=True)[:5]:\n",
" print('%g: %s' % (p, lda.print_topic(topicno)))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print('5 topics from LSI model:')\n",
"for n, line in enumerate(lsi.show_topics(\n",
" num_topics=5, num_words=10, formatted=True)):\n",
" print('Topic #%d: %s' % (n + 1, line))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"5 topics from LSI model:\n",
"Topic #1: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"Topic #2: 0.657*\"sapt\" + 0.401*\"rudolf\" + 0.246*\"rupert\" + 0.243*\"fritz\" + 0.240*\"strelsau\" + 0.177*\"rassendyll\" + 0.153*\"bernenstein\" + 0.152*\"zenda\" + -0.136*\"ivan\" + 0.106*\"flavia\"\n",
"Topic #3: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\"\n",
"Topic #4: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"Topic #5: -0.303*\"stewart\" + -0.275*\"thet\" + 0.220*\"marquis\" + 0.164*\"monsieur\" + 0.159*\"turner\" + -0.156*\"bland\" + -0.148*\"fer\" + -0.146*\"lawson\" + -0.146*\"prim\" + -0.143*\"cowboys\"\n"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print('\\ntopics from LDA model:')\n",
"for n, line in enumerate(lda.show_topics(\n",
" num_topics=25, num_words=10, formatted=True)):\n",
" print('Topic #%d: %s' % (n + 1, line))\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"topics from LDA model:\n",
"Topic #1: 0.000*oliver + 0.000*drake + 0.000*lionel + 0.000*billy + 0.000*dick + 0.000*henry + 0.000*harold + 0.000*virginia + 0.000*sapt + 0.000*steve"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Topic #2: 0.000*lionel + 0.000*denis + 0.000*sapt + 0.000*hendricks + 0.000*percy + 0.000*rupert + 0.000*harold + 0.000*rudolf + 0.000*virginia + 0.000*martin\n",
"Topic #3: 0.000*drake + 0.000*steve + 0.000*dick + 0.000*billy + 0.000*lionel + 0.000*martin + 0.000*marcus + 0.000*rudolf + 0.000*ain + 0.000*rupert\n",
"Topic #4: 0.000*stewart + 0.000*rudolf + 0.000*rupert + 0.000*steve + 0.000*lionel + 0.000*majesty + 0.000*sapt + 0.000*florence + 0.000*ain + 0.000*cowboys\n",
"Topic #5: 0.000*drake + 0.000*dick + 0.000*rupert + 0.000*rudolf + 0.000*stewart + 0.000*marcus + 0.000*percy + 0.000*lionel + 0.000*ain + 0.000*colonel\n",
"Topic #6: 0.000*billy + 0.000*steve + 0.000*byrne + 0.000*stewart + 0.000*virginia + 0.000*sapt + 0.000*ain + 0.000*rudolf + 0.000*marcus + 0.000*barbara\n",
"Topic #7: 0.000*steve + 0.000*ralph + 0.000*cliff + 0.000*edna + 0.000*lionel + 0.000*horn + 0.000*drake + 0.000*paris + 0.000*percy + 0.000*france\n",
"Topic #8: 0.000*billy + 0.000*dick + 0.000*drake + 0.000*byrne + 0.000*ain + 0.000*steve + 0.000*monsieur + 0.000*joan + 0.000*barbara + 0.000*stewart\n",
"Topic #9: 0.000*lionel + 0.000*drake + 0.000*stewart + 0.000*denis + 0.000*martin + 0.000*percy + 0.000*dick + 0.000*monsieur + 0.000*indians + 0.000*kitty\n",
"Topic #10: 0.000*billy + 0.000*stewart + 0.000*byrne + 0.000*kid + 0.000*dick + 0.000*ain + 0.000*harding + 0.000*barbara + 0.000*lionel + 0.000*commander\n",
"Topic #11: 0.000*thet + 0.000*hagar + 0.000*bland + 0.000*billy + 0.000*ain + 0.000*buck + 0.000*outlaws + 0.000*lionel + 0.000*reckon + 0.000*outlaw\n",
"Topic #12: 0.001*stewart + 0.000*majesty + 0.000*sapt + 0.000*billy + 0.000*thet + 0.000*rupert + 0.000*cowboys + 0.000*rudolf + 0.000*florence + 0.000*indians\n",
"Topic #13: 0.017*france + 0.017*turner + 0.016*marquis + 0.016*loo + 0.011*monsieur + 0.010*miriam + 0.008*madame + 0.008*pierre + 0.008*lawrence + 0.007*paris\n",
"Topic #14: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut\n",
"Topic #15: 0.063*billy + 0.029*byrne + 0.017*harding + 0.017*barbara + 0.012*mucker + 0.008*grayson + 0.007*eddie + 0.006*ain + 0.005*ward + 0.005*youse\n",
"Topic #16: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm\n",
"Topic #17: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword\n",
"Topic #18: 0.026*adam + 0.015*marion + 0.014*robin + 0.013*allan + 0.013*hagar + 0.006*adelaide + 0.005*baron + 0.003*castle + 0.003*carleton + 0.003*corn\n",
"Topic #19: 0.020*marcus + 0.014*dentist + 0.014*harold + 0.013*virginia + 0.010*maria + 0.007*mac + 0.006*dollars + 0.006*ain + 0.005*baker + 0.003*joe\n",
"Topic #20: 0.119*oliver + 0.011*suzanne + 0.011*emma + 0.008*verdi + 0.006*myron + 0.005*beach + 0.004*hospital + 0.004*guy + 0.004*richard + 0.004*someone\n",
"Topic #21: 0.009*hamilton + 0.005*leary + 0.004*paris + 0.004*jane + 0.004*guy + 0.003*harry + 0.003*colonel + 0.003*tom + 0.002*regiment + 0.002*lordship\n",
"Topic #22: 0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe\n",
"Topic #23: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma\n",
"Topic #24: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess\n",
"Topic #25: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery\n",
"\n"
]
}
],
"prompt_number": 64
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# show similarities / topics of 20 texts the models were based on\n",
"for n, vec_bow in enumerate(islice(corpus, 20)):\n",
" print('Text:', filenames[n])\n",
" print(io.open(filenames[n], encoding=ENCODING).read(500), '[...]')\n",
" print('-' * 50)\n",
" summary(vec_bow)\n",
" print('=' * 50, '\\n\\n')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Text: corpus/Adventure/FOLD1/18857.txt\n",
"Project Gutenberg's A Journey to the Centre of the Earth, by Jules Verne\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: A Journey to the Centre of the Earth\n",
"\n",
"Author: Jules Verne\n",
"\n",
"Release Date: July 18, 2006 [EBook #18857]\n",
"Last updated: December 27, 2012\n",
"\n",
"Language: English\n",
"\n",
"Character set [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/18857.txt\n",
"0.0665643 corpus/Adventure/FOLD1/2727.txt\n",
"0.0665143 corpus/Fiction/FOLD1/5240.txt\n",
"0.0619348 corpus/Adventure/FOLD1/21459.txt\n",
"0.0573205 corpus/Adventure/FOLD1/2166.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.478681: 0.378*\"hans\" + 0.342*\"marshall\" + -0.213*\"oliver\" + 0.184*\"laura\" + 0.181*\"livingstone\" + 0.179*\"senator\" + 0.167*\"consul\" + -0.156*\"bishop\" + 0.151*\"commander\" + -0.132*\"lordship\"\n",
"0.366615: 0.282*\"hans\" + -0.228*\"joan\" + 0.224*\"adam\" + 0.210*\"kirkland\" + -0.206*\"bart\" + -0.201*\"willy\" + -0.174*\"thee\" + 0.159*\"marion\" + -0.157*\"marshall\" + -0.153*\"kate\"\n",
"0.200951: 0.679*\"oliver\" + -0.383*\"hagar\" + -0.238*\"adam\" + 0.159*\"hans\" + -0.158*\"hamilton\" + 0.136*\"emma\" + -0.132*\"robin\" + -0.124*\"marcus\" + -0.111*\"dentist\" + -0.101*\"thee\"\n",
"0.198607: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.190409: -0.566*\"hamilton\" + 0.292*\"marcus\" + 0.266*\"dentist\" + -0.214*\"willy\" + -0.176*\"oliver\" + -0.174*\"adam\" + 0.144*\"hans\" + 0.139*\"maria\" + 0.127*\"mac\" + -0.113*\"camel\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.99927: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/393.txt\n",
"\ufeffThe Project Gutenberg EBook of The Blue Lagoon, by H. de Vere Stacpoole\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: The Blue Lagoon\n",
" A Romance\n",
"\n",
"Author: H. de Vere Stacpoole\n",
"\n",
"Release Date: January 19, 2008 [EBook #393]\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THIS PROJECT GUTENBERG EBO [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/393.txt\n",
"0.158348 corpus/Fiction/FOLD1/22961.txt\n",
"0.136583 corpus/Adventure/FOLD1/21459.txt\n",
"0.0699755 corpus/Fiction/FOLD1/525.txt\n",
"0.0324906 corpus/Adventure/FOLD1/1965.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.271972: 0.271*\"drake\" + 0.251*\"dick\" + 0.212*\"marquis\" + -0.168*\"marshall\" + 0.160*\"turner\" + 0.150*\"kitty\" + -0.136*\"willy\" + 0.134*\"steve\" + 0.127*\"stella\" + -0.124*\"lionel\"\n",
"0.193956: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.182906: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.176242: -0.398*\"drake\" + -0.166*\"hong\" + -0.163*\"kong\" + 0.159*\"laura\" + -0.155*\"bombay\" + -0.155*\"thee\" + -0.150*\"billy\" + 0.140*\"prim\" + -0.124*\"francis\" + 0.123*\"burton\"\n",
"0.112625: 0.364*\"kitty\" + -0.337*\"steve\" + 0.238*\"thee\" + 0.229*\"grandma\" + -0.213*\"drake\" + 0.197*\"marion\" + -0.154*\"macumazahn\" + -0.137*\"laura\" + 0.128*\"lionel\" + -0.121*\"emma\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.999863: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/24695.txt\n",
"The Project Gutenberg EBook of The Snowshoe Trail, by Edison Marshall\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: The Snowshoe Trail\n",
"\n",
"Author: Edison Marshall\n",
"\n",
"Posting Date: March 8, 2009 [EBook #24695]\n",
"Release Date: February 26, 2008\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: ASCII\n",
"\n",
" [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/24695.txt\n",
"0.0496626 corpus/Adventure/FOLD1/21459.txt\n",
"0.0240968 corpus/Fiction/FOLD1/1897.txt\n",
"0.0235749 corpus/Adventure/FOLD1/23662.txt\n",
"0.0173788 corpus/Fiction/FOLD1/5240.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.443732: 0.325*\"harold\" + 0.277*\"virginia\" + -0.277*\"hamilton\" + -0.201*\"joan\" + -0.191*\"marcus\" + -0.190*\"edna\" + -0.181*\"bart\" + -0.174*\"dentist\" + 0.155*\"commander\" + -0.131*\"kate\"\n",
"0.247424: 0.267*\"billy\" + 0.219*\"byrne\" + 0.193*\"harold\" + -0.179*\"joan\" + 0.162*\"virginia\" + -0.161*\"bart\" + -0.155*\"willy\" + 0.152*\"edna\" + 0.146*\"hans\" + 0.141*\"kitty\"\n",
"0.232661: -0.411*\"steve\" + 0.246*\"kitty\" + 0.246*\"drake\" + -0.225*\"marion\" + -0.197*\"billy\" + 0.184*\"prim\" + 0.181*\"harold\" + 0.165*\"stella\" + 0.165*\"burton\" + 0.156*\"grandma\"\n",
"0.164141: 0.515*\"steve\" + 0.374*\"kitty\" + 0.237*\"grandma\" + -0.199*\"laura\" + -0.175*\"thee\" + -0.164*\"drake\" + -0.150*\"stewart\" + -0.126*\"emma\" + -0.118*\"hagar\" + -0.114*\"dick\"\n",
"0.163801: 0.279*\"marcus\" + 0.254*\"dentist\" + 0.209*\"marshall\" + -0.199*\"hans\" + -0.189*\"hamilton\" + 0.183*\"oliver\" + 0.177*\"martin\" + -0.163*\"prim\" + -0.156*\"steve\" + 0.154*\"kirkland\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.981579: 0.020*marcus + 0.014*dentist + 0.014*harold + 0.013*virginia + 0.010*maria + 0.007*mac + 0.006*dollars + 0.006*ain + 0.005*baker + 0.003*joe\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/95.txt\n",
"\ufeffThe Project Gutenberg EBook of The Prisoner of Zenda, by Anthony Hope\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: The Prisoner of Zenda\n",
"\n",
"Author: Anthony Hope\n",
"\n",
"Release Date: January 10, 2006 [EBook #95]\n",
"[This file last updated October 6, 2010]\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THIS PR [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/95.txt\n",
"0.677422 corpus/Adventure/FOLD1/1145.txt\n",
"0.0266588 corpus/Fiction/FOLD1/5240.txt\n",
"0.0255197 corpus/Adventure/FOLD1/21393.txt\n",
"0.0176788 corpus/Adventure/FOLD1/10368.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.870599: 0.657*\"sapt\" + 0.401*\"rudolf\" + 0.246*\"rupert\" + 0.243*\"fritz\" + 0.240*\"strelsau\" + 0.177*\"rassendyll\" + 0.153*\"bernenstein\" + 0.152*\"zenda\" + -0.136*\"ivan\" + 0.106*\"flavia\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.401136: -0.600*\"rudolf\" + -0.349*\"bernenstein\" + 0.337*\"fritz\" + -0.279*\"rassendyll\" + 0.263*\"sapt\" + 0.200*\"flavia\" + 0.163*\"duke\" + 0.131*\"johann\" + 0.130*\"zenda\" + -0.125*\"constable\"\n",
"0.27622: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.0376122: -0.592*\"lionel\" + -0.274*\"denis\" + -0.234*\"percy\" + 0.197*\"stella\" + 0.171*\"macumazahn\" + -0.171*\"hendricks\" + 0.166*\"henry\" + 0.156*\"thee\" + -0.142*\"moore\" + -0.133*\"dick\"\n",
"0.0161147: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.995377: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess\n",
"================================================== \n",
"\n",
"\n",
"Text: corpus/Adventure/FOLD1/10368.txt\n",
"The Project Gutenberg EBook of The Vizier of the Two-Horned Alexander\n",
"by Frank R. Stockton\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: The Vizier of the Two-Horned Alexander\n",
"\n",
"Author: Frank R. Stockton\n",
"\n",
"Release Date: December 2, 2003 [EBook #10368]\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THI [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/10368.txt\n",
"0.146627 corpus/Adventure/FOLD1/2166.txt\n",
"0.112932 corpus/Fiction/FOLD1/243.txt\n",
"0.0710879 corpus/Fiction/FOLD1/15181.txt\n",
"0.0575988 corpus/Fiction/FOLD1/5240.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.393863: 0.364*\"kitty\" + -0.337*\"steve\" + 0.238*\"thee\" + 0.229*\"grandma\" + -0.213*\"drake\" + 0.197*\"marion\" + -0.154*\"macumazahn\" + -0.137*\"laura\" + 0.128*\"lionel\" + -0.121*\"emma\"\n",
"0.294297: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.25737: -0.592*\"lionel\" + -0.274*\"denis\" + -0.234*\"percy\" + 0.197*\"stella\" + 0.171*\"macumazahn\" + -0.171*\"hendricks\" + 0.166*\"henry\" + 0.156*\"thee\" + -0.142*\"moore\" + -0.133*\"dick\"\n",
"0.150428: -0.279*\"martin\" + -0.225*\"barney\" + -0.214*\"commander\" + 0.213*\"willy\" + -0.184*\"bishop\" + -0.160*\"lordship\" + 0.151*\"marcus\" + -0.143*\"hagar\" + 0.140*\"kirkland\" + 0.138*\"dentist\"\n",
"0.144327: -0.234*\"leary\" + 0.233*\"denis\" + 0.204*\"percy\" + -0.202*\"lionel\" + 0.197*\"bishop\" + 0.179*\"stewart\" + 0.162*\"hendricks\" + -0.145*\"henry\" + -0.141*\"jane\" + -0.118*\"harry\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.727034: 0.013*thee + 0.010*steamer + 0.007*detective + 0.007*hong + 0.006*kong + 0.006*bombay + 0.006*passengers + 0.006*francis + 0.004*yokohama + 0.004*india\n",
"0.14788: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma\n",
"0.0518649: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery\n",
"0.0479786: 0.029*edna + 0.026*cliff + 0.025*horn + 0.025*ralph + 0.012*mound + 0.012*burke + 0.012*shirley + 0.011*bags + 0.009*cave + 0.009*banker"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.0116218: 0.054*marshall + 0.028*senator + 0.025*consul + 0.022*livingstone + 0.016*admiral + 0.012*porto + 0.009*president + 0.009*wireless + 0.008*las + 0.007*henry\n",
"================================================== \n",
"\n",
"\n",
"Text: corpus/Adventure/FOLD1/21459.txt\n",
"The Project Gutenberg EBook of Dick Onslow, by W.H.G. Kingston\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Dick Onslow\n",
" Among the Redskins\n",
"\n",
"Author: W.H.G. Kingston\n",
"\n",
"Illustrator: George Soper\n",
"\n",
"Release Date: May 15, 2007 [EBook #21459]\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: A [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/21459.txt\n",
"0.136583 corpus/Adventure/FOLD1/393.txt\n",
"0.101856 corpus/Fiction/FOLD1/5240.txt\n",
"0.0715007 corpus/Adventure/FOLD1/2166.txt\n",
"0.0619348 corpus/Adventure/FOLD1/18857.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.377979: -0.238*\"steve\" + -0.218*\"drake\" + -0.189*\"hans\" + 0.189*\"billy\" + -0.165*\"prim\" + 0.148*\"byrne\" + -0.147*\"burton\" + 0.144*\"kitty\" + -0.135*\"marion\" + -0.134*\"kid\"\n",
"0.278085: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.237038: 0.365*\"drake\" + -0.276*\"lagoon\" + -0.195*\"dick\" + -0.193*\"paddy\" + -0.192*\"lestrange\" + 0.153*\"leary\" + 0.145*\"stewart\" + -0.143*\"bishop\" + -0.140*\"reef\" + 0.112*\"marshall\"\n",
"0.216762: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\"\n",
"0.205202: -0.398*\"drake\" + -0.166*\"hong\" + -0.163*\"kong\" + 0.159*\"laura\" + -0.155*\"bombay\" + -0.155*\"thee\" + -0.150*\"billy\" + 0.140*\"prim\" + -0.124*\"francis\" + 0.123*\"burton\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.984055: 0.014*indians + 0.013*rifle + 0.009*tent + 0.006*delaware + 0.005*wolves + 0.005*dick + 0.005*sam + 0.004*pole + 0.004*rifles + 0.004*wagon\n",
"0.0157949: 0.003*skulls + 0.003*flayed + 0.002*rabbits + 0.002*ramrod + 0.002*brag + 0.002*shrouded + 0.002*affording + 0.002*horde + 0.002*seventies + 0.002*equivalent\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/23662.txt\n",
"The Project Gutenberg EBook of The Heart of Unaga, by Ridgwell Cullum\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: The Heart of Unaga\n",
"\n",
"Author: Ridgwell Cullum\n",
"\n",
"Release Date: November 30, 2007 [EBook #23662]\n",
"Last Updated: January 14, 2009\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: ASCI [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/23662.txt\n",
"0.0921901 corpus/Adventure/FOLD1/15072.txt\n",
"0.0491475 corpus/Adventure/FOLD1/21459.txt\n",
"0.0277324 corpus/Fiction/FOLD1/363.txt\n",
"0.0238107 corpus/Fiction/FOLD1/5240.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.551729: 0.515*\"steve\" + 0.374*\"kitty\" + 0.237*\"grandma\" + -0.199*\"laura\" + -0.175*\"thee\" + -0.164*\"drake\" + -0.150*\"stewart\" + -0.126*\"emma\" + -0.118*\"hagar\" + -0.114*\"dick\"\n",
"0.139929: 0.271*\"drake\" + 0.251*\"dick\" + 0.212*\"marquis\" + -0.168*\"marshall\" + 0.160*\"turner\" + 0.150*\"kitty\" + -0.136*\"willy\" + 0.134*\"steve\" + 0.127*\"stella\" + -0.124*\"lionel\"\n",
"0.113773: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.0908025: 0.325*\"harold\" + 0.277*\"virginia\" + -0.277*\"hamilton\" + -0.201*\"joan\" + -0.191*\"marcus\" + -0.190*\"edna\" + -0.181*\"bart\" + -0.174*\"dentist\" + 0.155*\"commander\" + -0.131*\"kate\"\n",
"0.0846385: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.920782: 0.055*steve + 0.008*indians + 0.006*fort + 0.006*seal + 0.005*darn + 0.005*outfit + 0.005*ross + 0.005*folks + 0.005*feller + 0.005*squaw\n",
"0.0790382: 0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/1965.txt\n",
"\ufeffThe Project Gutenberg EBook of Captain Blood, by Rafael Sabatini\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Captain Blood\n",
"\n",
"Author: Rafael Sabatini\n",
"\n",
"Posting Date: September 26, 2008 [EBook #1965]\n",
"Release Date: November, 1999\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THIS PROJECT GUTENBERG EB [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/1965.txt\n",
"0.184564 corpus/Fiction/FOLD1/5240.txt\n",
"0.130528 corpus/Fiction/FOLD1/243.txt\n",
"0.0799009 corpus/Adventure/FOLD1/1947.txt\n",
"0.0713367 corpus/Fiction/FOLD1/1762.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.368183: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.33753: -0.234*\"leary\" + 0.233*\"denis\" + 0.204*\"percy\" + -0.202*\"lionel\" + 0.197*\"bishop\" + 0.179*\"stewart\" + 0.162*\"hendricks\" + -0.145*\"henry\" + -0.141*\"jane\" + -0.118*\"harry\"\n",
"0.226162: 0.371*\"lionel\" + -0.327*\"denis\" + -0.242*\"percy\" + 0.215*\"henry\" + -0.197*\"hendricks\" + -0.145*\"leary\" + 0.137*\"bishop\" + 0.133*\"moore\" + 0.126*\"maurice\" + -0.126*\"lagoon\"\n",
"0.198903: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"0.191072: 0.284*\"marshall\" + -0.266*\"martin\" + -0.257*\"commander\" + 0.212*\"billy\" + -0.208*\"barney\" + 0.173*\"marion\" + 0.172*\"kirkland\" + 0.165*\"byrne\" + 0.163*\"drake\" + 0.152*\"livingstone\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.769824: 0.011*joan + 0.010*bart + 0.010*bishop + 0.009*kate + 0.007*satan + 0.007*colonel + 0.006*lordship + 0.006*buck + 0.006*haines + 0.005*sheriff\n",
"0.223622: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/13290.txt\n",
"The Project Gutenberg EBook of Martin Rattler, by Robert Michael Ballantyne\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: Martin Rattler\n",
"\n",
"Author: Robert Michael Ballantyne\n",
"\n",
"Release Date: August 25, 2004 [EBook #13290]\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THIS PROJECT GUTENBERG EBOOK MARTIN [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/13290.txt\n",
"0.0441942 corpus/Adventure/FOLD1/24091.txt\n",
"0.0437165 corpus/Fiction/FOLD1/5240.txt\n",
"0.0340096 corpus/Adventure/FOLD1/21459.txt\n",
"0.0157667 corpus/Adventure/FOLD1/1965.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.599935: -0.478*\"commander\" + 0.431*\"martin\" + 0.386*\"barney\" + -0.190*\"edna\" + -0.157*\"empire\" + -0.147*\"kirkland\" + -0.137*\"ralph\" + -0.124*\"viceroy\" + 0.105*\"marshall\" + -0.101*\"steve\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.304325: 0.309*\"billy\" + -0.262*\"hamilton\" + 0.235*\"byrne\" + 0.235*\"marion\" + 0.234*\"commander\" + 0.233*\"martin\" + -0.206*\"marshall\" + 0.182*\"barney\" + 0.164*\"allan\" + 0.157*\"hagar\"\n",
"0.256847: -0.457*\"harold\" + -0.387*\"virginia\" + 0.221*\"commander\" + 0.201*\"martin\" + 0.194*\"kitty\" + -0.192*\"hans\" + -0.160*\"hagar\" + 0.154*\"barney\" + -0.154*\"marion\" + -0.148*\"joan\"\n",
"0.234655: 0.279*\"marcus\" + 0.254*\"dentist\" + 0.209*\"marshall\" + -0.199*\"hans\" + -0.189*\"hamilton\" + 0.183*\"oliver\" + 0.177*\"martin\" + -0.163*\"prim\" + -0.156*\"steve\" + 0.154*\"kirkland\"\n",
"0.151521: 0.342*\"edna\" + 0.255*\"ralph\" + -0.248*\"commander\" + -0.202*\"hamilton\" + -0.180*\"marcus\" + 0.171*\"kirkland\" + -0.170*\"marion\" + -0.163*\"dentist\" + 0.160*\"hagar\" + -0.159*\"joan\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.997661: 0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/12190.txt\n",
"Project Gutenberg's The Adventures of Captain Horn, by Frank Richard Stockton\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: The Adventures of Captain Horn\n",
"\n",
"Author: Frank Richard Stockton\n",
"\n",
"Release Date: April 29, 2004 [EBook #12190]\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: ASCII\n",
"\n",
"*** [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/12190.txt\n",
"0.0553897 corpus/Adventure/FOLD1/103.txt\n",
"0.0436207 corpus/Fiction/FOLD1/5240.txt\n",
"0.0396242 corpus/Adventure/FOLD1/18399.txt\n",
"0.0360836 corpus/Fiction/FOLD1/525.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.517421: 0.342*\"edna\" + 0.255*\"ralph\" + -0.248*\"commander\" + -0.202*\"hamilton\" + -0.180*\"marcus\" + 0.171*\"kirkland\" + -0.170*\"marion\" + -0.163*\"dentist\" + 0.160*\"hagar\" + -0.159*\"joan\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.272126: -0.537*\"willy\" + 0.213*\"joan\" + 0.193*\"bart\" + 0.182*\"edna\" + -0.170*\"hongkong\" + 0.160*\"ralph\" + 0.144*\"kate\" + -0.143*\"helmsman\" + -0.139*\"chinamen\" + 0.118*\"hong\"\n",
"0.246315: -0.332*\"kirkland\" + 0.256*\"prim\" + 0.226*\"burton\" + -0.225*\"mortimer\" + 0.193*\"kid\" + -0.192*\"hans\" + 0.191*\"marcus\" + 0.174*\"dentist\" + 0.164*\"edna\" + -0.147*\"joan\"\n",
"0.23826: 0.288*\"marion\" + 0.221*\"willy\" + 0.221*\"hagar\" + -0.203*\"billy\" + 0.202*\"allan\" + -0.182*\"kirkland\" + -0.157*\"commander\" + -0.153*\"byrne\" + -0.148*\"adam\" + 0.147*\"edna\"\n",
"0.216545: 0.267*\"billy\" + 0.219*\"byrne\" + 0.193*\"harold\" + -0.179*\"joan\" + 0.162*\"virginia\" + -0.161*\"bart\" + -0.155*\"willy\" + 0.152*\"edna\" + 0.146*\"hans\" + 0.141*\"kitty\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.996734: 0.029*edna + 0.026*cliff + 0.025*horn + 0.025*ralph + 0.012*mound + 0.012*burke + 0.012*shirley + 0.011*bags + 0.009*cave + 0.009*banker\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/21393.txt\n",
"The Project Gutenberg EBook of Hendricks the Hunter, by W.H.G. Kingston\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Hendricks the Hunter\n",
" The Border Farm, a Tale of Zululand\n",
"\n",
"Author: W.H.G. Kingston\n",
"\n",
"Release Date: May 8, 2007 [EBook #21393]\n",
"\n",
"Language: English\n",
"\n",
"Character set enco [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/21393.txt\n",
"0.232275 corpus/Fiction/FOLD1/16217.txt\n",
"0.087352 corpus/Adventure/FOLD1/2727.txt\n",
"0.055515 corpus/Adventure/FOLD1/2166.txt\n",
"0.038622 corpus/Fiction/FOLD1/5240.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.332482: -0.234*\"leary\" + 0.233*\"denis\" + 0.204*\"percy\" + -0.202*\"lionel\" + 0.197*\"bishop\" + 0.179*\"stewart\" + 0.162*\"hendricks\" + -0.145*\"henry\" + -0.141*\"jane\" + -0.118*\"harry\"\n",
"0.209731: -0.432*\"dick\" + -0.347*\"drake\" + -0.207*\"lagoon\" + 0.185*\"stewart\" + 0.167*\"lionel\" + 0.154*\"thet\" + 0.147*\"marquis\" + -0.146*\"paddy\" + -0.127*\"lestrange\" + -0.115*\"reef\"\n",
"0.166803: -0.329*\"stella\" + 0.297*\"henry\" + -0.292*\"turner\" + 0.252*\"aline\" + 0.170*\"andre\" + -0.161*\"miriam\" + 0.147*\"tour\" + 0.146*\"thou\" + -0.142*\"baboons\" + 0.142*\"kitty\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.166365: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\"\n",
"0.152659: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.999759: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/103.txt\n",
"\ufeffThe Project Gutenberg EBook of Around the World in 80 Days, by Jules Verne\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: Around the World in 80 Days\n",
"\n",
"Author: Jules Verne\n",
"\n",
"Release Date: May 15, 2008 [EBook #103]\n",
"Last updated: February 18, 2012\n",
"Last updated: May 5, 2012\n",
"\n",
"Language: Englis [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/103.txt\n",
"0.0975255 corpus/Fiction/FOLD1/5240.txt\n",
"0.0947979 corpus/Fiction/FOLD1/525.txt\n",
"0.0724676 corpus/Adventure/FOLD1/1947.txt\n",
"0.0593174 corpus/Adventure/FOLD1/1965.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.345129: -0.537*\"willy\" + 0.213*\"joan\" + 0.193*\"bart\" + 0.182*\"edna\" + -0.170*\"hongkong\" + 0.160*\"ralph\" + 0.144*\"kate\" + -0.143*\"helmsman\" + -0.139*\"chinamen\" + 0.118*\"hong\"\n",
"0.314399: -0.238*\"steve\" + -0.218*\"drake\" + -0.189*\"hans\" + 0.189*\"billy\" + -0.165*\"prim\" + 0.148*\"byrne\" + -0.147*\"burton\" + 0.144*\"kitty\" + -0.135*\"marion\" + -0.134*\"kid\"\n",
"0.274888: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.220694: 0.325*\"harold\" + 0.277*\"virginia\" + -0.277*\"hamilton\" + -0.201*\"joan\" + -0.191*\"marcus\" + -0.190*\"edna\" + -0.181*\"bart\" + -0.174*\"dentist\" + 0.155*\"commander\" + -0.131*\"kate\"\n",
"0.190412: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.80322: 0.013*thee + 0.010*steamer + 0.007*detective + 0.007*hong + 0.006*kong + 0.006*bombay + 0.006*passengers + 0.006*francis + 0.004*yokohama + 0.004*india\n",
"0.0552566: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery\n",
"0.0481754: 0.007*deck + 0.007*steamer + 0.006*boats + 0.005*marcus + 0.004*coal + 0.004*dollars + 0.004*dentist + 0.004*cargo + 0.003*skipper + 0.003*aft\n",
"0.0355613: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.0252097: 0.038*kid + 0.026*prim + 0.016*burton + 0.009*car + 0.009*charlie + 0.009*ain + 0.008*dopey + 0.007*detective + 0.007*beppo + 0.007*pilot\n",
"================================================== \n",
"\n",
"\n",
"Text: corpus/Adventure/FOLD1/15072.txt\n",
"The Project Gutenberg eBook, Marjorie's Maytime, by Carolyn Wells\n",
"\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"Title: Marjorie's Maytime\n",
"\n",
"Author: Carolyn Wells\n",
"\n",
"Release Date: February 15, 2005 [eBook #15072]\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: ISO-646-US (US-ASCII)\n",
"\n",
"\n",
"***START OF [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/15072.txt\n",
"0.0970604 corpus/Adventure/FOLD1/2727.txt\n",
"0.0921901 corpus/Adventure/FOLD1/23662.txt\n",
"0.0231356 corpus/Fiction/FOLD1/1897.txt\n",
"0.0213549 corpus/Adventure/FOLD1/21459.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.543896: 0.515*\"steve\" + 0.374*\"kitty\" + 0.237*\"grandma\" + -0.199*\"laura\" + -0.175*\"thee\" + -0.164*\"drake\" + -0.150*\"stewart\" + -0.126*\"emma\" + -0.118*\"hagar\" + -0.114*\"dick\"\n",
"0.419478: 0.364*\"kitty\" + -0.337*\"steve\" + 0.238*\"thee\" + 0.229*\"grandma\" + -0.213*\"drake\" + 0.197*\"marion\" + -0.154*\"macumazahn\" + -0.137*\"laura\" + 0.128*\"lionel\" + -0.121*\"emma\"\n",
"0.302999: -0.411*\"steve\" + 0.246*\"kitty\" + 0.246*\"drake\" + -0.225*\"marion\" + -0.197*\"billy\" + 0.184*\"prim\" + 0.181*\"harold\" + 0.165*\"stella\" + 0.165*\"burton\" + 0.156*\"grandma\"\n",
"0.261049: -0.457*\"harold\" + -0.387*\"virginia\" + 0.221*\"commander\" + 0.201*\"martin\" + 0.194*\"kitty\" + -0.192*\"hans\" + -0.160*\"hagar\" + 0.154*\"barney\" + -0.154*\"marion\" + -0.148*\"joan\"\n",
"0.224654: 0.271*\"drake\" + 0.251*\"dick\" + 0.212*\"marquis\" + -0.168*\"marshall\" + 0.160*\"turner\" + 0.150*\"kitty\" + -0.136*\"willy\" + 0.134*\"steve\" + 0.127*\"stella\" + -0.124*\"lionel\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.973059: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess\n",
"0.0149565: 0.014*indians + 0.013*rifle + 0.009*tent + 0.006*delaware + 0.005*wolves + 0.005*dick + 0.005*sam + 0.004*pole + 0.004*rifles + 0.004*wagon\n",
"0.0117751: 0.119*oliver + 0.011*suzanne + 0.011*emma + 0.008*verdi + 0.006*myron + 0.005*beach + 0.004*hospital + 0.004*guy + 0.004*richard + 0.004*someone\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/18399.txt\n",
"The Project Gutenberg eBook, The Shipwreck, by Joseph Spillman, Translated\n",
"by Mary Richards Gray\n",
"\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"Title: The Shipwreck\n",
" A Story for the Young\n",
"\n",
"\n",
"Author: Joseph Spillman\n",
"\n",
"\n",
"\n",
"Release Date: May 16, 2006 [eBook #18399]\n",
"\n",
"Language: English\n",
"\n",
"Chara [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/18399.txt\n",
"0.0484134 corpus/Fiction/FOLD1/525.txt\n",
"0.0396242 corpus/Adventure/FOLD1/12190.txt\n",
"0.0321986 corpus/Adventure/FOLD1/393.txt\n",
"0.0255324 corpus/Adventure/FOLD1/103.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.283363: 0.288*\"marion\" + 0.221*\"willy\" + 0.221*\"hagar\" + -0.203*\"billy\" + 0.202*\"allan\" + -0.182*\"kirkland\" + -0.157*\"commander\" + -0.153*\"byrne\" + -0.148*\"adam\" + 0.147*\"edna\"\n",
"0.276646: -0.279*\"martin\" + -0.225*\"barney\" + -0.214*\"commander\" + 0.213*\"willy\" + -0.184*\"bishop\" + -0.160*\"lordship\" + 0.151*\"marcus\" + -0.143*\"hagar\" + 0.140*\"kirkland\" + 0.138*\"dentist\"\n",
"0.137868: 0.309*\"billy\" + -0.262*\"hamilton\" + 0.235*\"byrne\" + 0.235*\"marion\" + 0.234*\"commander\" + 0.233*\"martin\" + -0.206*\"marshall\" + 0.182*\"barney\" + 0.164*\"allan\" + 0.157*\"hagar\"\n",
"0.126233: -0.361*\"laura\" + -0.321*\"oliver\" + -0.297*\"hagar\" + -0.266*\"emma\" + -0.248*\"steve\" + -0.187*\"mamma\" + -0.163*\"marion\" + -0.147*\"governess\" + -0.146*\"kitty\" + -0.129*\"abbey\"\n",
"0.111108: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.527788: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery\n",
"0.368154: 0.015*willy + 0.008*peter + 0.007*roubles + 0.006*ivan + 0.006*peasant + 0.006*holy + 0.006*peasants + 0.006*priest + 0.005*cell + 0.005*eugene\n",
"0.100204: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/1145.txt\n",
"\ufeffThe Project Gutenberg EBook of Rupert of Hentzau, by Anthony Hope\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Rupert of Hentzau\n",
" From The Memoirs of Fritz Von Tarlenheim: The Sequel to\n",
" The Prisoner of Zenda\n",
"\n",
"Author: Anthony Hope\n",
"\n",
"Posting Date: August 3, 2008 [EBook #1145 [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/1145.txt\n",
"0.677422 corpus/Adventure/FOLD1/95.txt\n",
"0.0341073 corpus/Adventure/FOLD1/21393.txt\n",
"0.0176074 corpus/Fiction/FOLD1/5240.txt\n",
"0.0173433 corpus/Adventure/FOLD1/1965.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.874067: 0.657*\"sapt\" + 0.401*\"rudolf\" + 0.246*\"rupert\" + 0.243*\"fritz\" + 0.240*\"strelsau\" + 0.177*\"rassendyll\" + 0.153*\"bernenstein\" + 0.152*\"zenda\" + -0.136*\"ivan\" + 0.106*\"flavia\"\n",
"0.265833: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.0289141: -0.592*\"lionel\" + -0.274*\"denis\" + -0.234*\"percy\" + 0.197*\"stella\" + 0.171*\"macumazahn\" + -0.171*\"hendricks\" + 0.166*\"henry\" + 0.156*\"thee\" + -0.142*\"moore\" + -0.133*\"dick\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"0.0128141: 0.371*\"lionel\" + -0.327*\"denis\" + -0.242*\"percy\" + 0.215*\"henry\" + -0.197*\"hendricks\" + -0.145*\"leary\" + 0.137*\"bishop\" + 0.133*\"moore\" + 0.126*\"maurice\" + -0.126*\"lagoon\"\n",
"0.0102128: -0.361*\"laura\" + -0.321*\"oliver\" + -0.297*\"hagar\" + -0.266*\"emma\" + -0.248*\"steve\" + -0.187*\"mamma\" + -0.163*\"marion\" + -0.147*\"governess\" + -0.146*\"kitty\" + -0.129*\"abbey\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.9054: 0.059*rudolf + 0.047*sapt + 0.040*rupert + 0.027*bernenstein + 0.026*rassendyll + 0.017*strelsau + 0.015*james + 0.013*constable + 0.010*hentzau + 0.009*fritz\n",
"0.0667251: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess\n",
"0.0234222: 0.013*kirkland + 0.009*mortimer + 0.008*farm + 0.007*egypt + 0.006*lone + 0.006*messengers + 0.004*cable + 0.004*nile + 0.004*pond + 0.004*terrace\n",
"================================================== \n",
"\n",
"\n",
"Text: corpus/Adventure/FOLD1/24091.txt\n",
"The Project Gutenberg EBook of Despoilers of the Golden Empire, by \n",
"Gordon Randall Garrett\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Despoilers of the Golden Empire\n",
"\n",
"Author: Gordon Randall Garrett\n",
"\n",
"Illustrator: Kelly Freas\n",
"\n",
"Release Date: December 31, 2007 [EBook #24091]\n",
"\n",
"Language: E [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/24091.txt\n",
"0.0441942 corpus/Adventure/FOLD1/13290.txt\n",
"0.0326124 corpus/Adventure/FOLD1/10368.txt\n",
"0.0295066 corpus/Adventure/FOLD1/1965.txt\n",
"0.0269335 corpus/Adventure/FOLD1/21459.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.297073: 0.309*\"billy\" + -0.262*\"hamilton\" + 0.235*\"byrne\" + 0.235*\"marion\" + 0.234*\"commander\" + 0.233*\"martin\" + -0.206*\"marshall\" + 0.182*\"barney\" + 0.164*\"allan\" + 0.157*\"hagar\"\n",
"0.277774: -0.457*\"harold\" + -0.387*\"virginia\" + 0.221*\"commander\" + 0.201*\"martin\" + 0.194*\"kitty\" + -0.192*\"hans\" + -0.160*\"hagar\" + 0.154*\"barney\" + -0.154*\"marion\" + -0.148*\"joan\"\n",
"0.188337: 0.378*\"hans\" + 0.342*\"marshall\" + -0.213*\"oliver\" + 0.184*\"laura\" + 0.181*\"livingstone\" + 0.179*\"senator\" + 0.167*\"consul\" + -0.156*\"bishop\" + 0.151*\"commander\" + -0.132*\"lordship\"\n",
"0.188018: 0.325*\"harold\" + 0.277*\"virginia\" + -0.277*\"hamilton\" + -0.201*\"joan\" + -0.191*\"marcus\" + -0.190*\"edna\" + -0.181*\"bart\" + -0.174*\"dentist\" + 0.155*\"commander\" + -0.131*\"kate\"\n",
"0.117855: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.98727: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword\n",
"0.0101011: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/8493.txt\n",
"The Project Gutenberg EBook of The Last Hope, by Henry Seton Merriman\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: The Last Hope\n",
"\n",
"Author: Henry Seton Merriman\n",
"\n",
"Release Date: July, 2005 [EBook #8493]\n",
"Posting Date: July 27, 2009\n",
"\n",
"Language: English\n",
"\n",
"Character set encoding: ASCII\n",
"\n",
"*** STA [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/8493.txt\n",
"0.273577 corpus/Adventure/FOLD1/1947.txt\n",
"0.0887184 corpus/Fiction/FOLD1/5240.txt\n",
"0.0610417 corpus/Adventure/FOLD1/2166.txt\n",
"0.0448207 corpus/Adventure/FOLD1/103.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.38461: -0.303*\"stewart\" + -0.275*\"thet\" + 0.220*\"marquis\" + 0.164*\"monsieur\" + 0.159*\"turner\" + -0.156*\"bland\" + -0.148*\"fer\" + -0.146*\"lawson\" + -0.146*\"prim\" + -0.143*\"cowboys\"\n",
"0.380421: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"0.34475: 0.271*\"drake\" + 0.251*\"dick\" + 0.212*\"marquis\" + -0.168*\"marshall\" + 0.160*\"turner\" + 0.150*\"kitty\" + -0.136*\"willy\" + 0.134*\"steve\" + 0.127*\"stella\" + -0.124*\"lionel\"\n",
"0.277414: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.243673: -0.432*\"dick\" + -0.347*\"drake\" + -0.207*\"lagoon\" + 0.185*\"stewart\" + 0.167*\"lionel\" + 0.154*\"thet\" + 0.147*\"marquis\" + -0.146*\"paddy\" + -0.127*\"lestrange\" + -0.115*\"reef\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.998851: 0.017*france + 0.017*turner + 0.016*marquis + 0.016*loo + 0.011*monsieur + 0.010*miriam + 0.008*madame + 0.008*pierre + 0.008*lawrence + 0.007*paris\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/2166.txt\n",
"\ufeffThe Project Gutenberg EBook of King Solomon's Mines, by H. Rider Haggard\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.net\n",
"\n",
"\n",
"Title: King Solomon's Mines\n",
"\n",
"Author: H. Rider Haggard\n",
"\n",
"Posting Date: January 15, 2009 [EBook #2166]\n",
"Release Date: October 11, 2005\n",
"Last updated: August 18, 2011\n",
"Last updated: [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/2166.txt\n",
"0.20818 corpus/Adventure/FOLD1/2727.txt\n",
"0.146627 corpus/Adventure/FOLD1/10368.txt\n",
"0.0715007 corpus/Adventure/FOLD1/21459.txt\n",
"0.064237 corpus/Fiction/FOLD1/5240.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.374357: -0.329*\"stella\" + 0.297*\"henry\" + -0.292*\"turner\" + 0.252*\"aline\" + 0.170*\"andre\" + -0.161*\"miriam\" + 0.147*\"tour\" + 0.146*\"thou\" + -0.142*\"baboons\" + 0.142*\"kitty\"\n",
"0.347208: -0.592*\"lionel\" + -0.274*\"denis\" + -0.234*\"percy\" + 0.197*\"stella\" + 0.171*\"macumazahn\" + -0.171*\"hendricks\" + 0.166*\"henry\" + 0.156*\"thee\" + -0.142*\"moore\" + -0.133*\"dick\"\n",
"0.30836: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.299665: 0.371*\"lionel\" + -0.327*\"denis\" + -0.242*\"percy\" + 0.215*\"henry\" + -0.197*\"hendricks\" + -0.145*\"leary\" + 0.137*\"bishop\" + 0.133*\"moore\" + 0.126*\"maurice\" + -0.126*\"lagoon\"\n",
"0.210767: -0.361*\"laura\" + -0.321*\"oliver\" + -0.297*\"hagar\" + -0.266*\"emma\" + -0.248*\"steve\" + -0.187*\"mamma\" + -0.163*\"marion\" + -0.147*\"governess\" + -0.146*\"kitty\" + -0.129*\"abbey\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.551928: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm\n",
"0.447945: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/2727.txt\n",
"\ufeffThe Project Gutenberg EBook of Allan's Wife, by H. Rider Haggard\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Allan's Wife\n",
"\n",
"Author: H. Rider Haggard\n",
"\n",
"Release Date: March 28, 2006 [EBook #2727]\n",
"\n",
"Language: English\n",
"\n",
"\n",
"*** START OF THIS PROJECT GUTENBERG EBOOK ALLAN'S WIFE ***\n",
"\n",
"\n",
"\n",
"\n",
"Produced [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts\n",
"1.0 corpus/Adventure/FOLD1/2727.txt\n",
"0.20818 corpus/Adventure/FOLD1/2166.txt\n",
"0.0970604 corpus/Adventure/FOLD1/15072.txt\n",
"0.087352 corpus/Adventure/FOLD1/21393.txt\n",
"0.0835832 corpus/Fiction/FOLD1/15182.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.330356: -0.293*\"turner\" + 0.290*\"aline\" + 0.276*\"stella\" + -0.248*\"loo\" + -0.205*\"denis\" + 0.196*\"andre\" + -0.171*\"henry\" + 0.167*\"lionel\" + 0.162*\"tour\" + -0.159*\"percy\"\n",
"0.316777: -0.592*\"lionel\" + -0.274*\"denis\" + -0.234*\"percy\" + 0.197*\"stella\" + 0.171*\"macumazahn\" + -0.171*\"hendricks\" + 0.166*\"henry\" + 0.156*\"thee\" + -0.142*\"moore\" + -0.133*\"dick\"\n",
"0.207412: -0.697*\"ivan\" + -0.176*\"roubles\" + 0.131*\"dick\" + -0.124*\"peter\" + -0.115*\"eugene\" + -0.112*\"peasants\" + -0.109*\"peasant\" + 0.107*\"lionel\" + 0.107*\"stella\" + -0.100*\"simeon\"\n",
"0.18471: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"0.182816: 0.284*\"marshall\" + -0.266*\"martin\" + -0.257*\"commander\" + 0.212*\"billy\" + -0.208*\"barney\" + 0.173*\"marion\" + 0.172*\"kirkland\" + 0.165*\"byrne\" + 0.163*\"drake\" + 0.152*\"livingstone\""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.528421: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm\n",
"0.468855: 0.052*stella + 0.028*macumazahn + 0.023*baboons + 0.015*carson + 0.013*allan + 0.013*waggons + 0.011*kraals + 0.010*baboon + 0.008*peak + 0.008*marble\n",
"================================================== \n",
"\n",
"\n",
"Text:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" corpus/Adventure/FOLD1/1947.txt\n",
"\ufeffThe Project Gutenberg EBook of Scaramouche, by Rafael Sabatini\n",
"\n",
"This eBook is for the use of anyone anywhere at no cost and with\n",
"almost no restrictions whatsoever. You may copy it, give it away or\n",
"re-use it under the terms of the Project Gutenberg License included\n",
"with this eBook or online at www.gutenberg.org\n",
"\n",
"\n",
"Title: Scaramouche\n",
" A Romance of the French Revolution\n",
"\n",
"Author: Rafael Sabatini\n",
"\n",
"Release Date: November, 1999 [Etext #1947]\n",
"Posting Date: August 13, 2009\n",
"\n",
"Language: English\n",
"\n",
"\n",
"** [...]\n",
"--------------------------------------------------\n",
"LSI: most similar texts"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1.0 corpus/Adventure/FOLD1/1947.txt\n",
"0.273577 corpus/Adventure/FOLD1/8493.txt\n",
"0.129544 corpus/Fiction/FOLD1/5240.txt\n",
"0.0799009 corpus/Adventure/FOLD1/1965.txt\n",
"0.0724676 corpus/Adventure/FOLD1/103.txt\n",
"\n",
"LSI: topics of this text (highest scoring first)\n",
"0.462423: -0.293*\"turner\" + 0.290*\"aline\" + 0.276*\"stella\" + -0.248*\"loo\" + -0.205*\"denis\" + 0.196*\"andre\" + -0.171*\"henry\" + 0.167*\"lionel\" + 0.162*\"tour\" + -0.159*\"percy\"\n",
"0.417472: -0.304*\"lionel\" + -0.282*\"stella\" + 0.230*\"marquis\" + -0.227*\"macumazahn\" + -0.202*\"denis\" + 0.196*\"monsieur\" + -0.175*\"percy\" + -0.163*\"henry\" + 0.155*\"turner\" + 0.148*\"aline\"\n",
"0.383253: -0.329*\"stella\" + 0.297*\"henry\" + -0.292*\"turner\" + 0.252*\"aline\" + 0.170*\"andre\" + -0.161*\"miriam\" + 0.147*\"tour\" + 0.146*\"thou\" + -0.142*\"baboons\" + 0.142*\"kitty\"\n",
"0.367033: -0.303*\"stewart\" + -0.275*\"thet\" + 0.220*\"marquis\" + 0.164*\"monsieur\" + 0.159*\"turner\" + -0.156*\"bland\" + -0.148*\"fer\" + -0.146*\"lawson\" + -0.146*\"prim\" + -0.143*\"cowboys\"\n",
"0.333359: 0.319*\"ivan\" + 0.182*\"sapt\" + 0.149*\"peter\" + 0.133*\"monsieur\" + 0.123*\"marquis\" + 0.122*\"thee\" + 0.115*\"colonel\" + 0.113*\"bishop\" + 0.112*\"leary\" + 0.109*\"rudolf\"\n",
"\n",
"LDA: topics of this text (highest scoring first)\n",
"0.94682: 0.014*tour + 0.012*monsieur + 0.011*marquis + 0.010*aline + 0.009*madame + 0.007*andre + 0.006*paris + 0.005*nantes + 0.005*mademoiselle + 0.005*philippe\n",
"0.0102775: 0.009*hamilton + 0.005*leary + 0.004*paris + 0.004*jane + 0.004*guy + 0.003*harry + 0.003*colonel + 0.003*tom + 0.002*regiment + 0.002*lordship\n",
"================================================== \n",
"\n",
"\n"
]
}
],
"prompt_number": 65
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment