{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Use Gensim for document similarity queries."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"from gensim import corpora, models, similarities\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import RegexpTokenizer"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Code and example based on Gensim tutorials: http://radimrehurek.com/gensim/tutorial.html"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def similarity_matrix(docs, model=None):\n",
" start_time = datetime.datetime.now()\n",
" dim = len(docs)\n",
" matrix = np.zeros((dim, dim))\n",
" for i, doc in enumerate(docs):\n",
" train_set = docs[:i] + docs[i + 1:]\n",
" index = model(train_set)\n",
" sims = index.similarity(doc)\n",
" for doc_id, sim in sims:\n",
" if doc_id >= i:\n",
" doc_id += 1\n",
" matrix[i, doc_id] = sim\n",
" end_time = datetime.datetime.now()\n",
" print('Training started: {0}'.format(start_time))\n",
" print('Training complete: {0}'.format(end_time))\n",
" print('Time spent training: {0}'.format(end_time - start_time))\n",
" return matrix\n",
"\n",
"\n",
"# Classes/helpers for topic modelling\n",
"class BaseModel(object):\n",
" \"\"\"Base TFIDF model. Take a corpus of documents, clean, and\n",
" then create a dictionary, MmCorpus. Implements similarity query method.\"\"\"\n",
" def __init__(self, documents, directory='models', filename='output'):\n",
" if not os.path.exists(directory):\n",
" os.makedirs(directory)\n",
" self.dict_path = '{0}/{1}.dict'.format(directory, filename)\n",
" self.corpus_path = '{0}/{1}.mm'.format(directory, filename)\n",
" texts = process_documents(documents)\n",
" dictionary = corpora.Dictionary(texts)\n",
" dictionary.save(self.dict_path)\n",
" bows = [dictionary.doc2bow(text) for text in texts]\n",
" corpora.MmCorpus.serialize(self.corpus_path, bows)\n",
" \n",
" def similarity(self, document):\n",
" dictionary = corpora.Dictionary.load(self.dict_path)\n",
" vec_bow = dictionary.doc2bow(document.lower().split())\n",
" vec_lsi = self.model[vec_bow]\n",
" sims = self.index[vec_lsi]\n",
" return sorted(list(enumerate(sims)), key=lambda x: x[1], reverse=True)\n",
" \n",
" def transform(self):\n",
" corpus = corpora.MmCorpus(self.corpus_path)\n",
" return self.model[corpus_tfidf]\n",
" \n",
" \n",
"class TFIDFModel(BaseModel): \n",
" \"\"\"TFIDModel.\"\"\" \n",
" def __init__(self, documents, directory='models', filename='output'):\n",
" super(TFIDFModel, self).__init__(documents, directory='models', filename='output')\n",
" self.tfidf_path = '{0}/{1}.model'.format(directory, filename) \n",
" corpus = corpora.MmCorpus(self.corpus_path) \n",
" self.model = models.TfidfModel(corpus)\n",
" self.index = similarities.MatrixSimilarity(self.model[corpus])\n",
"\n",
"\n",
"class LSIModel(BaseModel):\n",
" \"\"\"LSI model.\"\"\"\n",
" def __init__(self, documents, directory='models', filename='output', num_topics=2):\n",
" super(LSIModel, self).__init__(documents, directory='models', filename='output')\n",
" dictionary = corpora.Dictionary.load(self.dict_path)\n",
" corpus = corpora.MmCorpus(self.corpus_path)\n",
" self.model = models.LsiModel(corpus, id2word=dictionary, \n",
" num_topics=num_topics)\n",
" self.index = similarities.MatrixSimilarity(self.model[corpus])\n",
" \n",
" \n",
"class LDAModel(BaseModel):\n",
" \"\"\"LDA Model.\"\"\"\n",
" def __init__(self, documents, directory='models', filename='output', num_topics=2):\n",
" super(LDAModel, self).__init__(documents, directory='models', filename='output')\n",
" dictionary = corpora.Dictionary.load(self.dict_path)\n",
" corpus = corpora.MmCorpus(self.corpus_path)\n",
" self.model = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)\n",
" self.index = similarities.MatrixSimilarity(self.model[corpus])\n",
" \n",
"\n",
"class LSITFIDFModel(TFIDFModel):\n",
" \"\"\"TFIDFLS model wrapped with LSI model.\"\"\"\n",
" def __init__(self, documents, directory='models', filename='output', num_topics=2):\n",
" super(LSITFIDFModel, self).__init__(documents, directory='models', filename='output')\n",
" dictionary = corpora.Dictionary.load(self.dict_path)\n",
" corpus = corpora.MmCorpus(self.corpus_path)\n",
" corpus_tfidf = self.model[corpus]\n",
" self.model = models.LsiModel(corpus_tfidf, id2word=dictionary, \n",
" num_topics=num_topics)\n",
" self.index = similarities.MatrixSimilarity(self.model[corpus_tfidf])\n",
" \n",
" def transform(self):\n",
" corpus = corpora.MmCorpus(self.corpus_path)\n",
" tdidf = models.TfidfModel.load(self.tfidf_path)\n",
" corpus_tfidf = tdidf[corpus]\n",
" return self.model[corpus_tfidf]\n",
" \n",
" \n",
"def process_documents(documents):\n",
" \"\"\"Remove stopwords, tokenize by document, use TextBlob to clean\n",
" punctuation, then remove words that only occur once in the corpus.\"\"\"\n",
" stpwrds = stopwords.words('english')\n",
" tokenizer = RegexpTokenizer(r'\\w+')\n",
" texts = [[w for w in tokenizer.tokenize(doc.lower()) if w not in stpwrds] \n",
" for doc in documents]\n",
" all_tokens = sum(texts, [])\n",
" tokens_once = set(\n",
" w for w in set(all_tokens) if all_tokens.count(w) == 1\n",
" )\n",
" return [[w for w in text if w not in tokens_once] \n",
" for text in texts]\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"This is an example corpus of sentence documents."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"documents = [\"Human machine interface for lab abc computer applications\",\n",
" \"A survey of user opinion of computer system response time\",\n",
" \"The EPS user interface management system\",\n",
" \"System and human system engineering testing of EPS\",\n",
" \"Relation of user perceived response time to error measurement\",\n",
" \"The generation of random binary unordered trees\",\n",
" \"The intersection graph of paths in trees\",\n",
" \"Graph minors IV Widths of trees and well quasi ordering\",\n",
" \"Graph minors A survey\"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
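{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aside (not part of the original run): a quick look at the cleaned token lists `process_documents` produces for this corpus (lowercased, punctuation and stopwords stripped, words occurring only once removed) before the models build a dictionary from them."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Inspect the cleaned token lists the models are built from.\n",
"for text in process_documents(documents):\n",
"    print(text)"
],
"language": "python",
"metadata": {},
"outputs": []
},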
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Create an LSIModel and a TFIDFModel/LSIModel chain."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"lsi = LSIModel(documents)\n",
"lsi_tfidf = LSITFIDFModel(documents)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
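{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `similarity_matrix` helper defined above is never exercised in the original gist. A minimal sketch of how it could be called, using `LSIModel` as the model class; note that it retrains one model per document (leave-one-out), so it is slow on large corpora."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch: pairwise leave-one-out similarities (the diagonal stays zero).\n",
"matrix = similarity_matrix(documents, model=LSIModel)\n",
"print(matrix)"
],
"language": "python",
"metadata": {},
"outputs": []
},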
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Execute a similarity query against the indexed documents."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"doc = \"Human computer interaction\"\n",
"sims = lsi.similarity(doc)\n",
"t_sims = lsi_tfidf.similarity(doc)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
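{
"cell_type": "markdown",
"metadata": {},
"source": [
"Also not in the original run: `transform()` projects the whole stored corpus into a model's vector space (here the two-dimensional LSI space), which can feed downstream clustering or plotting. A minimal sketch:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Each item is a list of (topic_id, weight) pairs for one document.\n",
"for doc_id, vec in enumerate(lsi.transform()):\n",
"    print(doc_id, vec)"
],
"language": "python",
"metadata": {},
"outputs": []
},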
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Results for each document in the corpus, sorted highest similarity to lowest."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for i, sim in sims:\n",
" print((i,sim), documents[i])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"((2, 0.99844527), 'The EPS user interface management system')\n",
"((0, 0.99809301), 'Human machine interface for lab abc computer applications')\n",
"((3, 0.9865886), 'System and human system engineering testing of EPS')\n",
"((1, 0.93748635), 'A survey of user opinion of computer system response time')\n",
"((4, 0.90755945), 'Relation of user perceived response time to error measurement')\n",
"((8, 0.050041765), 'Graph minors A survey')\n",
"((7, -0.098794639), 'Graph minors IV Widths of trees and well quasi ordering')\n",
"((6, -0.10639259), 'The intersection graph of paths in trees')\n",
"((5, -0.12416792), 'The generation of random binary unordered trees')\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for i, sim in t_sims:\n",
" print((i,sim), documents[i])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"((0, 0.99994081), 'Human machine interface for lab abc computer applications')\n",
"((2, 0.99990785), 'The EPS user interface management system')\n",
"((3, 0.99984384), 'System and human system engineering testing of EPS')\n",
"((4, 0.9992786), 'Relation of user perceived response time to error measurement')\n",
"((1, 0.99330217), 'A survey of user opinion of computer system response time')\n",
"((8, 0.22248439), 'Graph minors A survey')\n",
"((7, -0.016480923), 'Graph minors IV Widths of trees and well quasi ordering')\n",
"((6, -0.0515742), 'The intersection graph of paths in trees')\n",
"((5, -0.08804217), 'The generation of random binary unordered trees')\n"
]
}
],
"prompt_number": 8
}
],
"metadata": {}
}
]
}