Skip to content

Instantly share code, notes, and snippets.

@persiyanov
Created March 20, 2018 20:23
Show Gist options
  • Save persiyanov/5aed5165d7945c176a0f557a473ef848 to your computer and use it in GitHub Desktop.
Save persiyanov/5aed5165d7945c176a0f557a473ef848 to your computer and use it in GitHub Desktop.
python_memmap.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Corpus in a memory-mapped, Matrix Market-like binary format.
The corpus is stored as raw binary arrays that are read back through numpy's memmap (:class:`numpy.memmap`).
"""
import logging
import numpy as np
from scipy.sparse import csc_matrix
logger = logging.getLogger(__name__)
class MemmapReaderArray(object):
    """Reader/writer for a bag-of-words corpus stored as raw binary CSC arrays.

    The corpus is kept on disk as a term-document sparse matrix in CSC layout,
    split over four flat binary files:

    * ``<input>``          -- header, three int32: num_docs, num_terms, num_nnz
    * ``<input>.data``     -- float32 non-zero values
    * ``<input>.indices``  -- int32 term ids, aligned with ``.data``
    * ``<input>.indptr``   -- int32 column pointers (``num_docs + 1`` entries)

    Iterating over the object yields one document at a time, read through
    :class:`numpy.memmap`.

    Attributes
    ----------
    num_docs : int
        number of documents (matrix columns)
    num_terms : int
        number of terms (matrix rows)
    num_nnz : int
        number of stored non-zero entries
    """

    def __init__(self, input, transposed=True):
        """
        Parameters
        ----------
        input : str
            Path of the header file; the ``.data``/``.indices``/``.indptr``
            siblings are located by appending the suffix to this path.
        transposed : bool, optional
            Stored on the instance; not consulted by any method of this class.
        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        self.num_docs, self.num_terms, self.num_nnz = self.read_headers()
        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get size of corpus (number of documents)."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def read_headers(self):
        """Read the three int32 header values from ``self.input``.

        Returns
        -------
        num_docs : int
        num_terms : int
        num_nnz : int
        """
        header = np.memmap(self.input, dtype='int32', mode='r', shape=(3,))
        # Convert to plain Python ints so callers do not carry numpy scalars around.
        num_docs, num_terms, num_nnz = (int(value) for value in header)
        del header
        return num_docs, num_terms, num_nnz

    @staticmethod
    def construct_csc(corpus):
        """Build a float32 term-document CSC matrix from a BoW corpus.

        Column ``i`` holds document ``i``, so ``indptr[i]:indptr[i + 1]`` is the
        slice of ``indices``/``data`` belonging to the i-th document.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Documents as sequences of ``(term_id, value)`` pairs.

        Returns
        -------
        scipy.sparse.csc_matrix
            Matrix of shape ``(max_term_id + 1, number_of_documents)``.
        """
        values, term_ids, doc_ids = [], [], []
        num_docs, num_terms = 0, 0
        for doc_id, doc in enumerate(corpus):
            num_docs = doc_id + 1
            for term_id, value in doc:
                values.append(value)
                term_ids.append(term_id)
                doc_ids.append(doc_id)
                if term_id >= num_terms:
                    num_terms = term_id + 1
        # The shape is passed explicitly: inferring it from the largest index
        # would drop trailing empty documents and fail on an empty corpus.
        return csc_matrix(
            (
                np.asarray(values, dtype='float32'),
                (np.asarray(term_ids, dtype='int64'), np.asarray(doc_ids, dtype='int64')),
            ),
            shape=(num_terms, num_docs),
            dtype='float32',
        )

    @staticmethod
    def _write_array(path, array, dtype):
        """Write `array` to `path` as raw binary of the given dtype (also works for empty arrays)."""
        np.asarray(array, dtype=dtype).tofile(path)

    @staticmethod
    def _open_array(path, dtype, expected_len):
        """Memory-map `path` read-only and check that it holds exactly `expected_len` items."""
        if expected_len == 0:
            # An empty file cannot be memory-mapped; nothing needs to be read anyway.
            return np.empty(0, dtype=dtype)
        array = np.memmap(path, dtype=dtype, mode='r')
        if array.shape[0] != expected_len:
            raise ValueError(
                "%s holds %i entries, expected %i" % (path, array.shape[0], expected_len)
            )
        return array

    @staticmethod
    def save_corpus(fname, corpus):
        """Store `corpus` as raw binary CSC arrays next to `fname`.

        Writes ``fname`` (header) plus ``fname + '.data'``, ``'.indices'`` and
        ``'.indptr'``.

        Parameters
        ----------
        fname : str
            Path of the header file to create.
        corpus : iterable of iterable of (int, number)
            Documents as sequences of ``(term_id, value)`` pairs.
        """
        logger.info("storing corpus in memmap Matrix Market format to %s", fname)
        csc = MemmapReaderArray.construct_csc(corpus)
        num_terms, num_docs = csc.shape
        num_nnz = csc.nnz
        logger.info(
            "storing corpus with %i documents, %i features, %i non-zero entries",
            num_docs, num_terms, num_nnz
        )
        # write out header info
        MemmapReaderArray._write_array(fname, [num_docs, num_terms, num_nnz], 'int32')
        MemmapReaderArray._write_array(fname + '.data', csc.data, 'float32')
        MemmapReaderArray._write_array(fname + '.indices', csc.indices, 'int32')
        MemmapReaderArray._write_array(fname + '.indptr', csc.indptr, 'int32')

    def __iter__(self):
        """Iterate through corpus, one document at a time.

        Notes
        -----
        Exactly ``num_docs`` documents are yielded; a document with no stored
        entries is yielded as an empty list.

        Yields
        ------
        list of (int, float)
            Document in BoW format: ``(term_id, value)`` pairs.

        Raises
        ------
        ValueError
            If an array file does not hold the number of entries announced by the header.
        """
        indptr = self._open_array(self.input + '.indptr', 'int32', self.num_docs + 1)
        indices = self._open_array(self.input + '.indices', 'int32', self.num_nnz)
        values = self._open_array(self.input + '.data', 'float32', self.num_nnz)
        for doc_id in range(self.num_docs):
            start, end = indptr[doc_id], indptr[doc_id + 1]
            # Materialise the pairs so the document can be iterated more than once.
            yield list(zip(indices[start:end].tolist(), values[start:end].tolist()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment