persiyanov/python_memmap.py

## python_memmap.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Corpus in the Matrix Market format.

This code uses numpy's `memmap` to read/write binary data

"""

import logging
import numpy as np
from scipy.sparse import csc_matrix

logger = logging.getLogger(__name__)


class MemmapReaderArray(object):
    """Reader/writer for a sparse corpus stored as raw binary arrays accessed through `numpy.memmap`.

    A corpus saved under the path `fname` consists of four files:

    * `fname` -- header holding three int32 values: `num_docs`, `num_terms`, `num_nnz`,
    * `fname + '.data'` -- float32 non-zero values,
    * `fname + '.indices'` -- int32 term ids, one per non-zero value,
    * `fname + '.indptr'` -- int32 offsets; document `i` occupies the slice
      `indptr[i]:indptr[i + 1]` of the two arrays above.

    The three arrays are the `data`, `indices` and `indptr` members of a term-document
    :class:`scipy.sparse.csc_matrix` (one column per document).

    Attributes
    ----------
    num_docs : int
        Number of documents in the corpus.
    num_terms : int
        Number of terms (features).
    num_nnz : int
        Number of stored non-zero entries.

    Notes
    -----
    Iteration memory-maps the array files and slices out one document at a time.
    :meth:`save_corpus`, in contrast, builds the complete sparse matrix in memory before writing it.

    """

    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : str
            Path of the header file written by :meth:`save_corpus`. The array files are looked up
            as `input + '.data'`, `input + '.indices'` and `input + '.indptr'`, so a plain
            string path is required.
        transposed : bool, optional
            Stored on the instance unchanged; no method of this class reads it.

        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        self.num_docs, self.num_terms, self.num_nnz = self.read_headers()

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get size of corpus (number of documents)."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def read_headers(self):
        """Read the three int32 header values stored in the file `self.input`.

        Returns
        -------
        num_docs : int
        num_terms : int
        num_nnz : int

        """
        meta_fp = np.memmap(self.input, dtype='int32', mode='r', shape=(3,))
        # Convert to plain Python ints: the values outlive the mapping and later
        # arithmetic on them must not wrap around at 32 bits.
        num_docs, num_terms, num_nnz = (int(value) for value in meta_fp)
        del meta_fp
        return num_docs, num_terms, num_nnz

    @staticmethod
    def construct_csc(corpus):
        """Build a float32 term-document CSC matrix from a corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Documents given as (term id, value) pairs.

        Returns
        -------
        scipy.sparse.csc_matrix
            Matrix of shape (num_terms, num_docs). `num_docs` is the number of documents
            iterated (empty ones included) and `num_terms` is the largest term id plus one,
            or 0 when no entry was seen.

        """
        # matrix term-document (then, csc.indptr[i]:csc.indptr[i+1] will be a slice for i-th document)
        data, row_ind, col_ind = [], [], []
        num_docs = 0
        for (doc_id, doc) in enumerate(corpus):
            num_docs = doc_id + 1
            for (termid, value) in doc:
                data.append(value)
                row_ind.append(termid)
                col_ind.append(doc_id)
        num_terms = int(max(row_ind)) + 1 if row_ind else 0

        # The shape is passed explicitly: when scipy infers it from the indices, trailing
        # empty documents are dropped and a corpus without any entry raises an error.
        entries = np.asarray(data, dtype='float32')
        rows = np.asarray(row_ind, dtype=np.intp)
        cols = np.asarray(col_ind, dtype=np.intp)
        return csc_matrix((entries, (rows, cols)), shape=(num_terms, num_docs), dtype='float32')

    @staticmethod
    def _write_array(path, values, dtype):
        """Write `values` to `path` as a flat binary array of `dtype` (helper of `save_corpus`)."""
        values = np.asarray(values, dtype=dtype)
        if values.size == 0:
            # A zero-length array cannot be memory-mapped portably; an empty file stands for it.
            with open(path, 'wb'):
                pass
            return
        out_fp = np.memmap(path, dtype=dtype, mode='w+', shape=values.shape)
        out_fp[:] = values
        out_fp.flush()
        del out_fp

    @staticmethod
    def save_corpus(fname, corpus):
        """Store `corpus` on disk as a header file plus three array files.

        Parameters
        ----------
        fname : str
            Path of the header file; the arrays go to `fname + '.data'`,
            `fname + '.indices'` and `fname + '.indptr'`.
        corpus : iterable of iterable of (int, number)
            Documents given as (term id, value) pairs.

        """
        logger.info("storing corpus in memmap Matrix Market format to %s", fname)

        csc = MemmapReaderArray.construct_csc(corpus)

        num_terms, num_docs = csc.shape
        num_nnz = csc.nnz

        logger.info(
            "storing corpus with %i documents, %i features, %i non-zero entries",
            num_docs, num_terms, num_nnz
        )

        # write out header info
        MemmapReaderArray._write_array(fname, [num_docs, num_terms, num_nnz], 'int32')

        MemmapReaderArray._write_array(fname + '.data', csc.data, 'float32')
        MemmapReaderArray._write_array(fname + '.indices', csc.indices, 'int32')
        MemmapReaderArray._write_array(fname + '.indptr', csc.indptr, 'int32')

    def __iter__(self):
        """Iterate through corpus.

        Notes
        -----
        Exactly `num_docs` documents are yielded; a document without stored entries
        is yielded as an empty list.

        Yields
        ------
        list of (int, number)
            One document in BoW format: (term id, value) pairs, as numpy scalars taken
            from the int32 `.indices` and float32 `.data` arrays.

        Raises
        ------
        ValueError
            If the length of the `.indptr` array does not equal `num_docs + 1`.

        """
        indptr_fp = np.memmap(self.input + '.indptr', dtype='int32', mode='r')
        if indptr_fp.shape[0] - 1 != self.num_docs:
            raise ValueError(
                "header declares %i documents but %s holds offsets for %i" %
                (self.num_docs, self.input + '.indptr', indptr_fp.shape[0] - 1)
            )

        if self.num_nnz:
            data_fp = np.memmap(self.input + '.data', dtype='float32', mode='r')
            indices_fp = np.memmap(self.input + '.indices', dtype='int32', mode='r')
        else:
            # Nothing stored: the array files may be empty, which cannot be memory-mapped.
            data_fp = np.empty(0, dtype='float32')
            indices_fp = np.empty(0, dtype='int32')

        for doc_id in range(self.num_docs):
            start, end = int(indptr_fp[doc_id]), int(indptr_fp[doc_id + 1])
            # Materialize the pairs so every document is a reusable list.
            yield list(zip(indices_fp[start:end], data_fp[start:end]))
	#!/usr/bin/env python
	# -- coding: utf-8 --
	#
	# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
	# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


	"""Corpus in the Matrix Market format.

	This code uses numpy's `memmap` to read/write binary data

	"""

	import logging
	import numpy as np
	from scipy.sparse import csc_matrix

	logger = logging.getLogger(__name__)


	class MemmapReaderArray(object):
	    """Reader/writer for a sparse corpus stored as raw binary arrays accessed through `numpy.memmap`.

	    A corpus saved under the path `fname` consists of four files:

	    * `fname` -- header holding three int32 values: `num_docs`, `num_terms`, `num_nnz`,
	    * `fname + '.data'` -- float32 non-zero values,
	    * `fname + '.indices'` -- int32 term ids, one per non-zero value,
	    * `fname + '.indptr'` -- int32 offsets; document `i` occupies the slice
	      `indptr[i]:indptr[i + 1]` of the two arrays above.

	    The three arrays are the `data`, `indices` and `indptr` members of a term-document
	    :class:`scipy.sparse.csc_matrix` (one column per document).

	    Attributes
	    ----------
	    num_docs : int
	        Number of documents in the corpus.
	    num_terms : int
	        Number of terms (features).
	    num_nnz : int
	        Number of stored non-zero entries.

	    Notes
	    -----
	    Iteration memory-maps the array files and slices out one document at a time.
	    :meth:`save_corpus`, in contrast, builds the complete sparse matrix in memory before writing it.

	    """

	    def __init__(self, input, transposed=True):
	        """

	        Parameters
	        ----------
	        input : str
	            Path of the header file written by :meth:`save_corpus`. The array files are looked up
	            as `input + '.data'`, `input + '.indices'` and `input + '.indptr'`, so a plain
	            string path is required.
	        transposed : bool, optional
	            Stored on the instance unchanged; no method of this class reads it.

	        """
	        logger.info("initializing corpus reader from %s", input)
	        self.input, self.transposed = input, transposed
	        self.num_docs, self.num_terms, self.num_nnz = self.read_headers()

	        logger.info(
	            "accepted corpus with %i documents, %i features, %i non-zero entries",
	            self.num_docs, self.num_terms, self.num_nnz
	        )

	    def __len__(self):
	        """Get size of corpus (number of documents)."""
	        return self.num_docs

	    def __str__(self):
	        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
	                (self.num_docs, self.num_terms, self.num_nnz))

	    def read_headers(self):
	        """Read the three int32 header values stored in the file `self.input`.

	        Returns
	        -------
	        num_docs : int
	        num_terms : int
	        num_nnz : int

	        """
	        meta_fp = np.memmap(self.input, dtype='int32', mode='r', shape=(3,))
	        # Convert to plain Python ints: the values outlive the mapping and later
	        # arithmetic on them must not wrap around at 32 bits.
	        num_docs, num_terms, num_nnz = (int(value) for value in meta_fp)
	        del meta_fp
	        return num_docs, num_terms, num_nnz

	    @staticmethod
	    def construct_csc(corpus):
	        """Build a float32 term-document CSC matrix from a corpus.

	        Parameters
	        ----------
	        corpus : iterable of iterable of (int, number)
	            Documents given as (term id, value) pairs.

	        Returns
	        -------
	        scipy.sparse.csc_matrix
	            Matrix of shape (num_terms, num_docs). `num_docs` is the number of documents
	            iterated (empty ones included) and `num_terms` is the largest term id plus one,
	            or 0 when no entry was seen.

	        """
	        # matrix term-document (then, csc.indptr[i]:csc.indptr[i+1] will be a slice for i-th document)
	        data, row_ind, col_ind = [], [], []
	        num_docs = 0
	        for (doc_id, doc) in enumerate(corpus):
	            num_docs = doc_id + 1
	            for (termid, value) in doc:
	                data.append(value)
	                row_ind.append(termid)
	                col_ind.append(doc_id)
	        num_terms = int(max(row_ind)) + 1 if row_ind else 0

	        # The shape is passed explicitly: when scipy infers it from the indices, trailing
	        # empty documents are dropped and a corpus without any entry raises an error.
	        entries = np.asarray(data, dtype='float32')
	        rows = np.asarray(row_ind, dtype=np.intp)
	        cols = np.asarray(col_ind, dtype=np.intp)
	        return csc_matrix((entries, (rows, cols)), shape=(num_terms, num_docs), dtype='float32')

	    @staticmethod
	    def _write_array(path, values, dtype):
	        """Write `values` to `path` as a flat binary array of `dtype` (helper of `save_corpus`)."""
	        values = np.asarray(values, dtype=dtype)
	        if values.size == 0:
	            # A zero-length array cannot be memory-mapped portably; an empty file stands for it.
	            with open(path, 'wb'):
	                pass
	            return
	        out_fp = np.memmap(path, dtype=dtype, mode='w+', shape=values.shape)
	        out_fp[:] = values
	        out_fp.flush()
	        del out_fp

	    @staticmethod
	    def save_corpus(fname, corpus):
	        """Store `corpus` on disk as a header file plus three array files.

	        Parameters
	        ----------
	        fname : str
	            Path of the header file; the arrays go to `fname + '.data'`,
	            `fname + '.indices'` and `fname + '.indptr'`.
	        corpus : iterable of iterable of (int, number)
	            Documents given as (term id, value) pairs.

	        """
	        logger.info("storing corpus in memmap Matrix Market format to %s", fname)

	        csc = MemmapReaderArray.construct_csc(corpus)

	        num_terms, num_docs = csc.shape
	        num_nnz = csc.nnz

	        logger.info(
	            "storing corpus with %i documents, %i features, %i non-zero entries",
	            num_docs, num_terms, num_nnz
	        )

	        # write out header info
	        MemmapReaderArray._write_array(fname, [num_docs, num_terms, num_nnz], 'int32')

	        MemmapReaderArray._write_array(fname + '.data', csc.data, 'float32')
	        MemmapReaderArray._write_array(fname + '.indices', csc.indices, 'int32')
	        MemmapReaderArray._write_array(fname + '.indptr', csc.indptr, 'int32')

	    def __iter__(self):
	        """Iterate through corpus.

	        Notes
	        -----
	        Exactly `num_docs` documents are yielded; a document without stored entries
	        is yielded as an empty list.

	        Yields
	        ------
	        list of (int, number)
	            One document in BoW format: (term id, value) pairs, as numpy scalars taken
	            from the int32 `.indices` and float32 `.data` arrays.

	        Raises
	        ------
	        ValueError
	            If the length of the `.indptr` array does not equal `num_docs + 1`.

	        """
	        indptr_fp = np.memmap(self.input + '.indptr', dtype='int32', mode='r')
	        if indptr_fp.shape[0] - 1 != self.num_docs:
	            raise ValueError(
	                "header declares %i documents but %s holds offsets for %i" %
	                (self.num_docs, self.input + '.indptr', indptr_fp.shape[0] - 1)
	            )

	        if self.num_nnz:
	            data_fp = np.memmap(self.input + '.data', dtype='float32', mode='r')
	            indices_fp = np.memmap(self.input + '.indices', dtype='int32', mode='r')
	        else:
	            # Nothing stored: the array files may be empty, which cannot be memory-mapped.
	            data_fp = np.empty(0, dtype='float32')
	            indices_fp = np.empty(0, dtype='int32')

	        for doc_id in range(self.num_docs):
	            start, end = int(indptr_fp[doc_id]), int(indptr_fp[doc_id + 1])
	            # Materialize the pairs so every document is a reusable list.
	            yield list(zip(indices_fp[start:end], data_fp[start:end]))