Skip to content

Instantly share code, notes, and snippets.

@p-baleine
Created June 20, 2018 21:08
Show Gist options
  • Save p-baleine/bbdc0691d0d0c2ac377142414fe2ee11 to your computer and use it in GitHub Desktop.
Save p-baleine/bbdc0691d0d0c2ac377142414fe2ee11 to your computer and use it in GitHub Desktop.
読書メモ〜『情報検索の基礎』 第1章 2
import glob
import os
import re
import pandas as pd
import string
def tokenize(line):
return line.split()
def normalize(line):
    """Return *line* with every ASCII punctuation character removed."""
    # str.maketrans with two empty strings and a deletion set builds a
    # translation table that deletes each punctuation character.
    return line.translate(str.maketrans('', '', string.punctuation))
def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create a term-document incidence matrix from texts under document paths.

    Each directory in *document_paths* is treated as one document: every
    ``*.txt`` file directly inside it is read, each line is normalized and
    tokenized, and the union of the resulting terms is that document's
    term set.

    Args:
        document_paths: iterable of directory paths, one per document.
        tokenize: callable splitting one line into a list of terms.
        normalize: callable normalizing a raw line before tokenization.

    Returns:
        pandas.DataFrame indexed by term (sorted) with one column per
        document (the basename of its path); cell (t, d) is 1 if term t
        occurs in document d, else 0.
    """
    # Collect the set of distinct terms per document.  Sets replace the
    # original sum(list_of_lists, []) flattening, which is quadratic, and
    # make the membership tests below O(1) instead of O(len(doc)).
    doc_terms = []
    for path in document_paths:
        terms = set()
        for txt in glob.glob(os.path.join(path, '*.txt')):
            with open(txt) as f:
                for line in f:
                    terms.update(tokenize(normalize(line)))
        doc_terms.append(terms)
    # Sorted vocabulary makes the row order of the result deterministic
    # (raw set iteration order varies between runs).  set().union(*[])
    # is an empty set, so empty input yields an empty DataFrame.
    vocab = sorted(set().union(*doc_terms))
    # One column per document, one row per vocabulary term.
    data = {os.path.basename(path): {w: 1 if w in terms else 0 for w in vocab}
            for path, terms in zip(document_paths, doc_terms)}
    return pd.DataFrame(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment