Skip to content

Instantly share code, notes, and snippets.

@p-baleine
Created June 20, 2018 21:08
Show Gist options
  • Save p-baleine/bbdc0691d0d0c2ac377142414fe2ee11 to your computer and use it in GitHub Desktop.
Save p-baleine/bbdc0691d0d0c2ac377142414fe2ee11 to your computer and use it in GitHub Desktop.
読書メモ〜『情報検索の基礎』 第1章 2
import glob
import os
import re
import pandas as pd
import string
def tokenize(line):
return line.split()
def normalize(line):
    """Return *line* with every ASCII punctuation character removed."""
    # str.maketrans with two empty strings and a deletion set builds a
    # translation table that deletes each punctuation character.
    return line.translate(str.maketrans('', '', string.punctuation))
def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create a term-document incidence matrix from texts under document paths.

    Each directory in *document_paths* is treated as one document: every
    ``*.txt`` file directly inside it is read, each line is normalized and
    tokenized, and the union of the resulting terms is that document's
    term set.

    Args:
        document_paths: iterable of directory paths, one per document.
        tokenize: callable splitting one line into a list of terms.
        normalize: callable normalizing a raw line before tokenization.

    Returns:
        pandas.DataFrame indexed by term (sorted) with one column per
        document (the basename of its path); cell (t, d) is 1 if term t
        occurs in document d, else 0.
    """
    # Collect the set of distinct terms per document.  Sets replace the
    # original sum(list_of_lists, []) flattening, which is quadratic, and
    # make the membership tests below O(1) instead of O(len(doc)).
    doc_terms = []
    for path in document_paths:
        terms = set()
        for txt in glob.glob(os.path.join(path, '*.txt')):
            with open(txt) as f:
                for line in f:
                    terms.update(tokenize(normalize(line)))
        doc_terms.append(terms)
    # Sorted vocabulary makes the row order of the result deterministic
    # (raw set iteration order varies between runs).  set().union(*[])
    # is an empty set, so empty input yields an empty DataFrame.
    vocab = sorted(set().union(*doc_terms))
    # One column per document, one row per vocabulary term.
    data = {os.path.basename(path): {w: 1 if w in terms else 0 for w in vocab}
            for path, terms in zip(document_paths, doc_terms)}
    return pd.DataFrame(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment