Skip to content

Instantly share code, notes, and snippets.

@yukirin
Created March 2, 2015 15:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yukirin/c7f7381565554e588d12 to your computer and use it in GitHub Desktop.
Save yukirin/c7f7381565554e588d12 to your computer and use it in GitHub Desktop.
[python]文字列の類似度の計算
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import math
import string
from nltk.corpus import (reuters, stopwords)
from nltk.stem import LancasterStemmer
from nltk import TextCollection
def preprocess(corpus):
stopset = set(stopwords.words('english'))
stemmer = LancasterStemmer()
normalized_corpus = []
for words in corpus:
stemmed_words = []
for word in words:
word = stemmer.stem(word.lower())
if len(word) > 2 and word not in stopset:
stemmed_words += [word]
normalized_corpus += [stemmed_words]
return normalized_corpus
corpus = (reuters.words(fileids=f) for f in reuters.fileids()[:10])
corpus = preprocess(corpus)
col = TextCollection(corpus)
# TF-IDF Cos類似度
def tf(word, words):
return float(words.count(word)) / len(words) if words else 0
def idf(word, corpus):
count = len([True for words in corpus if word in words])
return math.log(len(corpus) / count)
def tf_idf(word, words, corpus):
id_freq = idf(word, corpus)
return tf(word, words) * id_freq if id_freq else 0
def cos_sim(v1, v2):
norm = math.sqrt(sum(x * x for x in v1.values()) * sum(x * x for x in v2.values()))
ip = sum(v1[key] * v2[key] for key in v1 if key in v2)
return float(ip) / norm if norm else 0
print("TextCollection.tf_idf -> ", col.tf_idf('export', corpus[3]))
print("tf_idf -> ", tf_idf('export', corpus[3], corpus))
v1 = {word: col.tf_idf(word, corpus[0]) for word in corpus[0]}
v2 = {word: col.tf_idf(word, corpus[1]) for word in corpus[1]}
print("TF-IDF Cos -> ", cos_sim(v1, v2))
# jaccard係数(重み付き)
# http://sucrose.hatenablog.com/entry/2012/11/30/132803
def jaccard_weight(v1, v2):
numerator = 0
denominator = 0
keys = set(v1.keys())
keys.update(v2.keys())
for k in keys:
f1 = v1.get(k, 0)
f2 = v2.get(k, 0)
numerator += min(f1, f2)
denominator += max(f1, f2)
return float(numerator) / denominator if denominator != 0 else 0
v1 = {word: col.tf_idf(word, corpus[0]) for word in corpus[0]}
v2 = {word: col.tf_idf(word, corpus[1]) for word in corpus[1]}
print("Jaccard weight -> ", jaccard_weight(v1, v2))
# レーベンシュタイン距離
def levenshtein_distance(str1, str2):
len1 = len(str1)
len2 = len(str2)
table = [[0] * (len2 + 1) for __ in range(len1 + 1)]
for i in range(len2 + 1):
table[0][i] = i
for i in range(len1 + 1):
table[i][0] = i
for i in range(1, len1 + 1):
for j in range(1, len2 + 1):
insert = table[i - 1][j] + 1
delete = table[i][j - 1] + 1
replace = table[i - 1][j - 1] + (0 if str1[i - 1] == str2[j - 1] else 1)
table[i][j] = min(insert, delete, replace)
return table[-1][-1]
print("Levenshtein distance -> ", levenshtein_distance('apple', 'play'))
print("Levenshtein distance -> ", levenshtein_distance('perl', 'pearl'))
# Soundex
def soundex(word):
soundex_digits = "01230120022455012623010202"
word = re.sub(r'[^A-Z]', '', word.upper())
if not word: return ''
first = word[0]
word = first + re.sub(r'[HW]', '', word[1:])
word = word.translate(str.maketrans(string.ascii_uppercase, soundex_digits))
word = (first + re.sub(r'(\d)\1+', r'\1', word)[1:]).replace('0', '')
return (word + '000')[:4]
print("Soundex -> ", soundex('Christiansen'))
print("Soundex -> ", soundex('Bi.rthday'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment