Skip to content

Instantly share code, notes, and snippets.

@shuntaroy
Created March 29, 2022 11:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuntaroy/0d3672431379c39ddf192fc6270d3207 to your computer and use it in GitHub Desktop.
Save shuntaroy/0d3672431379c39ddf192fc6270d3207 to your computer and use it in GitHub Desktop.
Naive implementations of some classical, information theoretic keyword extraction methods
"""Gamma Index.
Zhou and Slater 2002"""
from typing import List
import numpy as np
import sigma_index as s
def avg_sep(spans: List[int]) -> List[float]:
    """Return the midpoint of each adjacent pair of spans.

    For spans [s0, s1, s2, ...] yields [(s0+s1)/2, (s1+s2)/2, ...].
    """
    adjacent = zip(spans, spans[1:])
    return [(left + right) / 2 for left, right in adjacent]
def delta(d: float, mean: float) -> bool:
    """Indicator: True when separation `d` falls strictly below `mean`.

    The comparison already yields a bool; the original if/else returning
    True/False was redundant.
    """
    return d < mean
def nu(d: float, mean: float) -> float:
    """Closeness weight: the fraction by which `d` undershoots `mean`."""
    shortfall = mean - d
    return shortfall / mean
def gamma(avg_seps: List[float], mean: float) -> float:
    """Gamma index: mean closeness weight over separations below the mean.

    Averages nu(d, mean) over every averaged separation d that is below
    the mean separation (per delta).  Returns 0.0 when no separation
    qualifies — the original `np.mean([])` produced nan and emitted a
    RuntimeWarning, which then poisoned the Counter ranking.
    """
    weights = [nu(d, mean) for d in avg_seps if delta(d, mean)]
    if not weights:
        return 0.0
    return float(np.mean(weights))
# TODO: need to normalise
if __name__ == '__main__':
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter

    # Usage: python gamma_index.py <json file with a "body" text field>
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body'].split()
    N = len(text)
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = s.extract_occurence(text, word)
        spans = s.make_spans(poslist)
        # Boundary spans for virtual occurrences at the text edges.
        # NOTE(review): the boundary values (0 and N + 1) look suspicious
        # versus the paper's formulation — confirm against Zhou & Slater 2002.
        spans.insert(0, 0)
        # BUG FIX: the original `spans.insert(-1, N + 1)` placed the
        # trailing boundary span *before* the last element; it belongs
        # at the end of the list.
        spans.append(N + 1)
        n = len(poslist)
        avg_seps = avg_sep(spans)
        # Expected mean separation for n occurrences in N tokens.
        mean = (N + 1) / (n + 1)
        indices[word] = gamma(avg_seps, mean)
    print(indices.most_common(25))
"""sigma index.
Ortuño, M., Carpena, P., Bernaola-Galván, P., Muñoz, E., & Somoza, A. M. (2002).
Keyword detection in natural languages and DNA. Europhysics Letters (EPL), 57, 759–764."""
from typing import Dict, List
import numpy as np
def extract_occurence(text: List[str], word: str) -> List[int]:
    """Extract positions of occurrences of the input word from the input text.

    `text` is assumed to be normalised, and `word` is assumed to follow
    the same normalisation as `text`.
    """
    return [pos for pos, token in enumerate(text) if token == word]
def make_spans(poslist: List[int]) -> List[int]:
    """Make a list of spans/lengths between word occurence.

    e.g. positions 6 8 14 20 30  ->  spans 2 6 6 10
    """
    # zip stops at the shorter argument, so the result has len-1 entries.
    return [later - earlier for earlier, later in zip(poslist, poslist[1:])]
def p(x: int, spans: List[int]) -> float:
    """Return the relative frequency of occurrence of a given separation x.

    Returns 0.0 for empty `spans` — the original raised ZeroDivisionError.
    """
    n = len(spans)
    if n == 0:
        return 0.0
    # list.count replaces the hand-rolled filter; the original
    # `if n_i > 0` guard was redundant since 0 / n is already 0.
    return spans.count(x) / n
def P(x: int, spans: List[int], x_i: int=1) -> float:
    """Integrated distribution function of p(x).

    Sums p over the inclusive separation range [x_i, x].
    """
    # 0.0 start keeps the float accumulator of the original loop.
    return sum((p(sep, spans) for sep in range(x_i, x + 1)), 0.0)
def Ps(s: float, spans: List[int]) -> float:
    """Integrated distribution function of p(s) where s is normalised x (= x/mean(x))."""
    # Undo the normalisation: s = x / mean(x)  =>  x = s * mean(x).
    x = np.mean(spans) * s
    # Debug output retained from the original implementation.
    print('restored x =', x)
    print(f'execute P({int(x)})')
    return P(int(x), spans)
def Ps_rand():
    # Placeholder: integrated distribution P(s) under random word
    # placement — not yet implemented.
    pass
def sigma(n: int, N:int, spans: List[int]) -> float:
    """Sigma index.

    Herrera and Pury (2008) version: the standard deviation of the spans
    divided by the expected mean separation for n occurrences in N tokens.
    """
    expected_mean = (N + 1) / (n + 1)
    spread = np.std(spans)
    return spread / expected_mean
def sigma_rand(n: int, N:int) -> float:
    """Expected sigma index for a word placed at random (n occurrences in N tokens)."""
    density = n / N
    return np.sqrt(1 - density)
def sigma_nor(n: int, N:int, spans: List[int]) -> float:
    """Sigma index normalised by its expected value under random placement."""
    observed = sigma(n, N, spans)
    baseline = sigma_rand(n, N)
    return observed / baseline
if __name__ == '__main__':
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter

    # Usage: python sigma_index.py <json file with a "body" text field>
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body'].split()
    N = len(text)
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = extract_occurence(text, word)
        spans = make_spans(poslist)
        # Boundary spans for virtual occurrences at the text edges.
        # NOTE(review): the boundary values (0 and N + 1) look suspicious
        # versus Ortuno et al. / Herrera & Pury — confirm against the papers.
        spans.insert(0, 0)
        # BUG FIX: the original `spans.insert(-1, N + 1)` placed the
        # trailing boundary span *before* the last element; it belongs
        # at the end of the list.
        spans.append(N + 1)
        n = len(poslist)
        indices[word] = sigma(n, N, spans)
    print(indices.most_common(25))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment