Skip to content

Instantly share code, notes, and snippets.

@harryheman
Last active May 26, 2023 05:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harryheman/96ba975793555e9e10f43786371d610a to your computer and use it in GitHub Desktop.
Save harryheman/96ba975793555e9e10f43786371d610a to your computer and use it in GitHub Desktop.
import _films from './films'
// Наивный
const search = (data: string[], query: string) => {
if (!query) return []
return data.filter((item) =>
item.toLocaleLowerCase().includes(query.toLocaleLowerCase()),
)
}
// Токенизация
const tokenize = (str: string) => str.toLocaleLowerCase().split(/[\s.,!?:;]/)
const search = (data: string[], query: string) => {
if (!search) return []
const terms = tokenize(query)
return data.filter((item) => {
const words = new Set(tokenize(item))
return terms.every((term) => words.has(term))
})
}
// Стемминг
import { stemmer } from 'stemmer-ru'
const stemmerRu = new stemmer()
const tokenize = (str: string) =>
str
.toLocaleLowerCase()
.split(/[\s.,!?:;]/)
.map((w) => stemmerRu.stemWord(w) || w)
.filter(Boolean) as string[]
const search = (data: string[], query: string) => {
if (!search) return []
const terms = tokenize(query)
return data.filter((item) => {
const words = new Set(tokenize(item))
return terms.every((term) => words.has(term))
})
}
// + индексация
const createIndex = (data: string[]) =>
data.map((item) => new Set(tokenize(item)))
const search = (
index: ReturnType<typeof createIndex>,
data: string[],
query: string,
) => {
if (!query) return []
const terms = tokenize(query)
return data.filter((_, i) => terms.every((term) => index[i].has(term)))
}
// + вес
const createIndex = (data: string[]) =>
data.reduce((acc, item, index) => {
tokenize(item).forEach((term) => {
if (!acc.has(term)) {
acc.set(term, new Set())
}
acc.get(term)?.add(index)
})
return acc
}, new Map<string, Set<number>>())
type Index = ReturnType<typeof createIndex>
const getWeights = (data: string[], index: Index) => {
const preSearch = data.map((item) => index.get(item)).filter(Boolean)
const weights = preSearch.reduce((acc, item) => {
item?.forEach((num) => {
const count = acc.get(num) || 0
acc.set(num, count + 1)
})
return acc
}, new Map<number, number>())
const entries = [...weights.entries()]
entries.sort((a, b) => b[1] - a[1])
return entries
.map((row) => ({ index: row[0], weight: row[1] }))
.filter((i) => i.weight === data.length)
}
const search = (index: Index, data: string[], query: string) => {
if (!query) return []
const terms = tokenize(query)
const weights = getWeights(terms, index)
return weights.map((row) => data[row.index])
}
// + n-граммы
const gramm3 = (str: string) => {
return [...(' ' + str)]
.map((w, i, arr) => {
if (i - 2 >= 0) {
return arr[i - 2] + arr[i - 1] + w
}
return null
})
.filter(Boolean) as string[]
}
const tokenize = (str: string) => {
return str
.toLocaleLowerCase()
.split(/[\s\.,!?]/)
.map((w) => stemmerRu.stemWord(w) || w)
.flatMap((w) => gramm3(w as string))
.filter(Boolean) as string[]
}
const getWeights = (data: string[], index: Index) => {
const preSearch = data.map((term) => index.get(term)).filter(Boolean)
const weights = preSearch.reduce((acc, item) => {
item?.forEach((num) => {
const count = acc.get(num) || 0
acc.set(num, count + 1)
})
return acc
}, new Map<number, number>())
const entries = [...weights.entries()]
entries.sort((a, b) => b[1] - a[1])
return entries.map((row) => ({ index: row[0], weight: row[1] }))
}
const search = (index: Index, data: string[], search: string) => {
if (!search) return []
const terms = tokenize(search)
const weights = getWeights(terms, index)
return weights
.filter((row) => row.weight > Math.max(0, terms.length - 2))
.map((row) => data[row.index])
}
// Расстояние Левенштейна (опечатки)
import { distance } from 'fastest-levenshtein'
const tokenize = (v: string) =>
v
.toLocaleLowerCase()
.split(/[,\.: ]/)
.map((w) => stemmerRu.stemWord(w) || w)
const dist = (i: string, q: string) =>
Math.min(...tokenize(i).map((w) => distance(tokenize(q)[0], w)))
const search = (data: string[], query: string) => {
if (!query) return []
return data.filter(
(item) => dist(item, query) <= Math.min(Math.max(query.length - 5, 0), 2),
)
}
// Пример использования
const films = _films.slice(0, 10)
let query = ''
let filteredFilms = []
const form = document.forms[0] as HTMLFormElement
const list = document.querySelector('ul') as HTMLUListElement
const renderFilms = (films: string[]) => {
list.innerHTML = ''
const regexp = new RegExp(query, 'gi')
for (const film of films) {
let listItemTemplate = `<li>${film}</li>`
// if (query.length > 0 && filteredFilms.length > 0) {
// const match = film.match(regexp)
// if (match) {
// listItemTemplate = listItemTemplate.replaceAll(
// regexp,
// `<mark>${match[0]}</mark>`,
// )
// }
// }
list.insertAdjacentHTML('beforeend', listItemTemplate)
}
}
form.addEventListener('submit', (e) => {
e.preventDefault()
const q = (new FormData(form).get('query') as string).trim()
if (q.length === 0) {
renderFilms(films)
return
}
query = q
filteredFilms = search(films, query)
renderFilms(filteredFilms)
})
renderFilms(films)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment