Last active
May 26, 2023 05:23
-
-
Save harryheman/96ba975793555e9e10f43786371d610a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import _films from './films' | |
// Наивный | |
const search = (data: string[], query: string) => { | |
if (!query) return [] | |
return data.filter((item) => | |
item.toLocaleLowerCase().includes(query.toLocaleLowerCase()), | |
) | |
} | |
// Токенизация | |
const tokenize = (str: string) => str.toLocaleLowerCase().split(/[\s.,!?:;]/) | |
const search = (data: string[], query: string) => { | |
if (!search) return [] | |
const terms = tokenize(query) | |
return data.filter((item) => { | |
const words = new Set(tokenize(item)) | |
return terms.every((term) => words.has(term)) | |
}) | |
} | |
// Стемминг | |
import { stemmer } from 'stemmer-ru' | |
const stemmerRu = new stemmer() | |
const tokenize = (str: string) => | |
str | |
.toLocaleLowerCase() | |
.split(/[\s.,!?:;]/) | |
.map((w) => stemmerRu.stemWord(w) || w) | |
.filter(Boolean) as string[] | |
const search = (data: string[], query: string) => { | |
if (!search) return [] | |
const terms = tokenize(query) | |
return data.filter((item) => { | |
const words = new Set(tokenize(item)) | |
return terms.every((term) => words.has(term)) | |
}) | |
} | |
// + индексация | |
const createIndex = (data: string[]) => | |
data.map((item) => new Set(tokenize(item))) | |
const search = ( | |
index: ReturnType<typeof createIndex>, | |
data: string[], | |
query: string, | |
) => { | |
if (!query) return [] | |
const terms = tokenize(query) | |
return data.filter((_, i) => terms.every((term) => index[i].has(term))) | |
} | |
// + вес | |
const createIndex = (data: string[]) => | |
data.reduce((acc, item, index) => { | |
tokenize(item).forEach((term) => { | |
if (!acc.has(term)) { | |
acc.set(term, new Set()) | |
} | |
acc.get(term)?.add(index) | |
}) | |
return acc | |
}, new Map<string, Set<number>>()) | |
type Index = ReturnType<typeof createIndex> | |
const getWeights = (data: string[], index: Index) => { | |
const preSearch = data.map((item) => index.get(item)).filter(Boolean) | |
const weights = preSearch.reduce((acc, item) => { | |
item?.forEach((num) => { | |
const count = acc.get(num) || 0 | |
acc.set(num, count + 1) | |
}) | |
return acc | |
}, new Map<number, number>()) | |
const entries = [...weights.entries()] | |
entries.sort((a, b) => b[1] - a[1]) | |
return entries | |
.map((row) => ({ index: row[0], weight: row[1] })) | |
.filter((i) => i.weight === data.length) | |
} | |
const search = (index: Index, data: string[], query: string) => { | |
if (!query) return [] | |
const terms = tokenize(query) | |
const weights = getWeights(terms, index) | |
return weights.map((row) => data[row.index]) | |
} | |
// + n-граммы | |
const gramm3 = (str: string) => { | |
return [...(' ' + str)] | |
.map((w, i, arr) => { | |
if (i - 2 >= 0) { | |
return arr[i - 2] + arr[i - 1] + w | |
} | |
return null | |
}) | |
.filter(Boolean) as string[] | |
} | |
const tokenize = (str: string) => { | |
return str | |
.toLocaleLowerCase() | |
.split(/[\s\.,!?]/) | |
.map((w) => stemmerRu.stemWord(w) || w) | |
.flatMap((w) => gramm3(w as string)) | |
.filter(Boolean) as string[] | |
} | |
const getWeights = (data: string[], index: Index) => { | |
const preSearch = data.map((term) => index.get(term)).filter(Boolean) | |
const weights = preSearch.reduce((acc, item) => { | |
item?.forEach((num) => { | |
const count = acc.get(num) || 0 | |
acc.set(num, count + 1) | |
}) | |
return acc | |
}, new Map<number, number>()) | |
const entries = [...weights.entries()] | |
entries.sort((a, b) => b[1] - a[1]) | |
return entries.map((row) => ({ index: row[0], weight: row[1] })) | |
} | |
const search = (index: Index, data: string[], search: string) => { | |
if (!search) return [] | |
const terms = tokenize(search) | |
const weights = getWeights(terms, index) | |
return weights | |
.filter((row) => row.weight > Math.max(0, terms.length - 2)) | |
.map((row) => data[row.index]) | |
} | |
// Расстояние Левенштейна (опечатки) | |
import { distance } from 'fastest-levenshtein' | |
const tokenize = (v: string) => | |
v | |
.toLocaleLowerCase() | |
.split(/[,\.: ]/) | |
.map((w) => stemmerRu.stemWord(w) || w) | |
const dist = (i: string, q: string) => | |
Math.min(...tokenize(i).map((w) => distance(tokenize(q)[0], w))) | |
const search = (data: string[], query: string) => { | |
if (!query) return [] | |
return data.filter( | |
(item) => dist(item, query) <= Math.min(Math.max(query.length - 5, 0), 2), | |
) | |
} | |
// Пример использования | |
const films = _films.slice(0, 10) | |
let query = '' | |
let filteredFilms = [] | |
const form = document.forms[0] as HTMLFormElement | |
const list = document.querySelector('ul') as HTMLUListElement | |
const renderFilms = (films: string[]) => { | |
list.innerHTML = '' | |
const regexp = new RegExp(query, 'gi') | |
for (const film of films) { | |
let listItemTemplate = `<li>${film}</li>` | |
// if (query.length > 0 && filteredFilms.length > 0) { | |
// const match = film.match(regexp) | |
// if (match) { | |
// listItemTemplate = listItemTemplate.replaceAll( | |
// regexp, | |
// `<mark>${match[0]}</mark>`, | |
// ) | |
// } | |
// } | |
list.insertAdjacentHTML('beforeend', listItemTemplate) | |
} | |
} | |
form.addEventListener('submit', (e) => { | |
e.preventDefault() | |
const q = (new FormData(form).get('query') as string).trim() | |
if (q.length === 0) { | |
renderFilms(films) | |
return | |
} | |
query = q | |
filteredFilms = search(films, query) | |
renderFilms(filteredFilms) | |
}) | |
renderFilms(films) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment