@Tenderfeel
Created August 23, 2021 06:42
Generate search keywords from Markdown files
/**
 * Generates flexsearch_index.json by extracting keywords from the
 * Markdown files in /src/data/blog/ and from /src/data/json/author.json.
 */
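/*
 * For reference (not part of the original logic): each entry pushed into the
 * generated documents array has the shape below. Field names come from
 * parseMarkdownFile(); the values are purely illustrative.
 *
 * {
 *   id: 1,
 *   title: 'Post title',
 *   date: '2021-08-23',
 *   author: 'Author Name(よみがな)',
 *   tag: ['tag1', 'tag2'],
 *   keywords: 'keyword1 keyword2 ...'
 * }
 */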
import { readdir, readFile, writeFile } from 'fs/promises'
import path from 'path'
import { tokenize } from 'kuromojin'
import removeMarkdown from 'remove-markdown'
import grayMatter from 'gray-matter'
import chalk from 'chalk'
import ora from 'ora'
const MD_DIRECTORY_PATH = './src/data/blog'
const AUTHOR_JSON_PATH = './src/data/json/author.json'
const DIST_FILE_PATH = './src/data/flexsearch_index.json'
const DICT_DIRECTORY_PATH = './src/data/dict'
const spinner = ora(`${chalk.bold('flexsearch_index.json create')}`).start(
  `Read ${chalk.bold(MD_DIRECTORY_PATH)} directory`
)
try {
  const files = await readdir(MD_DIRECTORY_PATH, { withFileTypes: true })
  const filterFiles = files.filter((file) => file.isFile())

  spinner
    .succeed(`Read ${chalk.bold(filterFiles.length)} markdown files.`)
    .start(`Read ${chalk.bold('author.json')}`)

  const author = await readFile(path.resolve(AUTHOR_JSON_PATH))
  const authorJSON = JSON.parse(author.toString('utf8'))

  const documents = []

  spinner.succeed().start('Tokenize markdown files...')

  // Process the files sequentially instead of with Promise.all so that per-file progress can be shown
  for (let i = 0; i < filterFiles.length; i++) {
    const file = filterFiles[i]
    spinner.start(`Start ${chalk.bold(`${file.name} tokenize`)} process...`)
    documents.push(await parseMarkdownFile(file, authorJSON))
    spinner.succeed(`${chalk.bold(file.name)} tokenized`)
  }
  spinner
    .succeed(`Complete ${chalk.bold('tokenize')} process!`)
    .start('JSON file export...')

  await writeFile(
    DIST_FILE_PATH,
    JSON.stringify({
      documents,
    })
  )

  spinner.succeed('JSON file export')
} catch (err) {
  spinner.fail()
  console.error(err)
}
/**
 * Extract the fields needed for the search index from a Markdown file
 */
async function parseMarkdownFile(file, authorJSON) {
  const fp = path.join(MD_DIRECTORY_PATH, file.name)
  const markdown = await readFile(fp)
  const matter = grayMatter(markdown)

  // Strip Markdown syntax from the body
  const content = removeMarkdown(matter.content)

  // Resolve the author entry referenced by the frontmatter
  const author = authorJSON.find((auth) => auth.id === matter.data.author)

  // Tokenize the title together with the body
  const str = `${matter.data.title}\n\n` + content

  const document = {
    id: matter.data.id,
    title: matter.data.title,
    date: matter.data.date,
    author:
      `${author?.name}` +
      (author?.name_reading ? `(${author?.name_reading})` : ''),
    tag: matter.data.tags,
    keywords: '',
  }

  const tokens = await tokenize(str, {
    dicPath: path.resolve(DICT_DIRECTORY_PATH),
  })
  document.keywords = createKeywords(tokens)

  return document
}
/**
 * Build the keyword string from kuromoji tokens
 */
function createKeywords(tokens) {
  const allTokens = []

  tokens.filter(tokenFilter).forEach((token) => {
    // Surface form
    if (!allTokens.includes(token.surface_form)) {
      allTokens.push(token.surface_form)
    }

    // Reading (fall back to the surface form when kuromoji has no reading)
    const reading = token.reading || token.surface_form

    // Convert the katakana reading to hiragana (the two blocks are offset by 0x60)
    const hira = reading.replace(/[\u30A2-\u30F3]/g, (m) =>
      String.fromCharCode(m.charCodeAt(0) - 96)
    )

    // Basic (dictionary) form
    if (
      token.surface_form !== token.basic_form &&
      token.basic_form !== '*' &&
      !allTokens.includes(token.basic_form)
    ) {
      allTokens.push(token.basic_form)
    }

    // Hiragana form
    if (token.surface_form !== hira && !allTokens.includes(hira)) {
      allTokens.push(hira)
    }
  })

  return allTokens.join(' ')
}
/**
 * Filter that decides whether a token is kept as a keyword
 */
function tokenFilter(token) {
  // Placeholders Gatsby inserts for images with empty alt text
  const keywords = ['GATSBYEMPTYALT', 'GATSBYEMPTYALTPresenter']

  // Drop anything that is not a noun, verb, or adjective, as well as
  // punctuation/symbol-only tokens, single-character tokens, and the placeholders above
  if (
    !['名詞', '動詞', '形容詞'].includes(token.pos) ||
    /^[!-/:-@[-`{-~、-〜”’・.,_\s\u02B0-\u02FF\u2010-\u27FF\u3001-\u303F\uFF01-\uFF0F\uFF1A-\uFF1E\uFF3B-\uFF40\uFF5B-\uFF65]+$/g.test(
      token.surface_form
    ) ||
    token.surface_form.length < 2 ||
    keywords.includes(token.surface_form)
  ) {
    return false
  }

  switch (token.pos) {
    case '名詞': // noun
    case '形容詞': // adjective
      return true
    case '動詞': // verb: skip the basic, continuative, and hypothetical conjugated forms
      return !['基本形', '連用形', '仮定形'].includes(token.conjugated_form)
  }
}
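As a reference only (not part of the gist), here is a minimal sketch of how the generated flexsearch_index.json might be loaded into a search index. It assumes FlexSearch's v0.7-style Document API (FlexSearch.Document with document.id / document.index options and add / search methods); the import style and the query string are illustrative, so adjust them to the installed version.

import { readFile } from 'fs/promises'
import FlexSearch from 'flexsearch'

const { documents } = JSON.parse(
  await readFile('./src/data/flexsearch_index.json', 'utf8')
)

// Index the fields produced by the script above
const index = new FlexSearch.Document({
  document: {
    id: 'id',
    index: ['title', 'keywords'],
  },
})

documents.forEach((doc) => index.add(doc))

// Example query: results are grouped by matched field
console.log(index.search('検索'))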