@paithiov909
Last active February 19, 2024
Scripts
akita <- jisx0402::municipality |>
dplyr::filter(
pref_code == "05",
is.na(end_date),
!stringr::str_ends(name, "郡")
) |>
dplyr::mutate(
muni_code = paste0(pref_code, city_code),
muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
start_year = lubridate::year(start_date)
) |>
dplyr::select(muni_code, name, start_year) |>
dplyr::arrange(desc(start_year))
year <- max(akita$start_year, na.rm = TRUE)
dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
dplyr::filter(.data$year >= .env$year) |>
dplyr::right_join(akita, by = c("code" = "muni_code"))
require(ggplot2)
dat |>
dplyr::filter(name == "秋田市", age == "all") |>
ggplot(aes(year, population)) +
geom_line(aes(colour = sex)) +
scale_y_log10()
ga <- jisx0402::jptopography("designated") |>
dplyr::filter(stringr::str_starts(muni_code, "05")) |>
(\(d) {
  dplyr::mutate(d,
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    area = units::set_units(sf::st_area(d), km^2)
  )
})() |>
dplyr::right_join(
dplyr::filter(dat, age == "all"),
by = c("muni_code" = "code")
) |>
dplyr::group_by(sex) |>
dplyr::mutate(
denst = round(population / area),
rank = dplyr::percent_rank(denst) * 10
) |>
dplyr::ungroup() |>
ggplot() +
geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
facet_wrap(~ sex) +
labs(
title = "Rank of Population Density in Akita, throughout 2006-2023",
subtitle = "Year: {frame_time}",
caption = paste(
"地図データ:「国土数値情報 行政区域データ」(国土交通省)",
"https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
sep = "\n"
)
) +
theme_light() +
gganimate::transition_time(year)
gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("akita-denst1.mp4", path = "Videos")
tokyo <- jisx0402::municipality |>
dplyr::filter(
pref_code == "13",
is.na(end_date),
!stringr::str_ends(name, "郡|(特別区)|(支庁)")
) |>
dplyr::mutate(
muni_code = paste0(pref_code, city_code),
muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
start_year = lubridate::year(start_date)
) |>
dplyr::select(muni_code, name, start_year) |>
dplyr::arrange(desc(start_year))
year <- max(tokyo$start_year, na.rm = TRUE)
dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
dplyr::filter(.data$year >= .env$year) |>
dplyr::right_join(tokyo, by = c("code" = "muni_code"))
require(ggplot2)
dat |>
dplyr::filter(age == "all", stringr::str_detect(name, "区"),
year > 2005L) |>
ggplot(aes(year, population)) +
geom_line(aes(colour = sex)) +
facet_wrap(~ name) +
scale_y_log10()
ga <- jisx0402::jptopography("all", resolution = 1) |>
dplyr::filter(stringr::str_starts(muni_code, "13")) |>
(\(d) {
  dplyr::mutate(d,
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    area = units::set_units(sf::st_area(d), km^2)
  )
})() |>
dplyr::right_join(
dplyr::filter(dat, age == "all", stringr::str_detect(name, "区")),
by = c("muni_code" = "code")
) |>
dplyr::group_by(sex) |>
dplyr::mutate(
denst = round(population / area),
rank = dplyr::percent_rank(denst) * 10
) |>
dplyr::ungroup() |>
ggplot() +
geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
facet_wrap(~ sex) +
labs(
title = "Rank of Population Density in Tokyo Special Wards, throughout 2006-2023",
subtitle = "Year: {frame_time}",
caption = paste(
"地図データ:「国土数値情報 行政区域データ」(国土交通省)",
"https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
sep = "\n"
)
) +
theme_light() +
gganimate::transition_time(year, range = c(2006L, 2023L))
gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("tokyo23-denst1.mp4", path = "Videos")
#' @export
accelerate <- function(text = "精神を加速させろ") {
r"(
            ∩_∩
           / \ /\
          |(゚)=(゚)|
          |  ●_●  |
         /      ヽ
       r⌒| 〃 ------ ヾ |
      /  i/ |_二__ノ
     ./  /  /      )     {text}
     ./ /  /      //
    /   ./     / ̄
    .ヽ、__./     / ⌒ヽ
        r    /     |
      /          ノ
     /      /    /
    ./    //   /
    /.   ./ ./  /
   i   / ./ /
   i  ./ .ノ.^/
   i  ./  |_/
   i /
  / /
  (_/
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
#' @export
qb <- function(text = "わけがわからないよ") {
r"(
        |\           /|
        |\\       //|
       :  ,> `´ ̄`´ <  ′
.       V            V
.       i< ●      ● >i
       八    、_,_,     八    {text}
.       / 个 . _  _ . 个 ',
   _/   il   ,'    '.  li  ',__
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
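# Usage sketch: both helpers return glue strings, so print them with cat()
# to render the ASCII art with its line breaks.
cat(accelerate())
cat(qb())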
#!/usr/bin/env Rscript
ID <- commandArgs(trailingOnly = TRUE)[1]
TYPE <- commandArgs(trailingOnly = TRUE)[2]
aznyan::get_lyrics_list(ID, if (!is.na(TYPE)) TYPE else NA) |>
aznyan::get_lyrics(paste0(ID, ".csv"))
#' Environment for internal use
#'
#' @noRd
#' @keywords internal
.env <- rlang::env(server = NULL)
#' @noRd
reset_encoding <- function(chr, encoding = "UTF-8") {
Encoding(chr) <- encoding
return(chr)
}
#' Call lattice command
#'
#' A debugging tool for the tokenization process; outputs a lattice in Graphviz DOT format.
#'
#' @param sentences Character vector.
#' @param ... Other arguments are passed to \code{DiagrammeR::grViz}.
#'
#' @export
get_lattice <- function(sentences, ...) {
  sentences <- stringi::stri_omit_na(sentences)
  dot <- processx::run("kagome", c("lattice", stringi::stri_c(sentences, collapse = " ")))$stdout
  DiagrammeR::grViz(reset_encoding(dot), ...)
}
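# Usage sketch: render the lattice for one sentence. Assumes the `kagome`
# CLI is installed and on PATH; the input string is just an example.
get_lattice("すもももももももものうち")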
#' Launch Kagome Server
#'
#' Starts or kills the Kagome server process.
#' @param dict Dictionary the Kagome server uses. Defaults to 'ipa' (the IPA dictionary).
#' @return Logical value indicating whether the Kagome server is alive, returned invisibly.
#'
#' @export
lanch_server <- function(dict = c("ipa", "uni")) {
  p <- rlang::env_get(.env, "server")
  if (is.null(p)) {
    dict <- rlang::arg_match(dict)
    p <- processx::process$new("kagome", c("server", "-dict", dict))
    rlang::env_bind(.env, server = p)
  } else if (p$is_alive()) {
    kill <- yesno::yesno("Kagome server is already alive. Do you want to kill its process?")
    if (kill) {
      p$kill()
      rlang::env_bind(.env, server = NULL)
    }
  }
  return(invisible(p$is_alive()))
}
#' Send a HEAD request to the Kagome server
#'
#' @param url URL of the Kagome server as a character scalar.
#' @return An httr2 response, returned invisibly.
#'
#' @export
ping <- function(url = "http://localhost:6060") {
resp <-
httr2::request(url) |>
httr2::req_method("HEAD") |>
httr2::req_perform()
return(invisible(resp))
}
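# Usage sketch: start the server, then check that it responds.
# Assumes the `kagome` binary is available; 6060 is Kagome's default port.
lanch_server("ipa")
ping()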
#' Send a PUT request to the tokenize API
#'
#' @param sentences Character vector to be analyzed.
#' @param url URL of Kagome server.
#' @param mode One of `normal`, `search` or `extended`.
#' @return A tibble.
#'
#' @export
tokenize <- function(sentences,
url = "http://localhost:6060/tokenize",
mode = c("normal", "search", "extended")) {
mode <- rlang::arg_match(mode)
sentences <-
stringi::stri_omit_na(sentences) |>
stringi::stri_split_boundaries(type = "sentence") |>
purrr::flatten_chr()
resps <-
furrr::future_imap_dfr(sentences, ~ tokenize_impl(.x, .y, url, mode)) |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "*"))) |>
dplyr::mutate_at(c("doc_id", "class"), as.factor) |>
tibble::as_tibble()
return(resps)
}
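# Usage sketch: tokenize two short sentences against the running server.
# Assumes the server started via lanch_server() is listening on port 6060.
tokenize(c("こんにちは。", "猫を飼っています。"))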
#' @noRd
tokenize_impl <- function(msg, idx, url = "http://localhost:6060/tokenize", mode = "normal") {
resp <-
httr2::request(url) |>
httr2::req_body_json(list(
sentence = msg,
mode = mode
)) |>
httr2::req_method("PUT") |>
httr2::req_error(function(resp) httr2::resp_status(resp) > 400) |>
httr2::req_perform()
return(
purrr::map_dfr(
httr2::resp_body_json(resp)$tokens,
~ data.frame(
doc_id = idx,
id = .$id,
start = .$start,
end = .$end,
class = .$class,
token = .$surface,
POS1 = .$pos[[1]],
POS2 = .$pos[[2]],
POS3 = .$pos[[3]],
POS4 = .$pos[[4]],
Original = .$base_form,
Yomi1 = .$reading,
Yomi2 = .$pronunciation
)
)
)
}
.onUnload <- function(libpath) {
p <- rlang::env_get(.env, "server")
if (!is.null(p) && p$is_alive()) {
p$kill()
}
}
texts <- ldccr::read_aozora(
"https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
directory = tempdir()
)
sent <- readr::read_lines(texts) |>
gibasa::tokenize(split = TRUE) |>
gibasa::prettify() |>
dplyr::filter(!POS1 %in% c("記号")) |>
dplyr::group_by(doc_id, sentence_id) |>
dplyr::reframe(
text = paste0(token, collapse = " ")
) |>
dplyr::mutate(
  sentence_id = paste(doc_id, sentence_id, sep = "_")
)
### lexRankr ----
tok <- sent |>
tidytext::unnest_tokens(
token,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
)
simil_df <- lexRankr::sentenceSimil(
sentenceId = tok$sentence_id,
token = tok$token,
docId = tok$doc_id
)
top_n_sent <-
lexRankr::lexRankFromSimil(
simil_df$sent1,
simil_df$sent2,
simil = simil_df$similVal,
n = 10,
usePageRank = TRUE,
continuous = TRUE,
# threshold = 0.1,
returnTies = FALSE
)
top_n_sent |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
### replace sentenceSimil ----
mat <-
sent |>
tidytext::unnest_tokens(
term,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
) |>
dplyr::count(doc_id, term) |>
tidytext::bind_tf_idf(term, doc_id, n) |>
dplyr::mutate(tf_idf = n * (idf + 1)) |> # overwrite tf_idf: raw term count times smoothed idf
dplyr::inner_join(
sent |>
tidytext::unnest_tokens(
term,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
) |>
dplyr::select(doc_id, sentence_id, term),
by = c("doc_id" = "doc_id", "term" = "term")
) |>
tidytext::cast_sparse(sentence_id, term, tf_idf)
dt <- proxyC::simil(
mat,
method = "cosine",
# min_simil = 0.1,
# rank = 50,
use_nan = FALSE
)
dt |>
tidytext:::tidy.dfm() |>
dplyr::rename(
s1 = document,
s2 = term,
weight = count
) |>
(\(simil_df) {
lexRankr::lexRankFromSimil(
simil_df$s1,
simil_df$s2,
simil = simil_df$weight,
n = nrow(simil_df),
usePageRank = TRUE,
continuous = TRUE,
# threshold = 0.1,
returnTies = TRUE
)
})() |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
### replace lexRankFromSimil ----
dt |>
tidytext:::tidy.dfm() |>
dplyr::rename(
s1 = document,
s2 = term,
weight = count
) |>
(\(simil_df) {
pr <- simil_df |>
igraph::graph_from_data_frame(
directed = FALSE
) |>
igraph::page_rank(
directed = FALSE,
damping = .85
) |>
purrr::pluck("vector")
tibble::tibble(
sentenceId = names(pr),
value = unname(pr)
)
})() |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
require(sudachir)
require(apportita)
strj_split_boundaries <- \(x) {
stringi::stri_split_boundaries(
x,
opts_brkiter = stringi::stri_opts_brkiter(
locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
)
)
}
split_kugire <- \(x) {
strj_split_boundaries(x) |>
purrr::map(\(elem) {
len <- length(elem)
if (len < 2) {
return(NA_character_)
} else {
sapply(seq_len(len - 1), \(i) {
s1 <- paste0(elem[1:i], collapse = "")
s2 <- paste0(elem[(i + 1):len], collapse = "")
paste(s1, s2, sep = "\t")
})
}
}) |>
unlist()
}
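# Worked example (input string is arbitrary): split_kugire() returns every
# way of cutting a string into a left part and a right part at an ICU phrase
# boundary, the two parts joined by a tab; single-phrase inputs yield NA.
split_kugire("祇園精舎の鐘の声諸行無常の響きあり")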
sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
unlist(sudachir::form(x, type = "normalized", pos = FALSE,
instance = sudachi))
}
wrd <- \(conn, s1, s2) {
purrr::map2_dbl(s1, s2, \(el1, el2) {
el1 <- form(el1)
el2 <- form(el2)
apportita::calc_wrd(conn, el1, el2)
})
}
conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)
### tanka72 ----
dat <-
readxl::read_xlsx("data/tanka/tanka72.xlsx") |>
dplyr::transmute(
id = factor(id),
body = audubon::strj_normalize(body) |>
stringr::str_remove_all("[^[:alnum:]]+"),
author = factor(author)
)
ret <- dat |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
author = author,
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
ret <- ret |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(ret, "tanka72-wrd.parquet")
### tweets ----
tweets <-
readr::read_csv(
"data/shinabitanori-230622.csv.gz",
col_names = c("id", "time", "body"),
col_types = "ccc"
) |>
dplyr::filter(
!stringr::str_detect(body, "@|(RT)"),
stringr::str_length(body) > 14,
stringr::str_length(body) < 30
) |>
dplyr::slice_sample(n = 100) |>
dplyr::transmute(
id = factor(id),
body = audubon::strj_normalize(body) |>
ldccr::clean_url() |>
stringr::str_remove_all("[^[:alnum:]]+")
) |>
dplyr::filter(
!stringi::stri_isempty(body)
)
tweets <- tweets |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
tweets <- tweets |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(tweets, "shinabitanori-wrd.parquet")
close(conn)
rm(conn)
require(sudachir)
require(apportita)
strj_split_boundaries <- \(x) {
stringi::stri_split_boundaries(
x,
opts_brkiter = stringi::stri_opts_brkiter(
locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
)
)
}
split_kugire <- \(x) {
strj_split_boundaries(x) |>
purrr::map(\(elem) {
len <- length(elem)
if (len < 2) {
return(NA_character_)
} else {
sapply(seq_len(len - 1), \(i) {
s1 <- paste0(elem[1:i], collapse = "")
s2 <- paste0(elem[(i + 1):len], collapse = "")
paste(s1, s2, sep = "\t")
})
}
}) |>
unlist()
}
sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
unlist(sudachir::form(x, type = "normalized", pos = FALSE,
instance = sudachi))
}
wrd <- \(conn, s1, s2) {
purrr::map2_dbl(s1, s2, \(el1, el2) {
el1 <- form(el1)
el2 <- form(el2)
tryCatch(
apportita::calc_wrd(conn, el1, el2),
error = \(e) {
NA
}
)
}, .progress = TRUE)
}
conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)
### tanka ----
dat <-
readxl::read_excel(
"data/tanka/1001-1500_poems.xlsx",
col_names = c("id", "poems", "names", "loves", "likes", "keys1", "keys2")
) |>
# dplyr::slice_sample(n = 20) |>
dplyr::transmute(
id = id,
names = stringr::str_remove_all(names, "<U\\+[0-9A-F]+>") |>
audubon::strj_normalize() |>
stringr::str_remove_all("[^[:alnum:]]+"),
body = stringr::str_remove_all(poems, "<U\\+[0-9A-F]+>") |>
audubon::strj_normalize() |>
stringr::str_remove_all("[^[:alnum:]]+")
) |>
dplyr::filter(
!stringi::stri_isempty(body)
)
dat <- dat |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
arrow::write_parquet(dat, "tanka-kugire.parquet")
print("区切り処理終わり")
dat <- dat |>
tidyr::drop_na() |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(dat, "tanka-wrd.parquet")
close(conn)
rm(conn)
clean_url <- \(text) {
pat <- "(https?|ftp)://([A-Za-z0-9-]+\\.)+[A-Za-z0-9-]+(/[A-Za-z0-9./?%&=~-]*)?"
stringr::str_replace_all(text, pattern = pat, replacement = "URL")
}
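# Usage sketch: mask every URL in free text with the literal "URL" before
# normalization/tokenization (the input string is just an example).
clean_url("参考: https://example.com/foo?bar=1 を見てください")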
to_df <- \(jsonlist) {
purrr::map(jsonlist, \(elem) {
text <- elem$rendered_body |>
rvest::read_html() |>
rvest::html_elements("p") |>
rvest::html_text() |>
paste(collapse = "\n")
tags <- elem$tags |>
purrr::map_chr(~ .x$name) |>
paste(collapse = ",")
tibble::tibble(
"doc_id" = elem$id,
"created" = elem$created_at,
"updated" = elem$updated_at,
"author" = elem$user$permanent_id,
"title" = audubon::strj_normalize(elem$title),
"text" = audubon::strj_normalize(text),
"tags" = tags,
"comments" = elem$comments_count,
"likes" = elem$likes_count,
"reactions" = elem$reactions_count,
"stocks" = elem$stocks_count)
}) |>
purrr::list_rbind() |>
dplyr::filter(
cld3::detect_language(text) == "ja"
) |>
dplyr::mutate(
created = lubridate::as_date(created),
updated = lubridate::as_date(updated)
)
}
PAGES <- ceiling(4642 / 100)
df <- seq_len(PAGES) |>
purrr::map(\(i) {
li <- qiitr::qiita_get_items(
tag_id = "r",
per_page = 100, # range: 1-100
page_offset = (i - 1), # 1 + page_offset < 100
page_limit = 1
)
if (i %% 16 == 0) {
rlang::inform(sprintf("Sleeping 16 minutes; fetched %d pages so far.", i))
Sys.sleep(60 * 16)
}
return(to_df(li))
}, .progress = TRUE) |>
purrr::list_rbind()
arrow::write_parquet(df, here::here("data/qiita.parquet"))
tbl <- readxl::read_xlsx("Downloads/koe.xlsx") |>
tibble::rowid_to_column(var = "doc_id") |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "NA"))) |>
tidyr::drop_na()
tbl |>
dplyr::select(Region, Sex, Age, Satis) |>
DataExplorer::plot_bar()
tbl <- tbl |>
dplyr::mutate(
doc_id = factor(doc_id),
Region = factor(Region),
Sex = factor(Sex),
Age = stringi::stri_trans_nfkc(Age),
Age = dplyr::case_match(
Age,
c("10代", "20代") ~ "20",
"30代" ~ "30",
"40代" ~ "40",
"50代" ~ "50",
c("60代", "70代") ~ "60"
),
Age = factor(Age),
Satis = stringr::str_detect(Satis, "満足"),
Opinion = audubon::strj_normalize(Opinion) |>
stringr::str_replace_all("[[:number:]]+", "N")
)
summary(tbl)
df <- tbl |>
dplyr::select(!Region) |>
gibasa::tokenize(Opinion) |>
gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
gibasa::mute_tokens(
(!POS1 %in% c("名詞", "形容詞")) |
(token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
) |>
gibasa::pack()
corp <- df |>
dplyr::right_join(
tbl |>
dplyr::mutate(Attr = paste(Age, Sex, sep = "_"), .keep = "unused"),
by = "doc_id"
) |>
quanteda::corpus()
require(ca)
cam <- corp |>
quanteda::tokens(what = "fastestword") |>
quanteda::dfm() |>
quanteda::dfm_group(groups = quanteda::docvars(corp, "Attr")) |>
quanteda::dfm_trim(min_termfreq = 10, max_termfreq = 50) |>
quanteda.textmodels::textmodel_ca()
plot(cam)
require(ggplot2)
dplyr::bind_rows(
as.data.frame(cam$rowcoord),
as.data.frame(cam$colcoord)
) |>
tibble::rownames_to_column() |>
dplyr::mutate(shape = stringr::str_detect(rowname, "_")) |>
ggplot(aes(Dim1, Dim2, shape = shape, colour = shape)) +
geom_vline(xintercept = 0, colour = "grey", linetype = "dashed") +
geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
geom_point() +
ggrepel::geom_text_repel(aes(label = rowname), max.overlaps = 20) +
theme_light() +
theme(legend.position = "none")
df <- tbl |>
dplyr::select(!Region) |>
gibasa::tokenize(Opinion) |>
gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
gibasa::mute_tokens(
(!POS1 %in% c("名詞", "形容詞")) |
(token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
) |>
dtplyr::lazy_dt() |>
dplyr::group_by(doc_id) |>
dplyr::mutate(term1 = token, term2 = dplyr::lead(term1)) |>
dplyr::add_count(term1, term2, name = "cooc") |>
dplyr::ungroup() |>
dplyr::select(doc_id, token, term1, term2, cooc)
biterms <- df |>
dplyr::select(doc_id, term1, term2, cooc) |>
tidyr::drop_na() |>
dplyr::as_tibble()
model <- df |>
dplyr::select(doc_id, token) |>
dplyr::as_tibble() |>
BTM::BTM(k = 5, background = TRUE, biterms = biterms)
plot(model, top_n = 20)
# R Benchmark 2.5 (06/2008) [Simon Urbanek]
# version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro
# R Benchmark 2.4 (06/2008) [Simon Urbanek]
# version 2.4 adapted to more recent Matrix package
# R Benchmark 2.3 (21 April 2004)
# Warning: changes are not carefully checked yet!
# version 2.3 adapted to R 1.9.0
# Many thanks to Douglas Bates (bates@stat.wisc.edu) for improvements!
# version 2.2 adapted to R 1.8.0
# version 2.1 adapted to R 1.7.0
# version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2
# using the standard ATLAS library (Rblas.dll)
# on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro
# revised and optimized for R v. 1.5.x, 8 June 2002
# Requires additional libraries: Matrix, SuppDists
# Author : Philippe Grosjean
# eMail : phgrosjean@sciviews.org
# Web : http://www.sciviews.org
# License: GPL 2 or above at your convenience (see: http://www.gnu.org)
#
# Several tests are adapted from the Splus Benchmark Test V. 2
# by Stephan Steinhaus (stst@informatik.uni-frankfurt.de)
# Reference for Escoufier's equivalents vectors (test III.5):
# Escoufier Y., 1970. Echantillonnage dans une population de variables
# aléatoires réelles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47.
#
# type source("c:/<dir>/R2.R") to start the test
runs <- 3 # Number of times the tests are executed
times <- rep(0, 15); dim(times) <- c(5,3)
require(Matrix) # Optimized matrix operations
#require(SuppDists) # Optimized random number generators
#Runif <- rMWC1019 # The fast uniform number generator
Runif <- runif
# If you don't have SuppDists, you can use: Runif <- runif
#a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator
#Rnorm <- rziggurat # The fast normal number generator
# If you don't have SuppDists, you can use: Rnorm <- rnorm
#b <- rziggurat(10, new.start=TRUE) # Init. the generator
Rnorm <- rnorm
#remove("a", "b")
options(object.size=100000000)
cat("\n\n R Benchmark 2.5\n")
cat(" ===============\n")
cat(c("Number of times each test is run__________________________: ", runs))
cat("\n\n")
cat(" I. Matrix calculation\n")
cat(" ---------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (1)
cumulate <- 0; a <- 0; b <- 0
for (i in 1:runs) {
invisible(gc())
timing <- system.time({
a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500);
b <- t(a);
dim(b) <- c(1250, 5000);
a <- t(b)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 1] <- timing
cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500));
invisible(gc())
timing <- system.time({
b <- a^1000
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 1] <- timing
cat(c("2400x2400 normal distributed random matrix ^1000____ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(7000000)
invisible(gc())
timing <- system.time({
b <- sort(a, method="quick") # Sort is modified in v. 1.5.x
# And there is now a quick method that better competes with other packages!!!
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 1] <- timing
cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800)
invisible(gc())
timing <- system.time({
b <- crossprod(a) # equivalent to: b <- t(a) %*% a
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 1] <- timing
cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; c <- 0; qra <-0
for (i in 1:runs) {
a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000,2000)))
b <- as.double(1:2000)
invisible(gc())
timing <- system.time({
c <- solve(crossprod(a), crossprod(a,b))
})[3]
cumulate <- cumulate + timing
# This is the old method
#a <- Rnorm(600*600); dim(a) <- c(600,600)
#b <- 1:600
#invisible(gc())
#timing <- system.time({
# qra <- qr(a, tol = 1e-7);
# c <- qr.coef(qra, b)
# #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients
#})[3]
#cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 1] <- timing
cat(c("Linear regr. over a 3000x3000 matrix (c = a \\ b')___ (sec): ", timing, "\n"))
remove("a", "b", "c", "qra")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 1] <- sort(times[ , 1])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n"))
cat(" II. Matrix functions\n")
cat(" --------------------\n")
if (R.Version()$os == "Win32") flush.console()
# (1)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2400000)
invisible(gc())
timing <- system.time({
b <- fft(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 2] <- timing
cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- array(Rnorm(600*600), dim = c(600, 600))
# Only needed if using eigen.Matrix(): Matrix.class(a)
invisible(gc())
timing <- system.time({
b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value
# Rem: on my machine, it is faster than:
# b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value
# b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value
# b <- eigen.Matrix(a, vectors = F)$Value
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 2] <- timing
cat(c("Eigenvalues of a 640x640 random matrix______________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500)
#Matrix.class(a)
invisible(gc())
timing <- system.time({
#b <- determinant(a, logarithm=F)
# Rem: the following is slower on my computer!
# b <- det.default(a)
b <- det(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 2] <- timing
cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000),
Dim = as.integer(c(3000, 3000))))
invisible(gc())
#a <- Rnorm(900*900); dim(a) <- c(900, 900)
#a <- crossprod(a, a)
timing <- system.time({
b <- chol(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 2] <- timing
cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600)))
invisible(gc())
#a <- Rnorm(400*400); dim(a) <- c(400, 400)
timing <- system.time({
# b <- qr.solve(a)
# Rem: a little faster than
b <- solve(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 2] <- timing
cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 2] <- sort(times[ , 2])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n"))
cat(" III. Programmation\n")
cat(" ------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (1)
cumulate <- 0; a <- 0; b <- 0; phi <- 1.6180339887498949
for (i in 1:runs) {
a <- floor(Runif(3500000)*1000)
invisible(gc())
timing <- system.time({
b <- (phi^a - (-phi)^(-a))/sqrt(5)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 3] <- timing
cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing, "\n"))
remove("a", "b", "phi")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; a <- 3000; b <- 0
for (i in 1:runs) {
invisible(gc())
timing <- system.time({
b <- rep(1:a, a); dim(b) <- c(a, a);
b <- 1 / (t(b) + 0:(a-1))
# Rem: this is twice as fast as the following code proposed by R programmers
# a <- 1:a; b <- 1 / outer(a - 1, a, "+")
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 3] <- timing
cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; c <- 0
gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}}
for (i in 1:runs) {
a <- ceiling(Runif(400000)*1000)
b <- ceiling(Runif(400000)*1000)
invisible(gc())
timing <- system.time({
c <- gcd2(a, b) # gcd2 is a recursive function
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 3] <- timing
cat(c("Grand common divisors of 400,000 pairs (recursion)__ (sec): ", timing, "\n"))
remove("a", "b", "c", "gcd2")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
b <- rep(0, 500*500); dim(b) <- c(500, 500)
invisible(gc())
timing <- system.time({
# Rem: there are faster ways to do this
# but here we want to time loops (500*500 'for' loops)!
for (j in 1:500) {
for (k in 1:500) {
b[k,j] <- abs(j - k) + 1
}
}
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 3] <- timing
cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing, "\n"))
remove("b", "j", "k")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0;
x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0
# Calculate the trace of a matrix (sum of its diagonal elements)
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
for (i in 1:runs) {
x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
invisible(gc())
timing <- system.time({
# Calculation of Escoufier's equivalent vectors
p <- ncol(x)
vt <- 1:p # Variables to test
vr <- NULL # Result: ordered variables
RV <- 1:p # Result: correlations
vrt <- NULL
for (j in 1:p) { # loop on the variable number
Rvmax <- 0
for (k in 1:(p-j+1)) { # loop on the variables
x2 <- cbind(x, x[,vr], x[,vt[k]])
R <- cor(x2) # Correlations table
Ryy <- R[1:p, 1:p]
Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
Rxy <- R[(p+1):(p+j), 1:p]
Ryx <- t(Rxy)
rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx)) # RV calculation
if (rvt > Rvmax) {
Rvmax <- rvt # test of RV
vrt <- vt[k] # temporary held variable
}
}
vr[j] <- vrt # Result: variable
RV[j] <- Rvmax # Result: correlation
vt <- vt[vt!=vr[j]] # reidentify variables to test
}
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 3] <- timing
cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing, "\n"))
remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 3] <- sort(times[ , 3])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))
cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
cat(c("Overall mean (sum of I, II and III trimmed means/3)_ (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
remove("cumulate", "timing", "times", "runs", "i")
cat(" --- End of test ---\n\n")
suppressPackageStartupMessages({
require(tidymodels)
})
ames_split <- initial_split(modeldata::ames, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_rec <- recipe(Sale_Price ~ ., data = ames_train) |>
step_mutate(Sale_Price = log1p(Sale_Price)) |>
step_nzv(all_predictors()) |>
step_select(
all_outcomes(),
all_numeric_predictors(),
!starts_with("Year"), !ends_with("tude")
) |>
step_normalize(all_numeric_predictors())
# injection
# https://parsnip.tidymodels.org/reference/model_spec.html
num_trees <- 1200L
ames_spec <- rand_forest(mtry = tune(), trees = !!num_trees) |>
set_mode("regression") |>
set_engine("ranger") |>
set_args(num.threads = !!parallelly::availableCores(omit = 1), seed = 1234)
str(ames_spec)
base_wf <-
workflow() |>
add_recipe(ames_rec) |>
add_model(
rand_forest(trees = !!num_trees) |>
set_engine("ranger") |>
set_mode("regression") |>
set_args(num.threads = !!parallelly::availableCores(omit = 1)),
formula = expm1(Sale_Price) ~ .
) |>
fit(ames_train)
augment(
extract_fit_parsnip(base_wf),
new_data = prep(ames_rec) |> bake(new_data = ames_test)
) |>
dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # back-transform: Sale_Price was log1p-transformed in the recipe
metrics(truth = Sale_Price, estimate = .pred)
ames_grid <- workflow() |>
add_recipe(ames_rec) |>
add_model(ames_spec, formula = Sale_Price ~ .) |>
tune_grid(
resamples = vfold_cv(ames_train, v = 3),
grid = grid_latin_hypercube(
extract_parameter_set_dials(ames_spec) |>
finalize(prep(ames_rec) |> bake(new_data = NULL)),
size = 10
),
control = control_grid(verbose = FALSE),
metrics = metric_set(rmse)
)
select_best(ames_grid)
show_best(ames_grid)
best_wf <-
workflow() |>
add_recipe(ames_rec) |>
add_model(ames_spec, formula = expm1(Sale_Price) ~ .) |>
finalize_workflow(select_best(ames_grid))
best_fit <- last_fit(best_wf, ames_split)
collect_predictions(best_fit)
collect_predictions(best_fit) |>
dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # back-transform: Sale_Price was log1p-transformed in the recipe
metrics(truth = Sale_Price, estimate = .pred)
# model formula
# https://github.com/tidymodels/parsnip/blob/8f13c1c41ce603261f25af64694c253f618fa999/R/model_formula.R
linear_reg(penalty = 1.0) |>
set_engine("glmnet") |>
set_mode("regression")
require(audio.whisper)
model <- whisper("models/ggml-base.bin")
trans <- predict(model, "output.wav", language = "ja")
arrow::write_parquet(trans$tokens, "output.wav.parquet")