@paithiov909
Last active February 19, 2024
Scripts
akita <- jisx0402::municipality |>
dplyr::filter(
pref_code == "05",
is.na(end_date),
!stringr::str_ends(name, "郡")
) |>
dplyr::mutate(
muni_code = paste0(pref_code, city_code),
muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
start_year = lubridate::year(start_date)
) |>
dplyr::select(muni_code, name, start_year) |>
dplyr::arrange(desc(start_year))
year <- max(akita$start_year, na.rm = TRUE)
dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
dplyr::filter(.data$year >= .env$year) |>
dplyr::right_join(akita, by = c("code" = "muni_code"))
require(ggplot2)
dat |>
dplyr::filter(name == "秋田市", age == "all") |>
ggplot(aes(year, population)) +
geom_line(aes(colour = sex)) +
scale_y_log10()
ga <- jisx0402::jptopography("designated") |>
dplyr::filter(stringr::str_starts(muni_code, "05")) |>
(\(d) {
  dplyr::mutate(d,
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    area = units::set_units(sf::st_area(d), km^2)
  )
})() |>
dplyr::right_join(
dplyr::filter(dat, age == "all"),
by = c("muni_code" = "code")
) |>
dplyr::group_by(sex) |>
dplyr::mutate(
denst = round(population / area),
rank = dplyr::percent_rank(denst) * 10
) |>
dplyr::ungroup() |>
ggplot() +
geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
facet_wrap(~ sex) +
labs(
title = "Rank of Population Density in Akita, throughout 2006-2023",
subtitle = "Year: {frame_time}",
caption = paste(
"地図データ:「国土数値情報 行政区域データ」(国土交通省)",
"https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
sep = "\n"
)
) +
theme_light() +
gganimate::transition_time(year)
gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("akita-denst1.mp4", path = "Videos")
tokyo <- jisx0402::municipality |>
dplyr::filter(
pref_code == "13",
is.na(end_date),
!stringr::str_ends(name, "郡|(特別区)|(支庁)")
) |>
dplyr::mutate(
muni_code = paste0(pref_code, city_code),
muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
start_year = lubridate::year(start_date)
) |>
dplyr::select(muni_code, name, start_year) |>
dplyr::arrange(desc(start_year))
year <- max(tokyo$start_year, na.rm = TRUE)
dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
dplyr::filter(.data$year >= .env$year) |>
dplyr::right_join(tokyo, by = c("code" = "muni_code"))
require(ggplot2)
dat |>
dplyr::filter(age == "all", stringr::str_detect(name, "区"),
year > 2005L) |>
ggplot(aes(year, population)) +
geom_line(aes(colour = sex)) +
facet_wrap(~ name) +
scale_y_log10()
ga <- jisx0402::jptopography("all", resolution = 1) |>
dplyr::filter(stringr::str_starts(muni_code, "13")) |>
(\(d) {
  dplyr::mutate(d,
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    area = units::set_units(sf::st_area(d), km^2)
  )
})() |>
dplyr::right_join(
dplyr::filter(dat, age == "all", stringr::str_detect(name, "区")),
by = c("muni_code" = "code")
) |>
dplyr::group_by(sex) |>
dplyr::mutate(
denst = round(population / area),
rank = dplyr::percent_rank(denst) * 10
) |>
dplyr::ungroup() |>
ggplot() +
geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
facet_wrap(~ sex) +
labs(
title = "Rank of Population Density in Tokyo Special Wards, throughout 2006-2023",
subtitle = "Year: {frame_time}",
caption = paste(
"地図データ:「国土数値情報 行政区域データ」(国土交通省)",
"https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
sep = "\n"
)
) +
theme_light() +
gganimate::transition_time(year, range = c(2006L, 2023L))
gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("tokyo23-denst1.mp4", path = "Videos")
#' @export
accelerate <- function(text = "精神を加速させろ") {
r"(
            ∩_∩
           / \ /\
          |(゚)=(゚)|
          |  ●_●  |
         /      ヽ
       r⌒| 〃 ------ ヾ |
      /  i/ |_二__ノ
     ./  /  /      )     {text}
     ./ /  /      //
    /   ./     / ̄
    .ヽ、__./     / ⌒ヽ
        r    /     |
      /          ノ
     /      /    /
    ./    //   /
    /.   ./ ./  /
   i   / ./ /
   i  ./ .ノ.^/
   i  ./  |_/
   i /
  / /
  (_/
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
#' @export
qb <- function(text = "わけがわからないよ") {
r"(
        |\           /|
        |\\       //|
       :  ,> `´ ̄`´ <  ′
.       V            V
.       i< ●      ● >i
       八    、_,_,     八    {text}
.       / 个 . _  _ . 个 ',
   _/   il   ,'    '.  li  ',__
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
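# Usage sketch: both helpers return glue strings, so print them with cat()
# to render the ASCII art with its line breaks.
cat(accelerate())
cat(qb())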
#!/usr/bin/env Rscript
ID <- commandArgs(trailingOnly = TRUE)[1]
TYPE <- commandArgs(trailingOnly = TRUE)[2]
aznyan::get_lyrics_list(ID, if (!is.na(TYPE)) TYPE else NA) |>
aznyan::get_lyrics(paste0(ID, ".csv"))
#' Environment for internal use
#'
#' @noRd
#' @keywords internal
.env <- rlang::env(server = NULL)
#' @noRd
reset_encoding <- function(chr, encoding = "UTF-8") {
Encoding(chr) <- encoding
return(chr)
}
#' Call lattice command
#'
#' A debugging tool for the tokenization process; outputs a lattice in Graphviz DOT format.
#'
#' @param sentences Character vector.
#' @param ... Other arguments are passed to \code{DiagrammeR::grViz}.
#'
#' @export
get_lattice <- function(sentences, ...) {
  sentences <- stringi::stri_omit_na(sentences)
  dot <- processx::run("kagome", c("lattice", stringi::stri_c(sentences, collapse = " ")))$stdout
  DiagrammeR::grViz(reset_encoding(dot), ...)
}
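# Usage sketch: render the lattice for one sentence. Assumes the `kagome`
# CLI is installed and on PATH; the input string is just an example.
get_lattice("すもももももももものうち")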
#' Launch Kagome Server
#'
#' Starts or kills the Kagome server process.
#' @param dict Dictionary the Kagome server uses. Defaults to 'ipa' (the IPA dictionary).
#' @return Logical value indicating whether the Kagome server is alive, returned invisibly.
#'
#' @export
lanch_server <- function(dict = c("ipa", "uni")) {
  p <- rlang::env_get(.env, "server")
  if (is.null(p)) {
    dict <- rlang::arg_match(dict)
    p <- processx::process$new("kagome", c("server", "-dict", dict))
    rlang::env_bind(.env, server = p)
  } else if (p$is_alive()) {
    kill <- yesno::yesno("Kagome server is already alive. Do you want to kill its process?")
    if (kill) {
      p$kill()
      rlang::env_bind(.env, server = NULL)
    }
  }
  return(invisible(p$is_alive()))
}
#' Send a HEAD request to the Kagome server
#'
#' @param url URL of the Kagome server as a character scalar.
#' @return An httr2 response, returned invisibly.
#'
#' @export
ping <- function(url = "http://localhost:6060") {
resp <-
httr2::request(url) |>
httr2::req_method("HEAD") |>
httr2::req_perform()
return(invisible(resp))
}
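# Usage sketch: start the server, then check that it responds.
# Assumes the `kagome` binary is available; 6060 is Kagome's default port.
lanch_server("ipa")
ping()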
#' Send a PUT request to the tokenize API
#'
#' @param sentences Character vector to be analyzed.
#' @param url URL of Kagome server.
#' @param mode One of `normal`, `search` or `extended`.
#' @return A tibble.
#'
#' @export
tokenize <- function(sentences,
url = "http://localhost:6060/tokenize",
mode = c("normal", "search", "extended")) {
mode <- rlang::arg_match(mode)
sentences <-
stringi::stri_omit_na(sentences) |>
stringi::stri_split_boundaries(type = "sentence") |>
purrr::flatten_chr()
resps <-
furrr::future_imap_dfr(sentences, ~ tokenize_impl(.x, .y, url, mode)) |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "*"))) |>
dplyr::mutate_at(c("doc_id", "class"), as.factor) |>
tibble::as_tibble()
return(resps)
}
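# Usage sketch: tokenize two short sentences against the running server.
# Assumes the server started via lanch_server() is listening on port 6060.
tokenize(c("こんにちは。", "猫を飼っています。"))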
#' @noRd
tokenize_impl <- function(msg, idx, url = "http://localhost:6060/tokenize", mode = "normal") {
resp <-
httr2::request(url) |>
httr2::req_body_json(list(
sentence = msg,
mode = mode
)) |>
httr2::req_method("PUT") |>
httr2::req_error(function(resp) httr2::resp_status(resp) > 400) |>
httr2::req_perform()
return(
purrr::map_dfr(
httr2::resp_body_json(resp)$tokens,
~ data.frame(
doc_id = idx,
id = .$id,
start = .$start,
end = .$end,
class = .$class,
token = .$surface,
POS1 = .$pos[[1]],
POS2 = .$pos[[2]],
POS3 = .$pos[[3]],
POS4 = .$pos[[4]],
Original = .$base_form,
Yomi1 = .$reading,
Yomi2 = .$pronunciation
)
)
)
}
.onUnload <- function(libpath) {
p <- rlang::env_get(.env, "server")
if (!is.null(p) && p$is_alive()) {
p$kill()
}
}
texts <- ldccr::read_aozora(
"https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
directory = tempdir()
)
sent <- readr::read_lines(texts) |>
gibasa::tokenize(split = TRUE) |>
gibasa::prettify() |>
dplyr::filter(!POS1 %in% c("記号")) |>
dplyr::group_by(doc_id, sentence_id) |>
dplyr::reframe(
text = paste0(token, collapse = " ")
) |>
dplyr::mutate(
  sentence_id = paste(doc_id, sentence_id, sep = "_")
)
### lexRankr ----
tok <- sent |>
tidytext::unnest_tokens(
token,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
)
simil_df <- lexRankr::sentenceSimil(
sentenceId = tok$sentence_id,
token = tok$token,
docId = tok$doc_id
)
top_n_sent <-
lexRankr::lexRankFromSimil(
simil_df$sent1,
simil_df$sent2,
simil = simil_df$similVal,
n = 10,
usePageRank = TRUE,
continuous = TRUE,
# threshold = 0.1,
returnTies = FALSE
)
top_n_sent |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
### replace sentenceSimil ----
mat <-
sent |>
tidytext::unnest_tokens(
term,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
) |>
dplyr::count(doc_id, term) |>
tidytext::bind_tf_idf(term, doc_id, n) |>
dplyr::mutate(tf_idf = n * (idf + 1)) |> # overwrite tf_idf: raw term count times smoothed idf
dplyr::inner_join(
sent |>
tidytext::unnest_tokens(
term,
text,
token = \(x) strsplit(x, " ", fixed = TRUE)
) |>
dplyr::select(doc_id, sentence_id, term),
by = c("doc_id" = "doc_id", "term" = "term")
) |>
tidytext::cast_sparse(sentence_id, term, tf_idf)
dt <- proxyC::simil(
mat,
method = "cosine",
# min_simil = 0.1,
# rank = 50,
use_nan = FALSE
)
dt |>
tidytext:::tidy.dfm() |>
dplyr::rename(
s1 = document,
s2 = term,
weight = count
) |>
(\(simil_df) {
lexRankr::lexRankFromSimil(
simil_df$s1,
simil_df$s2,
simil = simil_df$weight,
n = nrow(simil_df),
usePageRank = TRUE,
continuous = TRUE,
# threshold = 0.1,
returnTies = TRUE
)
})() |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
### replace lexRankFromSimil ----
dt |>
tidytext:::tidy.dfm() |>
dplyr::rename(
s1 = document,
s2 = term,
weight = count
) |>
(\(simil_df) {
pr <- simil_df |>
igraph::graph_from_data_frame(
directed = FALSE
) |>
igraph::page_rank(
directed = FALSE,
damping = .85
) |>
purrr::pluck("vector")
tibble::tibble(
sentenceId = names(pr),
value = unname(pr)
)
})() |>
dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
dplyr::arrange(desc(value)) |>
dplyr::as_tibble()
require(sudachir)
require(apportita)
strj_split_boundaries <- \(x) {
stringi::stri_split_boundaries(
x,
opts_brkiter = stringi::stri_opts_brkiter(
locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
)
)
}
split_kugire <- \(x) {
strj_split_boundaries(x) |>
purrr::map(\(elem) {
len <- length(elem)
if (len < 2) {
return(NA_character_)
} else {
sapply(seq_len(len - 1), \(i) {
s1 <- paste0(elem[1:i], collapse = "")
s2 <- paste0(elem[(i + 1):len], collapse = "")
paste(s1, s2, sep = "\t")
})
}
}) |>
unlist()
}
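# Worked example (input string is arbitrary): split_kugire() returns every
# way of cutting a string into a left part and a right part at an ICU phrase
# boundary, the two parts joined by a tab; single-phrase inputs yield NA.
split_kugire("祇園精舎の鐘の声諸行無常の響きあり")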
sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
unlist(sudachir::form(x, type = "normalized", pos = FALSE,
instance = sudachi))
}
wrd <- \(conn, s1, s2) {
purrr::map2_dbl(s1, s2, \(el1, el2) {
el1 <- form(el1)
el2 <- form(el2)
apportita::calc_wrd(conn, el1, el2)
})
}
conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)
### tanka72 ----
dat <-
readxl::read_xlsx("data/tanka/tanka72.xlsx") |>
dplyr::transmute(
id = factor(id),
body = audubon::strj_normalize(body) |>
stringr::str_remove_all("[^[:alnum:]]+"),
author = factor(author)
)
ret <- dat |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
author = author,
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
ret <- ret |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(ret, "tanka72-wrd.parquet")
### tweets ----
tweets <-
readr::read_csv(
"data/shinabitanori-230622.csv.gz",
col_names = c("id", "time", "body"),
col_types = "ccc"
) |>
dplyr::filter(
!stringr::str_detect(body, "@|(RT)"),
stringr::str_length(body) > 14,
stringr::str_length(body) < 30
) |>
dplyr::slice_sample(n = 100) |>
dplyr::transmute(
id = factor(id),
body = audubon::strj_normalize(body) |>
ldccr::clean_url() |>
stringr::str_remove_all("[^[:alnum:]]+")
) |>
dplyr::filter(
!stringi::stri_isempty(body)
)
tweets <- tweets |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
tweets <- tweets |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(tweets, "shinabitanori-wrd.parquet")
close(conn)
rm(conn)
require(sudachir)
require(apportita)
strj_split_boundaries <- \(x) {
stringi::stri_split_boundaries(
x,
opts_brkiter = stringi::stri_opts_brkiter(
locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
)
)
}
split_kugire <- \(x) {
strj_split_boundaries(x) |>
purrr::map(\(elem) {
len <- length(elem)
if (len < 2) {
return(NA_character_)
} else {
sapply(seq_len(len - 1), \(i) {
s1 <- paste0(elem[1:i], collapse = "")
s2 <- paste0(elem[(i + 1):len], collapse = "")
paste(s1, s2, sep = "\t")
})
}
}) |>
unlist()
}
sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
unlist(sudachir::form(x, type = "normalized", pos = FALSE,
instance = sudachi))
}
wrd <- \(conn, s1, s2) {
purrr::map2_dbl(s1, s2, \(el1, el2) {
el1 <- form(el1)
el2 <- form(el2)
tryCatch(
apportita::calc_wrd(conn, el1, el2),
error = \(e) {
NA
}
)
}, .progress = TRUE)
}
conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)
### tanka ----
dat <-
readxl::read_excel(
"data/tanka/1001-1500_poems.xlsx",
col_names = c("id", "poems", "names", "loves", "likes", "keys1", "keys2")
) |>
# dplyr::slice_sample(n = 20) |>
dplyr::transmute(
id = id,
names = stringr::str_remove_all(names, "<U\\+[0-9A-F]+>") |>
audubon::strj_normalize() |>
stringr::str_remove_all("[^[:alnum:]]+"),
body = stringr::str_remove_all(poems, "<U\\+[0-9A-F]+>") |>
audubon::strj_normalize() |>
stringr::str_remove_all("[^[:alnum:]]+")
) |>
dplyr::filter(
!stringi::stri_isempty(body)
)
dat <- dat |>
dplyr::reframe(
id = id,
phrase = split_kugire(body),
.by = id
) |>
tidyr::separate_wider_delim(
phrase, delim = "\t", names = c("s1", "s2")
)
arrow::write_parquet(dat, "tanka-kugire.parquet")
print("区切り処理終わり")
dat <- dat |>
tidyr::drop_na() |>
dplyr::mutate(
wrd = wrd(conn, s1, s2)
)
arrow::write_parquet(dat, "tanka-wrd.parquet")
close(conn)
rm(conn)
clean_url <- \(text) {
pat <- "(https?|ftp)://([A-Za-z0-9-]+\\.)+[A-Za-z0-9-]+(/[A-Za-z0-9./?%&=~-]*)?"
stringr::str_replace_all(text, pattern = pat, replacement = "URL")
}
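# Usage sketch: mask every URL in free text with the literal "URL" before
# normalization/tokenization (the input string is just an example).
clean_url("参考: https://example.com/foo?bar=1 を見てください")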
to_df <- \(jsonlist) {
purrr::map(jsonlist, \(elem) {
text <- elem$rendered_body |>
rvest::read_html() |>
rvest::html_elements("p") |>
rvest::html_text() |>
paste(collapse = "\n")
tags <- elem$tags |>
purrr::map_chr(~ .x$name) |>
paste(collapse = ",")
tibble::tibble(
"doc_id" = elem$id,
"created" = elem$created_at,
"updated" = elem$updated_at,
"author" = elem$user$permanent_id,
"title" = audubon::strj_normalize(elem$title),
"text" = audubon::strj_normalize(text),
"tags" = tags,
"comments" = elem$comments_count,
"likes" = elem$likes_count,
"reactions" = elem$reactions_count,
"stocks" = elem$stocks_count)
}) |>
purrr::list_rbind() |>
dplyr::filter(
cld3::detect_language(text) == "ja"
) |>
dplyr::mutate(
created = lubridate::as_date(created),
updated = lubridate::as_date(updated)
)
}
PAGES <- ceiling(4642 / 100)
df <- seq_len(PAGES) |>
purrr::map(\(i) {
li <- qiitr::qiita_get_items(
tag_id = "r",
per_page = 100, # range: 1-100
page_offset = (i - 1), # 1 + page_offset < 100
page_limit = 1
)
if (i %% 16 == 0) {
rlang::inform(sprintf("Sleeping 16 minutes; fetched %d pages so far.", i))
Sys.sleep(60 * 16)
}
return(to_df(li))
}, .progress = TRUE) |>
purrr::list_rbind()
arrow::write_parquet(df, here::here("data/qiita.parquet"))
tbl <- readxl::read_xlsx("Downloads/koe.xlsx") |>
tibble::rowid_to_column(var = "doc_id") |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "NA"))) |>
tidyr::drop_na()
tbl |>
dplyr::select(Region, Sex, Age, Satis) |>
DataExplorer::plot_bar()
tbl <- tbl |>
dplyr::mutate(
doc_id = factor(doc_id),
Region = factor(Region),
Sex = factor(Sex),
Age = stringi::stri_trans_nfkc(Age),
Age = dplyr::case_match(
Age,
c("10代", "20代") ~ "20",
"30代" ~ "30",
"40代" ~ "40",
"50代" ~ "50",
c("60代", "70代") ~ "60"
),
Age = factor(Age),
Satis = stringr::str_detect(Satis, "満足"),
Opinion = audubon::strj_normalize(Opinion) |>
stringr::str_replace_all("[[:number:]]+", "N")
)
summary(tbl)
df <- tbl |>
dplyr::select(!Region) |>
gibasa::tokenize(Opinion) |>
gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
gibasa::mute_tokens(
(!POS1 %in% c("名詞", "形容詞")) |
(token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
) |>
gibasa::pack()
corp <- df |>
dplyr::right_join(
tbl |>
dplyr::mutate(Attr = paste(Age, Sex, sep = "_"), .keep = "unused"),
by = "doc_id"
) |>
quanteda::corpus()
require(ca)
cam <- corp |>
quanteda::tokens(what = "fastestword") |>
quanteda::dfm() |>
quanteda::dfm_group(groups = quanteda::docvars(corp, "Attr")) |>
quanteda::dfm_trim(min_termfreq = 10, max_termfreq = 50) |>
quanteda.textmodels::textmodel_ca()
plot(cam)
require(ggplot2)
dplyr::bind_rows(
as.data.frame(cam$rowcoord),
as.data.frame(cam$colcoord)
) |>
tibble::rownames_to_column() |>
dplyr::mutate(shape = stringr::str_detect(rowname, "_")) |>
ggplot(aes(Dim1, Dim2, shape = shape, colour = shape)) +
geom_vline(xintercept = 0, colour = "grey", linetype = "dashed") +
geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
geom_point() +
ggrepel::geom_text_repel(aes(label = rowname), max.overlaps = 20) +
theme_light() +
theme(legend.position = "none")
df <- tbl |>
dplyr::select(!Region) |>
gibasa::tokenize(Opinion) |>
gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
gibasa::mute_tokens(
(!POS1 %in% c("名詞", "形容詞")) |
(token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
) |>
dtplyr::lazy_dt() |>
dplyr::group_by(doc_id) |>
dplyr::mutate(term1 = token, term2 = dplyr::lead(term1)) |>
dplyr::add_count(term1, term2, name = "cooc") |>
dplyr::ungroup() |>
dplyr::select(doc_id, token, term1, term2, cooc)
biterms <- df |>
dplyr::select(doc_id, term1, term2, cooc) |>
tidyr::drop_na() |>
dplyr::as_tibble()
model <- df |>
dplyr::select(doc_id, token) |>
dplyr::as_tibble() |>
BTM::BTM(k = 5, background = TRUE, biterms = biterms)
plot(model, top_n = 20)
# R Benchmark 2.5 (06/2008) [Simon Urbanek]
# version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro
# R Benchmark 2.4 (06/2008) [Simon Urbanek]
# version 2.4 adapted to more recent Matrix package
# R Benchmark 2.3 (21 April 2004)
# Warning: changes are not carefully checked yet!
# version 2.3 adapted to R 1.9.0
# Many thanks to Douglas Bates (bates@stat.wisc.edu) for improvements!
# version 2.2 adapted to R 1.8.0
# version 2.1 adapted to R 1.7.0
# version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2
# using the standard ATLAS library (Rblas.dll)
# on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro
# revised and optimized for R v. 1.5.x, 8 June 2002
# Requires additional libraries: Matrix, SuppDists
# Author : Philippe Grosjean
# eMail : phgrosjean@sciviews.org
# Web : http://www.sciviews.org
# License: GPL 2 or above at your convenience (see: http://www.gnu.org)
#
# Several tests are adapted from the Splus Benchmark Test V. 2
# by Stephan Steinhaus (stst@informatik.uni-frankfurt.de)
# Reference for Escoufier's equivalents vectors (test III.5):
# Escoufier Y., 1970. Echantillonnage dans une population de variables
# aléatoires réelles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47.
#
# type source("c:/<dir>/R2.R") to start the test
runs <- 3 # Number of times the tests are executed
times <- rep(0, 15); dim(times) <- c(5,3)
require(Matrix) # Optimized matrix operations
#require(SuppDists) # Optimized random number generators
#Runif <- rMWC1019 # The fast uniform number generator
Runif <- runif
# If you don't have SuppDists, you can use: Runif <- runif
#a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator
#Rnorm <- rziggurat # The fast normal number generator
# If you don't have SuppDists, you can use: Rnorm <- rnorm
#b <- rziggurat(10, new.start=TRUE) # Init. the generator
Rnorm <- rnorm
#remove("a", "b")
options(object.size=100000000)
cat("\n\n R Benchmark 2.5\n")
cat(" ===============\n")
cat(c("Number of times each test is run__________________________: ", runs))
cat("\n\n")
cat(" I. Matrix calculation\n")
cat(" ---------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (1)
cumulate <- 0; a <- 0; b <- 0
for (i in 1:runs) {
invisible(gc())
timing <- system.time({
a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500);
b <- t(a);
dim(b) <- c(1250, 5000);
a <- t(b)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 1] <- timing
cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500));
invisible(gc())
timing <- system.time({
b <- a^1000
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 1] <- timing
cat(c("2400x2400 normal distributed random matrix ^1000____ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(7000000)
invisible(gc())
timing <- system.time({
b <- sort(a, method="quick") # Sort is modified in v. 1.5.x
# And there is now a quick method that better competes with other packages!!!
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 1] <- timing
cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800)
invisible(gc())
timing <- system.time({
b <- crossprod(a) # equivalent to: b <- t(a) %*% a
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 1] <- timing
cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; c <- 0; qra <-0
for (i in 1:runs) {
a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000,2000)))
b <- as.double(1:2000)
invisible(gc())
timing <- system.time({
c <- solve(crossprod(a), crossprod(a,b))
})[3]
cumulate <- cumulate + timing
# This is the old method
#a <- Rnorm(600*600); dim(a) <- c(600,600)
#b <- 1:600
#invisible(gc())
#timing <- system.time({
# qra <- qr(a, tol = 1e-7);
# c <- qr.coef(qra, b)
# #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients
#})[3]
#cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 1] <- timing
cat(c("Linear regr. over a 3000x3000 matrix (c = a \\ b')___ (sec): ", timing, "\n"))
remove("a", "b", "c", "qra")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 1] <- sort(times[ , 1])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n"))
cat(" II. Matrix functions\n")
cat(" --------------------\n")
if (R.Version()$os == "Win32") flush.console()
# (1)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2400000)
invisible(gc())
timing <- system.time({
b <- fft(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 2] <- timing
cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- array(Rnorm(600*600), dim = c(600, 600))
# Only needed if using eigen.Matrix(): Matrix.class(a)
invisible(gc())
timing <- system.time({
b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value
# Rem: on my machine, it is faster than:
# b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value
# b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value
# b <- eigen.Matrix(a, vectors = F)$Value
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 2] <- timing
cat(c("Eigenvalues of a 640x640 random matrix______________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500)
#Matrix.class(a)
invisible(gc())
timing <- system.time({
#b <- determinant(a, logarithm=F)
# Rem: the following is slower on my computer!
# b <- det.default(a)
b <- det(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 2] <- timing
cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000),
Dim = as.integer(c(3000, 3000))))
invisible(gc())
#a <- Rnorm(900*900); dim(a) <- c(900, 900)
#a <- crossprod(a, a)
timing <- system.time({
b <- chol(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 2] <- timing
cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; b <- 0
for (i in 1:runs) {
a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600)))
invisible(gc())
#a <- Rnorm(400*400); dim(a) <- c(400, 400)
timing <- system.time({
# b <- qr.solve(a)
# Rem: a little faster than
b <- solve(a)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 2] <- timing
cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 2] <- sort(times[ , 2])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n"))
cat(" III. Programmation\n")
cat(" ------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (1)
cumulate <- 0; a <- 0; b <- 0; phi <- 1.6180339887498949
for (i in 1:runs) {
a <- floor(Runif(3500000)*1000)
invisible(gc())
timing <- system.time({
b <- (phi^a - (-phi)^(-a))/sqrt(5)
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 3] <- timing
cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing, "\n"))
remove("a", "b", "phi")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (2)
cumulate <- 0; a <- 3000; b <- 0
for (i in 1:runs) {
invisible(gc())
timing <- system.time({
b <- rep(1:a, a); dim(b) <- c(a, a);
b <- 1 / (t(b) + 0:(a-1))
# Rem: this is twice as fast as the following code proposed by R programmers
# a <- 1:a; b <- 1 / outer(a - 1, a, "+")
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 3] <- timing
cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (3)
cumulate <- 0; c <- 0
gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}}
for (i in 1:runs) {
a <- ceiling(Runif(400000)*1000)
b <- ceiling(Runif(400000)*1000)
invisible(gc())
timing <- system.time({
c <- gcd2(a, b) # gcd2 is a recursive function
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 3] <- timing
cat(c("Grand common divisors of 400,000 pairs (recursion)__ (sec): ", timing, "\n"))
remove("a", "b", "c", "gcd2")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
b <- rep(0, 500*500); dim(b) <- c(500, 500)
invisible(gc())
timing <- system.time({
# Rem: there are faster ways to do this
# but here we want to time loops (500*500 'for' loops)!
for (j in 1:500) {
for (k in 1:500) {
b[k,j] <- abs(j - k) + 1
}
}
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 3] <- timing
cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing, "\n"))
remove("b", "j", "k")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
# (5)
cumulate <- 0; p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0;
x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0
# Calculate the trace of a matrix (sum of its diagonal elements)
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
for (i in 1:runs) {
x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
invisible(gc())
timing <- system.time({
# Calculation of Escoufier's equivalent vectors
p <- ncol(x)
vt <- 1:p # Variables to test
vr <- NULL # Result: ordered variables
RV <- 1:p # Result: correlations
vrt <- NULL
for (j in 1:p) { # loop on the variable number
Rvmax <- 0
for (k in 1:(p-j+1)) { # loop on the variables
x2 <- cbind(x, x[,vr], x[,vt[k]])
R <- cor(x2) # Correlations table
Ryy <- R[1:p, 1:p]
Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
Rxy <- R[(p+1):(p+j), 1:p]
Ryx <- t(Rxy)
rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx)) # RV calculation
if (rvt > Rvmax) {
Rvmax <- rvt # test of RV
vrt <- vt[k] # temporary held variable
}
}
vr[j] <- vrt # Result: variable
RV[j] <- Rvmax # Result: correlation
vt <- vt[vt!=vr[j]] # reidentify variables to test
}
})[3]
cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 3] <- timing
cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing, "\n"))
remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
times[ , 3] <- sort(times[ , 3])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))
cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
cat(c("Overall mean (sum of I, II and III trimmed means/3)_ (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
remove("cumulate", "timing", "times", "runs", "i")
cat(" --- End of test ---\n\n")
suppressPackageStartupMessages({
require(tidymodels)
})
ames_split <- initial_split(modeldata::ames, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_rec <- recipe(Sale_Price ~ ., data = ames_train) |>
step_mutate(Sale_Price = log1p(Sale_Price)) |>
step_nzv(all_predictors()) |>
step_select(
all_outcomes(),
all_numeric_predictors(),
!starts_with("Year"), !ends_with("tude")
) |>
step_normalize(all_numeric_predictors())
# injection
# https://parsnip.tidymodels.org/reference/model_spec.html
num_trees <- 1200L
ames_spec <- rand_forest(mtry = tune(), trees = !!num_trees) |>
set_mode("regression") |>
set_engine("ranger") |>
set_args(num.threads = !!parallelly::availableCores(omit = 1), seed = 1234)
str(ames_spec)
base_wf <-
workflow() |>
add_recipe(ames_rec) |>
add_model(
rand_forest(trees = !!num_trees) |>
set_engine("ranger") |>
set_mode("regression") |>
set_args(num.threads = !!parallelly::availableCores(omit = 1)),
formula = expm1(Sale_Price) ~ .
) |>
fit(ames_train)
augment(
extract_fit_parsnip(base_wf),
new_data = prep(ames_rec) |> bake(new_data = ames_test)
) |>
dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # back-transform: Sale_Price was log1p-transformed in the recipe
metrics(truth = Sale_Price, estimate = .pred)
ames_grid <- workflow() |>
add_recipe(ames_rec) |>
add_model(ames_spec, formula = Sale_Price ~ .) |>
tune_grid(
resamples = vfold_cv(ames_train, v = 3),
grid = grid_latin_hypercube(
extract_parameter_set_dials(ames_spec) |>
finalize(prep(ames_rec) |> bake(new_data = NULL)),
size = 10
),
control = control_grid(verbose = FALSE),
metrics = metric_set(rmse)
)
select_best(ames_grid)
show_best(ames_grid)
best_wf <-
workflow() |>
add_recipe(ames_rec) |>
add_model(ames_spec, formula = expm1(Sale_Price) ~ .) |>
finalize_workflow(select_best(ames_grid))
best_fit <- last_fit(best_wf, ames_split)
collect_predictions(best_fit)
collect_predictions(best_fit) |>
dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # back-transform: Sale_Price was log1p-transformed in the recipe
metrics(truth = Sale_Price, estimate = .pred)
# model formula
# https://github.com/tidymodels/parsnip/blob/8f13c1c41ce603261f25af64694c253f618fa999/R/model_formula.R
linear_reg(penalty = 1.0) |>
set_engine("glmnet") |>
set_mode("regression")
require(audio.whisper)
model <- whisper("models/ggml-base.bin")
trans <- predict(model, "output.wav", language = "ja")
arrow::write_parquet(trans$tokens, "output.wav.parquet")