tmalsburg/chinese_color_terms.R

## chinese_color_terms.R

library(ggplot2)
library(dplyr)

options(scipen=999)

N <- 100000  # Number of simulations

# Data:

onsets <- tibble(
  onset = c("b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h",
            "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s"),
  count = c(16, 16, 18, 9, 21, 19, 23, 24, 18, 18, 19, 13, 14, 14, 20,
            18, 19, 14, 17, 16, 16),
  prob  = count / sum(count))

# Assuming that color terms have random onsets
# (H0), how likely is it that warm colors (yellow,
# red) share the same onset and cold colors (blue,
# green) share the same onset?

sapply(1:N, function(i) {
  with(onsets,
       sample(onset, 4, replace=TRUE, prob=prob))
}) |>
  t() |>
  data.frame() |>
  dplyr::rename(yellow=X1, red=X2, blue=X3, green=X4) -> sim

# Count cases where “yellow” and “red”, and “blue”
# and “green” share the same onset:

sim |>
  filter(
    yellow == red,
    blue   == green,
    yellow != green) |>
  nrow() -> i

# p-value:
i/N -> p1

# Assuming that “black” and “white” have random
# onsets (H0), how likely is it that they start
# with onsets that are extremely dissimilar with
# respect to place and mode of articulation?

# Assign random onsets to black and white and
# simulate N time:

sapply(1:N, function(i) {
  with(onsets,
       sample(onset, 2, replace=TRUE, prob=prob))
}) |>
  t() |>
  data.frame() |>
  dplyr::rename(black=X1, white=X2) -> sim

# Count cases with extremely dissimilar onsets:
sim |>
  filter(
    case_when(
      black %in% c("p", "b") & white %in% c("h")           ~ TRUE,
      black %in% c("m")      & white %in% c("g", "k", "h") ~ TRUE,
      black %in% c("g", "k") & white %in% c("m", "f")      ~ TRUE,
      black %in% c("f")      & white %in% c("g", "k")      ~ TRUE,
      white %in% c("p", "b") & black %in% c("h")           ~ TRUE,
      white %in% c("m")      & black %in% c("g", "k", "h") ~ TRUE,
      white %in% c("g", "k") & black %in% c("m", "f")      ~ TRUE,
      white %in% c("f")      & black %in% c("g", "k")      ~ TRUE,
      .default = FALSE)
  ) |>
  nrow() -> i

# p-value:
i/N  -> p2

p1 * p2 -> p3

message("p1: ", p1)
message("p2: ", p2)
message("p3: ", p3)

	library(ggplot2)
	library(dplyr)

	options(scipen=999)

	N <- 100000 # Number of simulations

	# Data:

	onsets <- tibble(
	onset = c("b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h",
	"j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s"),
	count = c(16, 16, 18, 9, 21, 19, 23, 24, 18, 18, 19, 13, 14, 14, 20,
	18, 19, 14, 17, 16, 16),
	prob = count / sum(count))

	# Assuming that color terms have random onsets
	# (H0), how likely is it that warm colors (yellow,
	# red) share the same onset and cold colors (blue,
	# green) share the same onset?

	sapply(1:N, function(i) {
	with(onsets,
	sample(onset, 4, replace=TRUE, prob=prob))
	}) \|>
	t() \|>
	data.frame() \|>
	dplyr::rename(yellow=X1, red=X2, blue=X3, green=X4) -> sim

	# Count cases where “yellow” and “red”, and “blue”
	# and “green” share the same onset:

	sim \|>
	filter(
	yellow == red,
	blue == green,
	yellow != green) \|>
	nrow() -> i

	# p-value:
	i/N -> p1

	# Assuming that “black” and “white” have random
	# onsets (H0), how likely is it that they start
	# with onsets that are extremely dissimilar with
	# respect to place and mode of articulation?

	# Assign random onsets to black and white and
	# simulate N time:

	sapply(1:N, function(i) {
	with(onsets,
	sample(onset, 2, replace=TRUE, prob=prob))
	}) \|>
	t() \|>
	data.frame() \|>
	dplyr::rename(black=X1, white=X2) -> sim

	# Count cases with extremely dissimilar onsets:
	sim \|>
	filter(
	case_when(
	black %in% c("p", "b") & white %in% c("h") ~ TRUE,
	black %in% c("m") & white %in% c("g", "k", "h") ~ TRUE,
	black %in% c("g", "k") & white %in% c("m", "f") ~ TRUE,
	black %in% c("f") & white %in% c("g", "k") ~ TRUE,
	white %in% c("p", "b") & black %in% c("h") ~ TRUE,
	white %in% c("m") & black %in% c("g", "k", "h") ~ TRUE,
	white %in% c("g", "k") & black %in% c("m", "f") ~ TRUE,
	white %in% c("f") & black %in% c("g", "k") ~ TRUE,
	.default = FALSE)
	) \|>
	nrow() -> i

	# p-value:
	i/N -> p2

	p1 * p2 -> p3

	message("p1: ", p1)
	message("p2: ", p2)
	message("p3: ", p3)