alexpghayes/yehzee2.R

## yehzee2.R
library(tidyverse)
library(tidygraph)
library(netmediate)

data <- read.csv("final_regime.IGO.sanction.csv") |>
  # remove some columns of missing data to avoid downstream issues & unnnecessary columns
  select(-1, -AMCOW, -MOPAN, - year, -version)

node_data <- data |>
  select(country, ccode, polity2, democracy, sanction) |>
  as_tibble()

incidence <- data |>
  select(-country, -ccode, -stateabb, -polity2, -democracy, -durable, -sanction,
         -GDPpc, -milex, -milper, -irst, -pec, -tpop, -upop, -cinc) |>
  as.matrix()


rownames(incidence) <- data$country

# setting up the data

graph <- incidence |>
  igraph::graph_from_incidence_matrix() |>
  as_tbl_graph() |>
  activate(nodes) |> #select nodes
  left_join(node_data, by = c("name" = "country")) |> #merges the node_data tibble with the nodes of the graph, using the "name" column in the nodes and the "country" column in the node_data tibble as the matching columns. This adds the additional node data to the graph, such as the country code, state abbreviation, polity score, democracy score, etc.
  activate(edges) |>
  mutate(
    weight = 1
  )

# getting and plotting estimates

# unclear to me if trt should be as.factor(polity2) or polity2
# also unclear to me how sanction should be coded

curve <- graph |>
  activate(nodes) |>
  sensitivity_curve(
    sanction ~ as.factor(polity2),
    max_rank = 10,
    ranks_to_consider = 5,
    coembedding = "U"
  )

p1<- plot(curve) +
  theme_classic()


# checking the quality of the embeddings

library(vsp)

fa <- incidence |>
  as("CsparseMatrix") |>
  vsp(rank = 10)  #creates a factor analysis (FA) model from the incidence matrix data using the vsp package in R. The rank parameter specifies the number of factors to use in the model. The FA model attempts to find a set of underlying factors that explain the correlation between the variables in the incidence matrix.

plot_ipr_pairs(fa) #creates a plot of the eigenvalues of the FA model to help assess the quality of the embeddings. The eigenvalues indicate the amount of variance explained by each factor. A good embedding should have a few factors with high eigenvalues and many factors with low eigenvalues.

##The left singular vectors and right singular vectors are related to each other in that they represent different perspectives of the same data. Specifically, the right singular vectors represent the data in the original space, while the left singular vectors represent the data in the transformed space. In summary, the left and right singular vectors are a fundamental concept in the SVD (Singluar Vector Decomposition), and are used to represent the directions in which the matrix varies the most.

fa |>
  get_varimax_y() |>
  dplyr::select(-id) |>
  GGally::ggpairs(ggplot2::aes(alpha = 0.001)) +
  ggplot2::theme_minimal()
#This code block creates a scatterplot matrix of the embeddings generated by the FA model using the ggpairs function from the GGally package in R. The get_varimax_y() function applies a varimax rotation to the embeddings to help improve their interpretability. The dplyr::select(-id) function removes the ID column from the embeddings, which is not needed for visualization. The ggplot2::aes(alpha = 0.001) parameter sets the transparency of the points in the scatterplot matrix to 0.001.


fa |>
  get_varimax_z() |>
  dplyr::select(-id) |>
  GGally::ggpairs(ggplot2::aes(alpha = 0.001)) +
  ggplot2::theme_minimal()
#This code block creates another scatterplot matrix of the embeddings, but this time using the rotated embeddings generated by the varimax rotation. This can help improve the interpretability of the embeddings by aligning the axes with the underlying factors.

# embeddings seem possibly okay but certainly not great

##A good embedding plot should show clear separation or clustering of the data points based on their underlying structure or relationships. The quality of the embedding can be evaluated by examining the distribution and overlap of the points in the plot. In this case, the code suggests that the quality of the embedding is not great, as the plots show little clear separation or clustering of the data points.


############# updated data #######################################
node_data <- data |>
  # select variables that will be attached to 'country' nodes
  select(country, ccode, stateabb, polity2, democracy, durable, sanction, GDPpc, milex, milper,
         irst, pec, tpop, upop, cinc)

incidence <- data |>
  select(-country, -ccode, -stateabb, -polity2, -democracy, -durable, -sanction, -GDPpc, -milex, -milper,
         -irst, -pec, -tpop, -upop, -cinc) |>
  as.matrix()

rownames(incidence) <- data$country

# setting up the data

graph <- incidence |>
  igraph::graph_from_incidence_matrix() |>
  as_tbl_graph() |>
  activate(nodes) |> #select nodes
  left_join(node_data, by = c("name" = "country")) |> #merges the node_data tibble with the nodes of the graph, using the "name" column in the nodes and the "country" column in the node_data tibble as the matching columns. This adds the additional node data to the graph, such as the country code, state abbreviation, polity score, democracy score, etc.
  activate(edges) |>
  mutate(
    weight = 1
  ) |>
  activate(nodes)

# getting and plotting estimates

# unclear to me if trt should be as.factor(polity2) or polity2
# also unclear to me how sanction should be coded

curve <- graph |>
  activate(nodes) |>
  sensitivity_curve(
    sanction ~ polity2 + durable + GDPpc + milex + milper + irst + pec + tpop,
    max_rank = 10,
    ranks_to_consider = 5,
    coembedding = "U"
  )

curve |>
  filter(term == "polity2") |>
  plot() +
  theme_classic()

# US(A, 10) comes dimension ten (i.e. d = 10) Xhat

# check if latent position in network is associated with treatment
fit <- nodelm(US(A, 10) ~ polity2, graph)
summary(fit)
anova(fit)


# check if outcome is associated with treatment + latent position in network
fit <- nodelm(sanction ~ US(A, 20) + polity2, graph)
summary(fit)
	library(tidyverse)
	library(tidygraph)
	library(netmediate)

	data <- read.csv("final_regime.IGO.sanction.csv") \|>
	# remove some columns of missing data to avoid downstream issues & unnnecessary columns
	select(-1, -AMCOW, -MOPAN, - year, -version)

	node_data <- data \|>
	select(country, ccode, polity2, democracy, sanction) \|>
	as_tibble()

	incidence <- data \|>
	select(-country, -ccode, -stateabb, -polity2, -democracy, -durable, -sanction,
	-GDPpc, -milex, -milper, -irst, -pec, -tpop, -upop, -cinc) \|>
	as.matrix()


	rownames(incidence) <- data$country

	# setting up the data

	graph <- incidence \|>
	igraph::graph_from_incidence_matrix() \|>
	as_tbl_graph() \|>
	activate(nodes) \|> #select nodes
	left_join(node_data, by = c("name" = "country")) \|> #merges the node_data tibble with the nodes of the graph, using the "name" column in the nodes and the "country" column in the node_data tibble as the matching columns. This adds the additional node data to the graph, such as the country code, state abbreviation, polity score, democracy score, etc.
	activate(edges) \|>
	mutate(
	weight = 1
	)

	# getting and plotting estimates

	# unclear to me if trt should be as.factor(polity2) or polity2
	# also unclear to me how sanction should be coded

	curve <- graph \|>
	activate(nodes) \|>
	sensitivity_curve(
	sanction ~ as.factor(polity2),
	max_rank = 10,
	ranks_to_consider = 5,
	coembedding = "U"
	)

	p1<- plot(curve) +
	theme_classic()


	# checking the quality of the embeddings

	library(vsp)

	fa <- incidence \|>
	as("CsparseMatrix") \|>
	vsp(rank = 10) #creates a factor analysis (FA) model from the incidence matrix data using the vsp package in R. The rank parameter specifies the number of factors to use in the model. The FA model attempts to find a set of underlying factors that explain the correlation between the variables in the incidence matrix.

	plot_ipr_pairs(fa) #creates a plot of the eigenvalues of the FA model to help assess the quality of the embeddings. The eigenvalues indicate the amount of variance explained by each factor. A good embedding should have a few factors with high eigenvalues and many factors with low eigenvalues.

	##The left singular vectors and right singular vectors are related to each other in that they represent different perspectives of the same data. Specifically, the right singular vectors represent the data in the original space, while the left singular vectors represent the data in the transformed space. In summary, the left and right singular vectors are a fundamental concept in the SVD (Singluar Vector Decomposition), and are used to represent the directions in which the matrix varies the most.

	fa \|>
	get_varimax_y() \|>
	dplyr::select(-id) \|>
	GGally::ggpairs(ggplot2::aes(alpha = 0.001)) +
	ggplot2::theme_minimal()
	#This code block creates a scatterplot matrix of the embeddings generated by the FA model using the ggpairs function from the GGally package in R. The get_varimax_y() function applies a varimax rotation to the embeddings to help improve their interpretability. The dplyr::select(-id) function removes the ID column from the embeddings, which is not needed for visualization. The ggplot2::aes(alpha = 0.001) parameter sets the transparency of the points in the scatterplot matrix to 0.001.


	fa \|>
	get_varimax_z() \|>
	dplyr::select(-id) \|>
	GGally::ggpairs(ggplot2::aes(alpha = 0.001)) +
	ggplot2::theme_minimal()
	#This code block creates another scatterplot matrix of the embeddings, but this time using the rotated embeddings generated by the varimax rotation. This can help improve the interpretability of the embeddings by aligning the axes with the underlying factors.

	# embeddings seem possibly okay but certainly not great

	##A good embedding plot should show clear separation or clustering of the data points based on their underlying structure or relationships. The quality of the embedding can be evaluated by examining the distribution and overlap of the points in the plot. In this case, the code suggests that the quality of the embedding is not great, as the plots show little clear separation or clustering of the data points.




	############# updated data #######################################
	node_data <- data \|>
	# select variables that will be attached to 'country' nodes
	select(country, ccode, stateabb, polity2, democracy, durable, sanction, GDPpc, milex, milper,
	irst, pec, tpop, upop, cinc)

	incidence <- data \|>
	select(-country, -ccode, -stateabb, -polity2, -democracy, -durable, -sanction, -GDPpc, -milex, -milper,
	-irst, -pec, -tpop, -upop, -cinc) \|>
	as.matrix()

	rownames(incidence) <- data$country

	# setting up the data

	graph <- incidence \|>
	igraph::graph_from_incidence_matrix() \|>
	as_tbl_graph() \|>
	activate(nodes) \|> #select nodes
	left_join(node_data, by = c("name" = "country")) \|> #merges the node_data tibble with the nodes of the graph, using the "name" column in the nodes and the "country" column in the node_data tibble as the matching columns. This adds the additional node data to the graph, such as the country code, state abbreviation, polity score, democracy score, etc.
	activate(edges) \|>
	mutate(
	weight = 1
	) \|>
	activate(nodes)

	# getting and plotting estimates

	# unclear to me if trt should be as.factor(polity2) or polity2
	# also unclear to me how sanction should be coded

	curve <- graph \|>
	activate(nodes) \|>
	sensitivity_curve(
	sanction ~ polity2 + durable + GDPpc + milex + milper + irst + pec + tpop,
	max_rank = 10,
	ranks_to_consider = 5,
	coembedding = "U"
	)

	curve \|>
	filter(term == "polity2") \|>
	plot() +
	theme_classic()

	# US(A, 10) comes dimension ten (i.e. d = 10) Xhat

	# check if latent position in network is associated with treatment
	fit <- nodelm(US(A, 10) ~ polity2, graph)
	summary(fit)
	anova(fit)


	# check if outcome is associated with treatment + latent position in network
	fit <- nodelm(sanction ~ US(A, 20) + polity2, graph)
	summary(fit)