---
title: "Linguistic differences"
output: html_notebook
---
Read in all the data. In the working directory, I have placed several files from the ASJP database; any subset of files will do.
```{r}
lf=list.files(pattern="\\.csv$")
#res is going to contain all words in the database. Because not all Swadesh lists are equally long, I set it to the maximum length of 100.
res=matrix(0, nrow=100, ncol=length(lf))
cnames=rep("", length(lf)) #names of the languages
for(i in 1:length(lf)){
  l=lf[i]
  #the language name is the third "-"-separated field of the filename, minus the extension
  name=strsplit(strsplit(l, "-")[[1]][3], ".", fixed=T)[[1]][1]
  cnames[i]=name
  #load the file and extract the relevant information
  ad=read.csv(l)
  res[as.numeric(ad$Parameter_ID),i]=as.character(ad$Value)
}
#stringsAsFactors=FALSE keeps the words as plain strings rather than factors
res=data.frame(res, stringsAsFactors=FALSE)
colnames(res) <- cnames
```
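As a quick sanity check (my addition, not in the original), we can count how many of the 100 slots remain missing (still "0") for each language; large counts correspond to short word lists:
```{r}
#number of missing entries per language
sort(colSums(res=="0"))
```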
On this dataset we use the Levenshtein distance with the length-of-word correction, LDN (but not LDND, which I am not convinced by). LDN divides the raw edit distance between two words by the length of the longer word; LDND would additionally normalize by the average distance between words with different meanings.
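To make the normalization concrete, here is a minimal sketch (my own example words, not from the database) of LDN for a single pair:
```{r}
library(stringdist)
#2 substitutions (m->h, o->d) divided by the length of the longer word
stringdist("mano", "hand", method="lv") / max(nchar("mano"), nchar("hand"))
#gives 2/4 = 0.5
```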
```{r}
library(stringdist)
#this will be the LDN distance matrix in the end
resulting_matrix=matrix(0, nrow=ncol(res), ncol=ncol(res))
#this is the denominator in the average
counter_matrix=resulting_matrix
#function that calculates the length of the longest word for each entry in the outer product v1%o%v2
len_f=function(v1,v2){
  outer(nchar(v1), nchar(v2), pmax)
}
#accumulates the numerator and the denominator of the average over all 100 entries
for(i in 1:nrow(res)){
  #x is the translation of a certain term into all languages
  x=as.character(unlist(res[i,]))
  #pairs where either word is missing (coded "0") are masked out
  valid=(x!="0")%o%(x!="0")
  #Levenshtein distances, with missing pairs zeroed out
  to_add=stringdistmatrix(x,x, method="lv")*valid
  #the len_f(x,x) term is the N in LDN; identical words (distance 0) must still count in the denominator
  counter_matrix=counter_matrix+len_f(x,x)*valid
  resulting_matrix=resulting_matrix+to_add
}
#the average is calculated
resulting_matrix[counter_matrix>0]=resulting_matrix[counter_matrix>0]/counter_matrix[counter_matrix>0]
rownames(resulting_matrix) <- cnames
colnames(resulting_matrix) <- cnames
print(resulting_matrix[1:7,1:7]) #sanity check on a corner of the matrix
```
In the following section I run MDS. I use weighted classical MDS (wcmdscale from the vegan package), giving the artificially introduced languages a very small weight so that they do not distort the embedding.
```{r}
library(vegan)
weights=read.csv("speakers.txt")
weights=weights[order(weights$Language),] #ordering the languages alphabetically, like in resulting_matrix
#the artificially introduced languages get a tiny weight
zero_weights=c("DANISH","ESPERANTO","LOJBAN","FINNISH")
weights.v=ifelse(weights$Language %in% zero_weights, 0.01, 1)
#using weighted MDS here
mds=wcmdscale(resulting_matrix, k=2, w=weights.v)
df=data.frame(mds)
colnames(df) <- c("MDS1","MDS2")
rownames(df) <- cnames
```
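As a quick check of how much structure two dimensions capture (my addition, not in the original gist), wcmdscale can also return the eigenvalues; the share held by the first two axes is a rough goodness-of-fit measure:
```{r}
#share of the positive eigenvalues captured by the first two MDS axes
mds_full=wcmdscale(resulting_matrix, w=weights.v, eig=TRUE)
ev=mds_full$eig
sum(ev[1:2])/sum(ev[ev>0])
```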
Here I calculate the loss of each point, i.e. how faithfully the two-dimensional embedding represents that language's distances.
```{r}
#given a row index in the mds matrix, this function calculates the Euclidean distance between that row and all other rows, returning a vector
get_dists=function(i){
  dists=rep(0, nrow(df))
  for(j in 1:nrow(df)){
    dists[j]=sqrt(sum((df[i,1:2]-df[j,1:2])^2))
  }
  return(dists)
}
#calculates the normalised squared difference between the actual and the embedded distances
loss_f=function(i){
  dists=get_dists(i)
  imp_dists=which(dists>0)
  return(sum((resulting_matrix[imp_dists,i]-dists[imp_dists])^2)/sum(dists[imp_dists]))
}
df$loss=sapply(1:nrow(df), loss_f)
```
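For reference (my addition), the worst-fitting languages can be listed directly; these are the points to trust least in the plots below:
```{r}
#the five languages with the largest per-point loss
head(df[order(df$loss, decreasing=TRUE), ], 5)
```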
Here I draw the two MDS plots.
```{r}
library(ggplot2)
#some manual data handling to get the labels and names nicer
position_bottom=c("FRENCH","SPANISH","HAKKA","GUJARATI","RUSSIAN","JAPANESE","MANDARIN","THAI","VIETNAMESE","OLD_OR_MIDDLE_JAVANESE","BHOJPURI","MAITHILI","NORTHERN_PASHTO","HINDI")
position_left=c("KOREAN","PUNJABI_MAJHI","HAUSA","KANNADA","TAGALOG","TURKISH","TELUGU","BURMESE","FUZHOU_CHINESE")
position_right=c("MARATHI","SWAHILI","SUNDANESE","ORIYA","ESPERANTO")
name=rbind(c("NORTHERN_PASHTO","PASHTO"),
           c("OLD_OR_MIDDLE_JAVANESE","JAVANESE"),
           c("STANDARD_ARABIC","ARABIC"),
           c("STANDARD_GERMAN","GERMAN"),
           c("PUNJABI_MAJHI","PUNJABI"))
df$position=ifelse(rownames(df)%in%position_bottom, "bottom", "top")
df$position[df$position=="top"]=ifelse(rownames(df[df$position=="top",])%in%position_left, "left","top")
df$position[df$position=="top"]=ifelse(rownames(df[df$position=="top",])%in%position_right, "right","top")
df$wname=rownames(df)
for(i in 1:nrow(name)){
  df$wname[which(df$wname==name[i,1])]=name[i,2]
}
#size of plot window and text
h=700
w=800
s=5
#plots all points at once, but adds the text separately for the 4 groups of label placement relative to the point
png(filename = "MDS.png", height = h, width = w)
p <- ggplot(df, aes(MDS1, MDS2))+geom_point()+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="top"), nudge_y=0.015)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="bottom"), nudge_y=-0.010)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="left"), nudge_x=-0.03)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="right"), nudge_x=0.035)
print(p)
dev.off()
png(filename = "MDSfit.png", height = h, width = w)
p <- ggplot(df, aes(MDS1, MDS2))+geom_point(aes(color=loss))+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="top"), nudge_y=0.015)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="bottom"), nudge_y=-0.010)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="left"), nudge_x=-0.03)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="right"), nudge_x=0.035)+
  scale_colour_gradient(high="red", low="green")
print(p)
dev.off()
```
Plotting the distance matrix.
```{r}
res2=resulting_matrix
colnames(res2) <- df$wname
rownames(res2) <- colnames(res2)
png("dendrogram.png", height=800, width=800)
heatmap(res2^2, symm=T) #squaring the distances sharpens the contrast in the heatmap
dev.off()
```
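heatmap() computes its dendrogram internally; as a sketch (my addition; average linkage is one arbitrary but common choice), the tree can also be drawn on its own from the same distances:
```{r}
#hierarchical clustering straight from the LDN distance matrix
hc=hclust(as.dist(res2), method="average")
plot(hc, cex=0.7, main="Average-linkage clustering of LDN distances")
```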
In the last section I discover that the average distance between languages in this incomplete dataset (incomplete in that it contains only a subset of languages) is too strongly influenced by the selection of languages, so this part is skipped in the article.
```{r}
weights=weights[order(weights$Language),] #already sorted above, repeated here for safety
#weighted score: languages that are close to heavily weighted languages score low
score_function=function(x){
  return(sum(weights$native*exp(x)))
}
#sort(apply(resulting_matrix,1,score_function))
sort(apply(resulting_matrix,1,sum))
```
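To see the selection effect described above (my addition; `keep` is an arbitrary random subsample), one can recompute the row sums on a subset of the languages and compare how the ranking shifts:
```{r}
#rerun the ranking on a random subset of languages
set.seed(1)
keep=sample(ncol(resulting_matrix), 25)
head(sort(apply(resulting_matrix[keep,keep], 1, sum)))
```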
The remaining lines are the contents of speakers.txt, the weights file read above (the native column appears to be each language's share of the world's native speakers; the l2speakers column is unfilled):

```
Language,native,l2speakers
MANDARIN,0.1482,
SPANISH,0.0585,
ENGLISH,0.052,
HINDI,0.046,
STANDARD_ARABIC,0.0423,
PORTUGUESE,0.0308,
BENGALI,0.0305,
RUSSIAN,0.0242,
JAPANESE,0.0192,
PUNJABI_MAJHI,0.0144,
STANDARD_GERMAN,0.0139,
OLD_OR_MIDDLE_JAVANESE,0.0125,
SUZHOU_WU,0.0120,
MALAY,0.0116,
TELUGU,0.0115,
VIETNAMESE,0.0114,
KOREAN,0.0114,
FRENCH,0.0112,
MARATHI,0.0110,
TAMIL,0.0106,
URDU,0.0099,
TURKISH,0.0095,
ITALIAN,0.009,
CANTONESE,0.0089,
THAI,0.0085,
GUJARATI,0.0074,
FUZHOU_CHINESE,0.0071,
PERSIAN,0.0068,
POLISH,0.0061,
NORTHERN_PASHTO,0.0058,
KANNADA,0.0058,
XIANG,0.0058,
MALAYALAM,0.0057,
SUNDANESE,0.0057,
HAUSA,0.0052,
ORIYA,0.0050,
BURMESE,0.005,
HAKKA,0.0046,
UKRAINIAN,0.0046,
BHOJPURI,0.0043,
TAGALOG,0.0042,
YORUBA,0.0042,
MAITHILI,0.0041,
UZBEK,0.0039,
ESPERANTO,0,
SWAHILI,0.0007,
DANISH,0.0007,
LOJBAN,0,
FINNISH,0.0007,
```