---
title: "Linguistic differences"
output: html_notebook
---
Read in all the data. In the working directory, I have placed several files from the ASJP database; any subset of files will do.
```{r}
lf=list.files(pattern="\\.csv$")
#res is going to contain all words in the database. Because not all Swadesh lists are equally long, I set it to the maximum length of 100.
res=matrix(0, nrow=100, ncol=length(lf))
cnames=rep("", length(lf)) #names of the languages
for(i in 1:length(lf)){
  l=lf[i]
  #the language name is the third "-"-separated field of the filename, minus the extension
  name=strsplit(strsplit(l, "-")[[1]][3], ".", fixed=T)[[1]][1]
  cnames[i]=name
  #load the file and extract the relevant information
  ad=read.csv(l)
  res[as.numeric(ad$Parameter_ID),i]=as.character(ad$Value)
}
#stringsAsFactors=FALSE keeps the words as plain strings rather than factors
res=data.frame(res, stringsAsFactors=FALSE)
colnames(res) <- cnames
```
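As a quick sanity check (my addition, not in the original), we can count how many of the 100 slots remain missing (still "0") for each language; large counts correspond to short word lists:
```{r}
#number of missing entries per language
sort(colSums(res=="0"))
```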
On this dataset we use the Levenshtein distance with the length-of-word correction, LDN (but not LDND, which I am not convinced by). LDN divides the raw edit distance between two words by the length of the longer word; LDND would additionally normalize by the average distance between words with different meanings.
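To make the normalization concrete, here is a minimal sketch (my own example words, not from the database) of LDN for a single pair:
```{r}
library(stringdist)
#2 substitutions (m->h, o->d) divided by the length of the longer word
stringdist("mano", "hand", method="lv") / max(nchar("mano"), nchar("hand"))
#gives 2/4 = 0.5
```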
```{r}
library(stringdist)
#this will be the LDN distance matrix in the end
resulting_matrix=matrix(0, nrow=ncol(res), ncol=ncol(res))
#this is the denominator in the average
counter_matrix=resulting_matrix
#function that calculates the length of the longest word for each entry in the outer product v1%o%v2
len_f=function(v1,v2){
  outer(nchar(v1), nchar(v2), pmax)
}
#accumulates the numerator and the denominator of the average over all 100 entries
for(i in 1:nrow(res)){
  #x is the translation of a certain term into all languages
  x=as.character(unlist(res[i,]))
  #pairs where either word is missing (coded "0") are masked out
  valid=(x!="0")%o%(x!="0")
  #Levenshtein distances, with missing pairs zeroed out
  to_add=stringdistmatrix(x,x, method="lv")*valid
  #the len_f(x,x) term is the N in LDN; identical words (distance 0) must still count in the denominator
  counter_matrix=counter_matrix+len_f(x,x)*valid
  resulting_matrix=resulting_matrix+to_add
}
#the average is calculated
resulting_matrix[counter_matrix>0]=resulting_matrix[counter_matrix>0]/counter_matrix[counter_matrix>0]
rownames(resulting_matrix) <- cnames
colnames(resulting_matrix) <- cnames
print(resulting_matrix[1:7,1:7]) #sanity check on a corner of the matrix
```
In the following section I run MDS. I use weighted classical MDS (wcmdscale from the vegan package), giving the artificially introduced languages a very small weight so that they do not distort the embedding.
```{r}
library(vegan)
weights=read.csv("speakers.txt")
weights=weights[order(weights$Language),] #ordering the languages alphabetically, like in resulting_matrix
#the artificially introduced languages get a tiny weight
zero_weights=c("DANISH","ESPERANTO","LOJBAN","FINNISH")
weights.v=ifelse(weights$Language %in% zero_weights, 0.01, 1)
#using weighted MDS here
mds=wcmdscale(resulting_matrix, k=2, w=weights.v)
df=data.frame(mds)
colnames(df) <- c("MDS1","MDS2")
rownames(df) <- cnames
```
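As a quick check of how much structure two dimensions capture (my addition, not in the original gist), wcmdscale can also return the eigenvalues; the share held by the first two axes is a rough goodness-of-fit measure:
```{r}
#share of the positive eigenvalues captured by the first two MDS axes
mds_full=wcmdscale(resulting_matrix, w=weights.v, eig=TRUE)
ev=mds_full$eig
sum(ev[1:2])/sum(ev[ev>0])
```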
Here I calculate the loss of each point, i.e. how faithfully the two-dimensional embedding represents that language's distances.
```{r}
#given a row index in the mds matrix, this function calculates the Euclidean distance between that row and all other rows, returning a vector
get_dists=function(i){
  dists=rep(0, nrow(df))
  for(j in 1:nrow(df)){
    dists[j]=sqrt(sum((df[i,1:2]-df[j,1:2])^2))
  }
  return(dists)
}
#calculates the normalised squared difference between the actual and the embedded distances
loss_f=function(i){
  dists=get_dists(i)
  imp_dists=which(dists>0)
  return(sum((resulting_matrix[imp_dists,i]-dists[imp_dists])^2)/sum(dists[imp_dists]))
}
df$loss=sapply(1:nrow(df), loss_f)
```
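For reference (my addition), the worst-fitting languages can be listed directly; these are the points to trust least in the plots below:
```{r}
#the five languages with the largest per-point loss
head(df[order(df$loss, decreasing=TRUE), ], 5)
```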
Here I draw the two MDS plots.
```{r}
library(ggplot2)
#some manual data handling to get the labels and names nicer
position_bottom=c("FRENCH","SPANISH","HAKKA","GUJARATI","RUSSIAN","JAPANESE","MANDARIN","THAI","VIETNAMESE","OLD_OR_MIDDLE_JAVANESE","BHOJPURI","MAITHILI","NORTHERN_PASHTO","HINDI")
position_left=c("KOREAN","PUNJABI_MAJHI","HAUSA","KANNADA","TAGALOG","TURKISH","TELUGU","BURMESE","FUZHOU_CHINESE")
position_right=c("MARATHI","SWAHILI","SUNDANESE","ORIYA","ESPERANTO")
name=rbind(c("NORTHERN_PASHTO","PASHTO"),
           c("OLD_OR_MIDDLE_JAVANESE","JAVANESE"),
           c("STANDARD_ARABIC","ARABIC"),
           c("STANDARD_GERMAN","GERMAN"),
           c("PUNJABI_MAJHI","PUNJABI"))
df$position=ifelse(rownames(df)%in%position_bottom, "bottom", "top")
df$position[df$position=="top"]=ifelse(rownames(df[df$position=="top",])%in%position_left, "left","top")
df$position[df$position=="top"]=ifelse(rownames(df[df$position=="top",])%in%position_right, "right","top")
df$wname=rownames(df)
for(i in 1:nrow(name)){
  df$wname[which(df$wname==name[i,1])]=name[i,2]
}
#size of plot window and text
h=700
w=800
s=5
#plots all points at once, but adds the text separately for the 4 groups of label placement relative to the point
png(filename = "MDS.png", height = h, width = w)
p <- ggplot(df, aes(MDS1, MDS2))+geom_point()+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="top"), nudge_y=0.015)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="bottom"), nudge_y=-0.010)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="left"), nudge_x=-0.03)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="right"), nudge_x=0.035)
print(p)
dev.off()
png(filename = "MDSfit.png", height = h, width = w)
p <- ggplot(df, aes(MDS1, MDS2))+geom_point(aes(color=loss))+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="top"), nudge_y=0.015)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="bottom"), nudge_y=-0.010)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="left"), nudge_x=-0.03)+
  geom_text(aes(label=wname), size=s, data=subset(df, position=="right"), nudge_x=0.035)+
  scale_colour_gradient(high="red", low="green")
print(p)
dev.off()
```
Plotting the distance matrix.
```{r}
res2=resulting_matrix
colnames(res2) <- df$wname
rownames(res2) <- colnames(res2)
png("dendrogram.png", height=800, width=800)
heatmap(res2^2, symm=T) #squaring the distances sharpens the contrast in the heatmap
dev.off()
```
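heatmap() computes its dendrogram internally; as a sketch (my addition; average linkage is one arbitrary but common choice), the tree can also be drawn on its own from the same distances:
```{r}
#hierarchical clustering straight from the LDN distance matrix
hc=hclust(as.dist(res2), method="average")
plot(hc, cex=0.7, main="Average-linkage clustering of LDN distances")
```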
In the last section I discover that the average distance between languages in this incomplete dataset (incomplete in that it contains only a subset of languages) is too strongly influenced by the selection of languages, so this part is skipped in the article.
```{r}
weights=weights[order(weights$Language),] #already sorted above, repeated here for safety
#weighted score: languages that are close to heavily weighted languages score low
score_function=function(x){
  return(sum(weights$native*exp(x)))
}
#sort(apply(resulting_matrix,1,score_function))
sort(apply(resulting_matrix,1,sum))
```
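To see the selection effect described above (my addition; `keep` is an arbitrary random subsample), one can recompute the row sums on a subset of the languages and compare how the ranking shifts:
```{r}
#rerun the ranking on a random subset of languages
set.seed(1)
keep=sample(ncol(resulting_matrix), 25)
head(sort(apply(resulting_matrix[keep,keep], 1, sum)))
```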
The remaining lines are the contents of speakers.txt, the weights file read above (the native column appears to be each language's share of the world's native speakers; the l2speakers column is unfilled):

```
Language,native,l2speakers
MANDARIN,0.1482,
SPANISH,0.0585,
ENGLISH,0.052,
HINDI,0.046,
STANDARD_ARABIC,0.0423,
PORTUGUESE,0.0308,
BENGALI,0.0305,
RUSSIAN,0.0242,
JAPANESE,0.0192,
PUNJABI_MAJHI,0.0144,
STANDARD_GERMAN,0.0139,
OLD_OR_MIDDLE_JAVANESE,0.0125,
SUZHOU_WU,0.0120,
MALAY,0.0116,
TELUGU,0.0115,
VIETNAMESE,0.0114,
KOREAN,0.0114,
FRENCH,0.0112,
MARATHI,0.0110,
TAMIL,0.0106,
URDU,0.0099,
TURKISH,0.0095,
ITALIAN,0.009,
CANTONESE,0.0089,
THAI,0.0085,
GUJARATI,0.0074,
FUZHOU_CHINESE,0.0071,
PERSIAN,0.0068,
POLISH,0.0061,
NORTHERN_PASHTO,0.0058,
KANNADA,0.0058,
XIANG,0.0058,
MALAYALAM,0.0057,
SUNDANESE,0.0057,
HAUSA,0.0052,
ORIYA,0.0050,
BURMESE,0.005,
HAKKA,0.0046,
UKRAINIAN,0.0046,
BHOJPURI,0.0043,
TAGALOG,0.0042,
YORUBA,0.0042,
MAITHILI,0.0041,
UZBEK,0.0039,
ESPERANTO,0,
SWAHILI,0.0007,
DANISH,0.0007,
LOJBAN,0,
FINNISH,0.0007,
```