msaby/Analyse de theses.fr par B Coulmont

## Analyse de theses.fr par B Coulmont
# theses point fr
# rvest

library(RCurl)
library(tidyverse)
library(lubridate)
library(scales)
library(hrbrthemes)
setwd("~/Desktop/")

# Ici on indique le nombre de thèse en ligne,
# divisé par 1000 en arrondissant au millier supérieur.
# (theses.fr pagine par tranches de 1000)
i <- 1:88
i <- i*1000
URL <-paste0("http://www.theses.fr/?q=&fq=dateSoutenance:[2015-01-01T23:59:59Z%2BTO%2B2018-12-31T23:59:59Z]&checkedfacets=&start=",i,"&sort=none&status=&access=&prevision=&filtrepersonne=&zone1=titreRAs&val1=&op1=AND&zone2=auteurs&val2=&op2=AND&zone3=etabSoutenances&val3=&zone4=dateSoutenance&val4a=&val4b=&type=&lng=&checkedfacets=&format=csv")
# on cherche les thèses soutenues après 2015
# il y a environ 86960 thèses (au 1/12/2018)


j <-1
SERP <- 1

for(j in 1:length(URL)){
  SERP[j] <- getURL(URL[j])
  write.csv(SERP,"SERP_2.csv",append=F)
}
rm(SERP,i,j,URL)

theses <- read.csv("~/Desktop/SERP_2.csv",sep=";",quote="",skip=1,stringsAsFactors = F)

# on garde les thèses en "sociologie" et sciences sociales


socio <- theses %>% filter(grepl("sociologie|Sciences sociales|sc sociales",X..Discipline..,ignore.case=T)) %>%
  filter(X..Date.de.soutenance..!="") %>%
  filter(X..Statut..=="soutenue")

# dans "socio" on a maintenant 618 identifiants de thèse
# ce sont ces identifiants qui font servir de base
# au scrapping des jurys

library(rvest)
identifiants <- socio$X..Identifiant.de.la.these..
reseau_total <- data_frame(noms_jury="",
                           liens_jury="",
                           these="",
                           directeurs="",
                           liens_directeurs="")
for (i in 1:length(identifiants)) {

  data_theses_socio <- read_html( paste("http://www.theses.fr/",identifiants[i],sep="") )
  #directeurs :
  directeurs <- bind_cols(
    directeurs= data_theses_socio  %>%
      html_nodes("div .donnees-ombre p") %>%
      .[[1]] %>%
      html_nodes("a") %>%
      html_text()
    ,
    liens_directeurs = data_theses_socio  %>%
      html_nodes("div .donnees-ombre p") %>%
      .[[1]] %>%
      html_nodes("a") %>%
      html_attr(name="href")
  ) %>% mutate(  these = identifiants[i] )


  jury <- bind_cols(
    noms_jury = data_theses_socio %>%
      html_nodes("div .donnees p a") %>%
      html_text()
    ,
    liens_jury = data_theses_socio %>%
      html_nodes("div .donnees p a") %>%
      html_attr(name="href")
  ) %>% mutate(  these = identifiants[i] )

  reseau = jury %>% left_join(directeurs,by="these")

  reseau_total <- bind_rows(reseau_total,reseau)
}

write.csv2(reseau_total,"~/Desktop/reseau-theses-2015-2018.csv")

reseau_total <- read.csv2("~/Desktop/reseau-theses-2015-2018.csv",stringsAsFactors = F)

library(igraph)
library(ggraph)
library(ggrepel)

# composer le réseau des relations de jury à partir de reseau_total
# règles de composition - ponderation
#     - co-direction        = 3
#     - lien directeur-jury = 2
#     - lien jury-jury      = 1
# règles de composition : direction des liens
#     - direction     --> jury
#     - co-direction <--> co-direction
#     - jury         <--> jury


directions_theses <- reseau_total %>% select(these,directeurs)
directions_theses <- directions_theses %>% unique()
directions_theses <-  directions_theses %>% group_by(these) %>% mutate(N=n()) %>%
  filter(N==2) %>% # on ne garde que les codirections avec 2 directeurs
  mutate(rang=rank(directeurs))
directions_theses <- directions_theses %>%
  spread(key=rang,value=directeurs)
directions_theses <- directions_theses %>% ungroup() %>% select(nom1=`1`,nom2=`2`)
directions_theses <- directions_theses %>% mutate(poids=3)

directions_jury <- reseau_total %>% select(nom1=noms_jury,nom2=directeurs) %>% filter( nom1 != "")
directions_jury <- directions_jury %>% mutate(poids=2)
directions_jury <- directions_jury %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))


jury_jury <- reseau_total %>% select(noms_jury,these) %>% unique() %>% filter(noms_jury!="")
g_j <-  graph_from_data_frame(jury_jury,directed=F)
V(g_j)$type <- V(g_j)$name %in% jury_jury$noms_jury
g_j_1 <- bipartite_projection(g_j,which="true")
jurys <- as_long_data_frame(g_j_1) %>%
  select(nom1=`ver[el[, 1], ]`, nom2=`ver2[el[, 2], ]`, poids=weight)

#jury_jury <- jury_jury %>% group_by(these) %>% mutate(rang=rank(noms_jury)) %>%
#  spread(key=rang,value=noms_jury)


reseau_petit <- bind_rows(directions_theses,directions_jury,jurys)
reseau_petit <- reseau_petit %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))


#nombre_jury <- reseau_total %>% group_by(noms_jury) %>% summarize(N=n()) %>% filter(N>2)
#nombre_direction <- reseau_total %>% group_by(directeurs) %>% summarize(N=n()) %>% filter(N>6)

#reseau_petit <- reseau_total %>% filter(noms_jury %in% nombre_jury$noms_jury) %>%
#  filter(directeurs %in% nombre_direction$directeurs)

#reseau_petit <- reseau_petit %>% mutate(directeurs = case_when(directeurs==" Michel Wieviorka"~ "Michel Wieviorka",
#                                                               TRUE ~ directeurs))
#g <-  graph_from_data_frame( reseau_petit %>% select(noms_jury,directeurs) %>% filter( noms_jury != ""), directed = F)
g <- graph_from_data_frame(reseau_petit, directed=F)
g <- simplify(g,edge.attr.comb = sum)
V(g)$degres <-  degree(g)
V(g)$label <- gsub("^\\S+\\s+(.+)$","\\1",V(g)$name)
V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
#V(g)$communaute <- as.character(cluster_edge_betweenness(g)$membership)
#g <- delete_edges(g, which(E(g)$poids<3) )
V(g)$closeness <- (5*closeness(g))^10
V(g)$btwns <- betweenness(g)
V(g)$eigen_centr <- eigen_centrality(g)$vector
#g <- induced_subgraph(g, V(g)$degres>8)
g <- delete_edges(g, which(E(g)$poids<5) )
V(g)$cluster_number <- clusters(g)$membership
g <- induced_subgraph(g, V(g)$cluster_number== which( max(clusters(g)$csize) == clusters(g)$csize) ) # on garde le gros cluster
E(g)$weight <- 1/E(g)$poids
#plot(g)
V(g)$label <- ifelse(V(g)$degres<20,"",V(g)$label)
#V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
V(g)$communaute <- as.character(cluster_walktrap(g, steps=4)$membership)
ggraph(g,layout="igraph",algorithm="nicely") +
  geom_edge_link(aes(width=.1*poids), alpha=.1,
                 end_cap = circle(0, 'mm'),
                 start_cap = circle(0, 'mm')) +
  #  geom_node_point(aes(size=eigen_centr),color="lightblue4") +
  #  geom_node_point(color="lightblue",size=9) +
  geom_node_point(aes(size=btwns), color="white",alpha=1) +
  geom_node_point(aes(color=communaute,size=btwns), alpha=.5) +
  scale_size_area(max_size = 20) +
  geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
  #  scale_size_continuous(range = c(1, 6)) +
  labs(title="Réseaux de sociologues",
       subtitle="Soutenances de thèses entre 2015 et 2018",
       caption="Sources : theses.fr - Réalisation B. Coulmont") +
  theme_graph(foreground = 'white', fg_text_colour = 'white',
              base_family = "Helvetica") +
  theme(legend.position="none",
        text=element_text(size=16,family="Helvetica"),
        plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))

ggsave(filename = "test-reseau.pdf",width = 20,height = 20)
ggsave(filename = "test-reseau.png",width = 10,height = 10)


ggraph(g,layout="igraph",algorithm="fr") +
  geom_edge_link(aes(width=.1*poids), alpha=.1,
                 end_cap = circle(5, 'mm'),
                 start_cap = circle(5, 'mm')) +
#  geom_node_point(aes(size=eigen_centr),color="lightblue4") +
#  geom_node_point(color="lightblue",size=9) +
  geom_node_point(aes(size=eigen_centr), color="white",alpha=1) +
  geom_node_point(aes(color=communaute,size=eigen_centr), alpha=.5) +
  scale_size_area(max_size = 20) +
  geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
#  scale_size_continuous(range = c(1, 6)) +
  labs(title="Réseaux de sociologues",
       subtitle="Soutenances de thèses entre 2015 et 2018",
       caption="Sources : theses.fr - Réalisation B. Coulmont") +
  theme_graph(foreground = 'white', fg_text_colour = 'white',
              base_family = "Helvetica") +
  theme(legend.position="none",
        text=element_text(size=16,family="Helvetica"),
        plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))

ggsave(filename = "test-reseau.pdf",width=20,height = 20)


############
#
#
#' BROUILLON
#'
#'


#### Jury
jury <- bind_cols(
noms=read_html("http://www.theses.fr/2015TOU20114") %>%
  html_nodes("div .donnees p a") %>%

  html_text()
,
liens=read_html("http://www.theses.fr/2015TOU20114") %>%
  html_nodes("div .donnees p a") %>%
  html_attr(name="href")
)

http://www.theses.fr/2017BORD0902

jury <- bind_cols(
  noms=read_html("http://www.theses.fr/2015BORD0025") %>%
    html_nodes("div .donnees p a") %>%
    html_text()
  ,
  liens=read_html("http://www.theses.fr/2017BORD0902") %>%
    html_nodes("div .donnees p a") %>%
    html_attr(name="href")
)


read_html("http://www.theses.fr/2015BORD0025") %>%
  html_nodes("div .donnees p ") %>%
  html_text()
read_html("http://www.theses.fr/2015BORD0025") %>%
  html_nodes("div .donnees p a") %>%
  html_attr(name="href")


read_html("http://www.theses.fr/2015BORD0025") %>%
  html_nodes("div .donnees p span ")
	# theses point fr
	# rvest

	library(RCurl)
	library(tidyverse)
	library(lubridate)
	library(scales)
	library(hrbrthemes)
	setwd("~/Desktop/")

	# Ici on indique le nombre de thèse en ligne,
	# divisé par 1000 en arrondissant au millier supérieur.
	# (theses.fr pagine par tranches de 1000)
	i <- 1:88
	i <- i*1000
	URL <-paste0("http://www.theses.fr/?q=&fq=dateSoutenance:[2015-01-01T23:59:59Z%2BTO%2B2018-12-31T23:59:59Z]&checkedfacets=&start=",i,"&sort=none&status=&access=&prevision=&filtrepersonne=&zone1=titreRAs&val1=&op1=AND&zone2=auteurs&val2=&op2=AND&zone3=etabSoutenances&val3=&zone4=dateSoutenance&val4a=&val4b=&type=&lng=&checkedfacets=&format=csv")
	# on cherche les thèses soutenues après 2015
	# il y a environ 86960 thèses (au 1/12/2018)


	j <-1
	SERP <- 1

	for(j in 1:length(URL)){
	SERP[j] <- getURL(URL[j])
	write.csv(SERP,"SERP_2.csv",append=F)
	}
	rm(SERP,i,j,URL)

	theses <- read.csv("~/Desktop/SERP_2.csv",sep=";",quote="",skip=1,stringsAsFactors = F)

	# on garde les thèses en "sociologie" et sciences sociales


	socio <- theses %>% filter(grepl("sociologie\|Sciences sociales\|sc sociales",X..Discipline..,ignore.case=T)) %>%
	filter(X..Date.de.soutenance..!="") %>%
	filter(X..Statut..=="soutenue")

	# dans "socio" on a maintenant 618 identifiants de thèse
	# ce sont ces identifiants qui font servir de base
	# au scrapping des jurys

	library(rvest)
	identifiants <- socio$X..Identifiant.de.la.these..
	reseau_total <- data_frame(noms_jury="",
	liens_jury="",
	these="",
	directeurs="",
	liens_directeurs="")
	for (i in 1:length(identifiants)) {

	data_theses_socio <- read_html( paste("http://www.theses.fr/",identifiants[i],sep="") )
	#directeurs :
	directeurs <- bind_cols(
	directeurs= data_theses_socio %>%
	html_nodes("div .donnees-ombre p") %>%
	.[[1]] %>%
	html_nodes("a") %>%
	html_text()
	,
	liens_directeurs = data_theses_socio %>%
	html_nodes("div .donnees-ombre p") %>%
	.[[1]] %>%
	html_nodes("a") %>%
	html_attr(name="href")
	) %>% mutate( these = identifiants[i] )



	jury <- bind_cols(
	noms_jury = data_theses_socio %>%
	html_nodes("div .donnees p a") %>%
	html_text()
	,
	liens_jury = data_theses_socio %>%
	html_nodes("div .donnees p a") %>%
	html_attr(name="href")
	) %>% mutate( these = identifiants[i] )

	reseau = jury %>% left_join(directeurs,by="these")

	reseau_total <- bind_rows(reseau_total,reseau)
	}

	write.csv2(reseau_total,"~/Desktop/reseau-theses-2015-2018.csv")

	reseau_total <- read.csv2("~/Desktop/reseau-theses-2015-2018.csv",stringsAsFactors = F)

	library(igraph)
	library(ggraph)
	library(ggrepel)

	# composer le réseau des relations de jury à partir de reseau_total
	# règles de composition - ponderation
	# - co-direction = 3
	# - lien directeur-jury = 2
	# - lien jury-jury = 1
	# règles de composition : direction des liens
	# - direction --> jury
	# - co-direction <--> co-direction
	# - jury <--> jury


	directions_theses <- reseau_total %>% select(these,directeurs)
	directions_theses <- directions_theses %>% unique()
	directions_theses <- directions_theses %>% group_by(these) %>% mutate(N=n()) %>%
	filter(N==2) %>% # on ne garde que les codirections avec 2 directeurs
	mutate(rang=rank(directeurs))
	directions_theses <- directions_theses %>%
	spread(key=rang,value=directeurs)
	directions_theses <- directions_theses %>% ungroup() %>% select(nom1=`1`,nom2=`2`)
	directions_theses <- directions_theses %>% mutate(poids=3)

	directions_jury <- reseau_total %>% select(nom1=noms_jury,nom2=directeurs) %>% filter( nom1 != "")
	directions_jury <- directions_jury %>% mutate(poids=2)
	directions_jury <- directions_jury %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))


	jury_jury <- reseau_total %>% select(noms_jury,these) %>% unique() %>% filter(noms_jury!="")
	g_j <- graph_from_data_frame(jury_jury,directed=F)
	V(g_j)$type <- V(g_j)$name %in% jury_jury$noms_jury
	g_j_1 <- bipartite_projection(g_j,which="true")
	jurys <- as_long_data_frame(g_j_1) %>%
	select(nom1=`ver[el[, 1], ]`, nom2=`ver2[el[, 2], ]`, poids=weight)

	#jury_jury <- jury_jury %>% group_by(these) %>% mutate(rang=rank(noms_jury)) %>%
	# spread(key=rang,value=noms_jury)





	reseau_petit <- bind_rows(directions_theses,directions_jury,jurys)
	reseau_petit <- reseau_petit %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))



	#nombre_jury <- reseau_total %>% group_by(noms_jury) %>% summarize(N=n()) %>% filter(N>2)
	#nombre_direction <- reseau_total %>% group_by(directeurs) %>% summarize(N=n()) %>% filter(N>6)

	#reseau_petit <- reseau_total %>% filter(noms_jury %in% nombre_jury$noms_jury) %>%
	# filter(directeurs %in% nombre_direction$directeurs)

	#reseau_petit <- reseau_petit %>% mutate(directeurs = case_when(directeurs==" Michel Wieviorka"~ "Michel Wieviorka",
	# TRUE ~ directeurs))
	#g <- graph_from_data_frame( reseau_petit %>% select(noms_jury,directeurs) %>% filter( noms_jury != ""), directed = F)
	g <- graph_from_data_frame(reseau_petit, directed=F)
	g <- simplify(g,edge.attr.comb = sum)
	V(g)$degres <- degree(g)
	V(g)$label <- gsub("^\\S+\\s+(.+)$","\\1",V(g)$name)
	V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
	#V(g)$communaute <- as.character(cluster_edge_betweenness(g)$membership)
	#g <- delete_edges(g, which(E(g)$poids<3) )
	V(g)$closeness <- (5*closeness(g))^10
	V(g)$btwns <- betweenness(g)
	V(g)$eigen_centr <- eigen_centrality(g)$vector
	#g <- induced_subgraph(g, V(g)$degres>8)
	g <- delete_edges(g, which(E(g)$poids<5) )
	V(g)$cluster_number <- clusters(g)$membership
	g <- induced_subgraph(g, V(g)$cluster_number== which( max(clusters(g)$csize) == clusters(g)$csize) ) # on garde le gros cluster
	E(g)$weight <- 1/E(g)$poids
	#plot(g)
	V(g)$label <- ifelse(V(g)$degres<20,"",V(g)$label)
	#V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
	V(g)$communaute <- as.character(cluster_walktrap(g, steps=4)$membership)
	ggraph(g,layout="igraph",algorithm="nicely") +
	geom_edge_link(aes(width=.1*poids), alpha=.1,
	end_cap = circle(0, 'mm'),
	start_cap = circle(0, 'mm')) +
	# geom_node_point(aes(size=eigen_centr),color="lightblue4") +
	# geom_node_point(color="lightblue",size=9) +
	geom_node_point(aes(size=btwns), color="white",alpha=1) +
	geom_node_point(aes(color=communaute,size=btwns), alpha=.5) +
	scale_size_area(max_size = 20) +
	geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
	# scale_size_continuous(range = c(1, 6)) +
	labs(title="Réseaux de sociologues",
	subtitle="Soutenances de thèses entre 2015 et 2018",
	caption="Sources : theses.fr - Réalisation B. Coulmont") +
	theme_graph(foreground = 'white', fg_text_colour = 'white',
	base_family = "Helvetica") +
	theme(legend.position="none",
	text=element_text(size=16,family="Helvetica"),
	plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))

	ggsave(filename = "test-reseau.pdf",width = 20,height = 20)
	ggsave(filename = "test-reseau.png",width = 10,height = 10)


	ggraph(g,layout="igraph",algorithm="fr") +
	geom_edge_link(aes(width=.1*poids), alpha=.1,
	end_cap = circle(5, 'mm'),
	start_cap = circle(5, 'mm')) +
	# geom_node_point(aes(size=eigen_centr),color="lightblue4") +
	# geom_node_point(color="lightblue",size=9) +
	geom_node_point(aes(size=eigen_centr), color="white",alpha=1) +
	geom_node_point(aes(color=communaute,size=eigen_centr), alpha=.5) +
	scale_size_area(max_size = 20) +
	geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
	# scale_size_continuous(range = c(1, 6)) +
	labs(title="Réseaux de sociologues",
	subtitle="Soutenances de thèses entre 2015 et 2018",
	caption="Sources : theses.fr - Réalisation B. Coulmont") +
	theme_graph(foreground = 'white', fg_text_colour = 'white',
	base_family = "Helvetica") +
	theme(legend.position="none",
	text=element_text(size=16,family="Helvetica"),
	plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))

	ggsave(filename = "test-reseau.pdf",width=20,height = 20)



	############
	#
	#
	#' BROUILLON
	#'
	#'


	#### Jury
	jury <- bind_cols(
	noms=read_html("http://www.theses.fr/2015TOU20114") %>%
	html_nodes("div .donnees p a") %>%

	html_text()
	,
	liens=read_html("http://www.theses.fr/2015TOU20114") %>%
	html_nodes("div .donnees p a") %>%
	html_attr(name="href")
	)

	http://www.theses.fr/2017BORD0902

	jury <- bind_cols(
	noms=read_html("http://www.theses.fr/2015BORD0025") %>%
	html_nodes("div .donnees p a") %>%
	html_text()
	,
	liens=read_html("http://www.theses.fr/2017BORD0902") %>%
	html_nodes("div .donnees p a") %>%
	html_attr(name="href")
	)


	read_html("http://www.theses.fr/2015BORD0025") %>%
	html_nodes("div .donnees p ") %>%
	html_text()
	read_html("http://www.theses.fr/2015BORD0025") %>%
	html_nodes("div .donnees p a") %>%
	html_attr(name="href")


	read_html("http://www.theses.fr/2015BORD0025") %>%
	html_nodes("div .donnees p span ")