Skip to content

Instantly share code, notes, and snippets.

@msaby
Created December 3, 2018 16:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msaby/54f5016e6dc0ca3cf0a8f3de36a1c1c3 to your computer and use it in GitHub Desktop.
Save msaby/54f5016e6dc0ca3cf0a8f3de36a1c1c3 to your computer and use it in GitHub Desktop.
# theses point fr
# rvest
library(RCurl)
library(tidyverse)
library(lubridate)
library(scales)
library(hrbrthemes)
setwd("~/Desktop/")
# Ici on indique le nombre de thèse en ligne,
# divisé par 1000 en arrondissant au millier supérieur.
# (theses.fr pagine par tranches de 1000)
i <- 1:88
i <- i*1000
URL <-paste0("http://www.theses.fr/?q=&fq=dateSoutenance:[2015-01-01T23:59:59Z%2BTO%2B2018-12-31T23:59:59Z]&checkedfacets=&start=",i,"&sort=none&status=&access=&prevision=&filtrepersonne=&zone1=titreRAs&val1=&op1=AND&zone2=auteurs&val2=&op2=AND&zone3=etabSoutenances&val3=&zone4=dateSoutenance&val4a=&val4b=&type=&lng=&checkedfacets=&format=csv")
# on cherche les thèses soutenues après 2015
# il y a environ 86960 thèses (au 1/12/2018)
j <-1
SERP <- 1
for(j in 1:length(URL)){
SERP[j] <- getURL(URL[j])
write.csv(SERP,"SERP_2.csv",append=F)
}
rm(SERP,i,j,URL)
theses <- read.csv("~/Desktop/SERP_2.csv",sep=";",quote="",skip=1,stringsAsFactors = F)
# on garde les thèses en "sociologie" et sciences sociales
socio <- theses %>% filter(grepl("sociologie|Sciences sociales|sc sociales",X..Discipline..,ignore.case=T)) %>%
filter(X..Date.de.soutenance..!="") %>%
filter(X..Statut..=="soutenue")
# dans "socio" on a maintenant 618 identifiants de thèse
# ce sont ces identifiants qui font servir de base
# au scrapping des jurys
library(rvest)
identifiants <- socio$X..Identifiant.de.la.these..
reseau_total <- data_frame(noms_jury="",
liens_jury="",
these="",
directeurs="",
liens_directeurs="")
for (i in 1:length(identifiants)) {
data_theses_socio <- read_html( paste("http://www.theses.fr/",identifiants[i],sep="") )
#directeurs :
directeurs <- bind_cols(
directeurs= data_theses_socio %>%
html_nodes("div .donnees-ombre p") %>%
.[[1]] %>%
html_nodes("a") %>%
html_text()
,
liens_directeurs = data_theses_socio %>%
html_nodes("div .donnees-ombre p") %>%
.[[1]] %>%
html_nodes("a") %>%
html_attr(name="href")
) %>% mutate( these = identifiants[i] )
jury <- bind_cols(
noms_jury = data_theses_socio %>%
html_nodes("div .donnees p a") %>%
html_text()
,
liens_jury = data_theses_socio %>%
html_nodes("div .donnees p a") %>%
html_attr(name="href")
) %>% mutate( these = identifiants[i] )
reseau = jury %>% left_join(directeurs,by="these")
reseau_total <- bind_rows(reseau_total,reseau)
}
write.csv2(reseau_total,"~/Desktop/reseau-theses-2015-2018.csv")
reseau_total <- read.csv2("~/Desktop/reseau-theses-2015-2018.csv",stringsAsFactors = F)
library(igraph)
library(ggraph)
library(ggrepel)
# composer le réseau des relations de jury à partir de reseau_total
# règles de composition - ponderation
# - co-direction = 3
# - lien directeur-jury = 2
# - lien jury-jury = 1
# règles de composition : direction des liens
# - direction --> jury
# - co-direction <--> co-direction
# - jury <--> jury
directions_theses <- reseau_total %>% select(these,directeurs)
directions_theses <- directions_theses %>% unique()
directions_theses <- directions_theses %>% group_by(these) %>% mutate(N=n()) %>%
filter(N==2) %>% # on ne garde que les codirections avec 2 directeurs
mutate(rang=rank(directeurs))
directions_theses <- directions_theses %>%
spread(key=rang,value=directeurs)
directions_theses <- directions_theses %>% ungroup() %>% select(nom1=`1`,nom2=`2`)
directions_theses <- directions_theses %>% mutate(poids=3)
directions_jury <- reseau_total %>% select(nom1=noms_jury,nom2=directeurs) %>% filter( nom1 != "")
directions_jury <- directions_jury %>% mutate(poids=2)
directions_jury <- directions_jury %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))
jury_jury <- reseau_total %>% select(noms_jury,these) %>% unique() %>% filter(noms_jury!="")
g_j <- graph_from_data_frame(jury_jury,directed=F)
V(g_j)$type <- V(g_j)$name %in% jury_jury$noms_jury
g_j_1 <- bipartite_projection(g_j,which="true")
jurys <- as_long_data_frame(g_j_1) %>%
select(nom1=`ver[el[, 1], ]`, nom2=`ver2[el[, 2], ]`, poids=weight)
#jury_jury <- jury_jury %>% group_by(these) %>% mutate(rang=rank(noms_jury)) %>%
# spread(key=rang,value=noms_jury)
reseau_petit <- bind_rows(directions_theses,directions_jury,jurys)
reseau_petit <- reseau_petit %>% group_by(nom1,nom2) %>% summarize(poids=sum(poids))
#nombre_jury <- reseau_total %>% group_by(noms_jury) %>% summarize(N=n()) %>% filter(N>2)
#nombre_direction <- reseau_total %>% group_by(directeurs) %>% summarize(N=n()) %>% filter(N>6)
#reseau_petit <- reseau_total %>% filter(noms_jury %in% nombre_jury$noms_jury) %>%
# filter(directeurs %in% nombre_direction$directeurs)
#reseau_petit <- reseau_petit %>% mutate(directeurs = case_when(directeurs==" Michel Wieviorka"~ "Michel Wieviorka",
# TRUE ~ directeurs))
#g <- graph_from_data_frame( reseau_petit %>% select(noms_jury,directeurs) %>% filter( noms_jury != ""), directed = F)
g <- graph_from_data_frame(reseau_petit, directed=F)
g <- simplify(g,edge.attr.comb = sum)
V(g)$degres <- degree(g)
V(g)$label <- gsub("^\\S+\\s+(.+)$","\\1",V(g)$name)
V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
#V(g)$communaute <- as.character(cluster_edge_betweenness(g)$membership)
#g <- delete_edges(g, which(E(g)$poids<3) )
V(g)$closeness <- (5*closeness(g))^10
V(g)$btwns <- betweenness(g)
V(g)$eigen_centr <- eigen_centrality(g)$vector
#g <- induced_subgraph(g, V(g)$degres>8)
g <- delete_edges(g, which(E(g)$poids<5) )
V(g)$cluster_number <- clusters(g)$membership
g <- induced_subgraph(g, V(g)$cluster_number== which( max(clusters(g)$csize) == clusters(g)$csize) ) # on garde le gros cluster
E(g)$weight <- 1/E(g)$poids
#plot(g)
V(g)$label <- ifelse(V(g)$degres<20,"",V(g)$label)
#V(g)$communaute <- as.character(cluster_walktrap(g, steps=15)$membership)
V(g)$communaute <- as.character(cluster_walktrap(g, steps=4)$membership)
ggraph(g,layout="igraph",algorithm="nicely") +
geom_edge_link(aes(width=.1*poids), alpha=.1,
end_cap = circle(0, 'mm'),
start_cap = circle(0, 'mm')) +
# geom_node_point(aes(size=eigen_centr),color="lightblue4") +
# geom_node_point(color="lightblue",size=9) +
geom_node_point(aes(size=btwns), color="white",alpha=1) +
geom_node_point(aes(color=communaute,size=btwns), alpha=.5) +
scale_size_area(max_size = 20) +
geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
# scale_size_continuous(range = c(1, 6)) +
labs(title="Réseaux de sociologues",
subtitle="Soutenances de thèses entre 2015 et 2018",
caption="Sources : theses.fr - Réalisation B. Coulmont") +
theme_graph(foreground = 'white', fg_text_colour = 'white',
base_family = "Helvetica") +
theme(legend.position="none",
text=element_text(size=16,family="Helvetica"),
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))
ggsave(filename = "test-reseau.pdf",width = 20,height = 20)
ggsave(filename = "test-reseau.png",width = 10,height = 10)
ggraph(g,layout="igraph",algorithm="fr") +
geom_edge_link(aes(width=.1*poids), alpha=.1,
end_cap = circle(5, 'mm'),
start_cap = circle(5, 'mm')) +
# geom_node_point(aes(size=eigen_centr),color="lightblue4") +
# geom_node_point(color="lightblue",size=9) +
geom_node_point(aes(size=eigen_centr), color="white",alpha=1) +
geom_node_point(aes(color=communaute,size=eigen_centr), alpha=.5) +
scale_size_area(max_size = 20) +
geom_node_text(aes(label=label),size=3,repel=T,box.padding = 0.15) +
# scale_size_continuous(range = c(1, 6)) +
labs(title="Réseaux de sociologues",
subtitle="Soutenances de thèses entre 2015 et 2018",
caption="Sources : theses.fr - Réalisation B. Coulmont") +
theme_graph(foreground = 'white', fg_text_colour = 'white',
base_family = "Helvetica") +
theme(legend.position="none",
text=element_text(size=16,family="Helvetica"),
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), units="line"))
ggsave(filename = "test-reseau.pdf",width=20,height = 20)
############
#
#
#' BROUILLON
#'
#'
#### Jury
jury <- bind_cols(
noms=read_html("http://www.theses.fr/2015TOU20114") %>%
html_nodes("div .donnees p a") %>%
html_text()
,
liens=read_html("http://www.theses.fr/2015TOU20114") %>%
html_nodes("div .donnees p a") %>%
html_attr(name="href")
)
http://www.theses.fr/2017BORD0902
jury <- bind_cols(
noms=read_html("http://www.theses.fr/2015BORD0025") %>%
html_nodes("div .donnees p a") %>%
html_text()
,
liens=read_html("http://www.theses.fr/2017BORD0902") %>%
html_nodes("div .donnees p a") %>%
html_attr(name="href")
)
read_html("http://www.theses.fr/2015BORD0025") %>%
html_nodes("div .donnees p ") %>%
html_text()
read_html("http://www.theses.fr/2015BORD0025") %>%
html_nodes("div .donnees p a") %>%
html_attr(name="href")
read_html("http://www.theses.fr/2015BORD0025") %>%
html_nodes("div .donnees p span ")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment