Skip to content

Instantly share code, notes, and snippets.

@ettorerizza
Created May 30, 2017 16:11
Show Gist options
  • Save ettorerizza/e860249572212fdb6c83167994eaffc4 to your computer and use it in GitHub Desktop.
Save ettorerizza/e860249572212fdb6c83167994eaffc4 to your computer and use it in GitHub Desktop.
Script R pour parser les 26 000 XML/TEI du corpus européen JRC-Acquis et leur ajouter leurs descripteurs eurovoc
library(XML)
library(dplyr)
library(stringr)
library(readr)
library(readxl)
library(tidyr)
#liste des fichiers XML du corpus JRC Acquis version anglaise (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-en.tgz)
liste <-
list.files(
path = "C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-en/en",
recursive = TRUE,
ignore.case = FALSE,
include.dirs = FALSE,
full.names = TRUE
)
#fonction pour parser les TEI
parseTei <- function(fichier, path, xmlelement) {
text_parsed <- tryCatch({
doc <- xmlParse(fichier, trim = FALSE)
text <- unlist(xpathSApply(doc, path, xmlelement))
text <- gsub("%quot%", '"', text)
},
error = function(cond) {
message(paste("file does not seem to exist:", fichier))
message("Here's the original error message:")
message(cond)
# Choose a return value in case of error
return(NA)
},
warning = function(cond) {
message(paste("file caused a warning:", fichier))
message("Here's the original warning message:")
message(cond)
return(NULL)
},
finally = {
message(paste("Processed file:", fichier))
})
return(c(fichier, text_parsed))
}
#récupération des titres, body and filename
body <-
lapply(liste,
parseTei,
path = '//body|/TEI.2/teiHeader/fileDesc/titleStmt/title[1]',
xmlelement = xmlValue)
#list to dataframe
library(plyr)
dfrm = ldply(body, rbind)
colnames(dfrm) <- c("file", "doc", "body")
#ajout d'une colonne file_id (oublié de l'extraire des XML...)
dfrm <-
dfrm %>%
mutate(file_id = str_extract(doc, "\\S+\\d+\\S+"))
#fichier texte contenant les codes eurovocs (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-acquis-eurovoc-descriptors.txt)
jrc_eurovoc <-
read_delim(
"C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-acquis-eurovoc-descriptors.txt",
"\t",
escape_double = FALSE,
col_names = c("file_id", "X2", "eurovocs"),
col_types = cols(X2 = col_skip()),
trim_ws = TRUE
)
jrc_eurovocs_tidy <-
jrc_eurovoc %>%
tidyr::separate_rows(eurovocs, sep = " ")
#fichier contenant le label des codes eurovoc en anglais (http://eurovoc.europa.eu/drupal/?q=fr/download/list_pt&cl=en)
listPt <-
read_excel("C:/Users/ettor/Desktop/Eurovoc/listPt EurovocEN.xls")
#jointure entre eurovoc_tidy et la liste des labels
jrc_eurovocs_tidy <-
jrc_eurovocs_tidy %>%
left_join(listPt, by = c("eurovocs" = "ID"))
View(jrc_eurovocs_tidy)
#concatenation des lables dans une seule cellule, pour plus de lisibilité
library(data.table)
final_concat <-
setDT(jrc_eurovocs_tidy)[, .(concat_eurovocs = paste(EN, collapse = " || ")), by = .(file_id)]
#on réunit le tout
final_merge <-
dfrm %>%
left_join(final_concat) %>%
na.omit()
data.table::fwrite(final_merge,"final_merge.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment