ettorerizza/parse_jrc-acquis

## parse_jrc-acquis
library(XML)
library(dplyr)
library(stringr)
library(readr)
library(readxl)
library(tidyr)

#liste des fichiers XML du corpus JRC Acquis version anglaise (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-en.tgz)
liste <-
  list.files(
    path = "C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-en/en",
    recursive = TRUE,
    ignore.case = FALSE,
    include.dirs = FALSE,
    full.names = TRUE
  )

#fonction pour parser les TEI
parseTei <- function(fichier, path, xmlelement) {
  text_parsed <- tryCatch({
    doc <- xmlParse(fichier,   trim = FALSE)

    text <- unlist(xpathSApply(doc, path, xmlelement))

    text <- gsub("%quot%", '"', text)

  },
  error = function(cond) {
    message(paste("file does not seem to exist:", fichier))
    message("Here's the original error message:")
    message(cond)
    # Choose a return value in case of error
    return(NA)
  },
  warning = function(cond) {
    message(paste("file caused a warning:", fichier))
    message("Here's the original warning message:")
    message(cond)
    return(NULL)
  },
  finally = {
    message(paste("Processed file:", fichier))
  })
  return(c(fichier, text_parsed))
}

#récupération des titres, body and filename
body <-
  lapply(liste,
         parseTei,
         path = '//body|/TEI.2/teiHeader/fileDesc/titleStmt/title[1]',
         xmlelement = xmlValue)

#list to dataframe
library(plyr)
dfrm = ldply(body, rbind)
colnames(dfrm) <- c("file", "doc", "body")

#ajout d'une colonne file_id (oublié de l'extraire des XML...)
dfrm <-
  dfrm %>%
  mutate(file_id = str_extract(doc, "\\S+\\d+\\S+"))

#fichier texte contenant les codes eurovocs (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-acquis-eurovoc-descriptors.txt)
jrc_eurovoc <-
  read_delim(
    "C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-acquis-eurovoc-descriptors.txt",
    "\t",
    escape_double = FALSE,
    col_names = c("file_id", "X2", "eurovocs"),
    col_types = cols(X2 = col_skip()),
    trim_ws = TRUE
  )

jrc_eurovocs_tidy <-
  jrc_eurovoc %>%
  tidyr::separate_rows(eurovocs, sep = " ")

#fichier contenant le label des codes eurovoc en anglais (http://eurovoc.europa.eu/drupal/?q=fr/download/list_pt&cl=en)
listPt <-
  read_excel("C:/Users/ettor/Desktop/Eurovoc/listPt EurovocEN.xls")


#jointure entre eurovoc_tidy et la liste des labels
jrc_eurovocs_tidy <-
  jrc_eurovocs_tidy %>%
  left_join(listPt, by = c("eurovocs" = "ID"))
View(jrc_eurovocs_tidy)

#concatenation des lables dans une seule cellule, pour plus de lisibilité
library(data.table)
final_concat <-
  setDT(jrc_eurovocs_tidy)[, .(concat_eurovocs = paste(EN, collapse = " || ")), by = .(file_id)]

#on réunit le tout
final_merge <-
  dfrm %>%
  left_join(final_concat) %>%
  na.omit()

data.table::fwrite(final_merge,"final_merge.csv")
	library(XML)
	library(dplyr)
	library(stringr)
	library(readr)
	library(readxl)
	library(tidyr)

	#liste des fichiers XML du corpus JRC Acquis version anglaise (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-en.tgz)
	liste <-
	list.files(
	path = "C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-en/en",
	recursive = TRUE,
	ignore.case = FALSE,
	include.dirs = FALSE,
	full.names = TRUE
	)

	#fonction pour parser les TEI
	parseTei <- function(fichier, path, xmlelement) {
	text_parsed <- tryCatch({
	doc <- xmlParse(fichier, trim = FALSE)

	text <- unlist(xpathSApply(doc, path, xmlelement))

	text <- gsub("%quot%", '"', text)

	},
	error = function(cond) {
	message(paste("file does not seem to exist:", fichier))
	message("Here's the original error message:")
	message(cond)
	# Choose a return value in case of error
	return(NA)
	},
	warning = function(cond) {
	message(paste("file caused a warning:", fichier))
	message("Here's the original warning message:")
	message(cond)
	return(NULL)
	},
	finally = {
	message(paste("Processed file:", fichier))
	})
	return(c(fichier, text_parsed))
	}

	#récupération des titres, body and filename
	body <-
	lapply(liste,
	parseTei,
	path = '//body\|/TEI.2/teiHeader/fileDesc/titleStmt/title[1]',
	xmlelement = xmlValue)

	#list to dataframe
	library(plyr)
	dfrm = ldply(body, rbind)
	colnames(dfrm) <- c("file", "doc", "body")

	#ajout d'une colonne file_id (oublié de l'extraire des XML...)
	dfrm <-
	dfrm %>%
	mutate(file_id = str_extract(doc, "\\S+\\d+\\S+"))

	#fichier texte contenant les codes eurovocs (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-acquis-eurovoc-descriptors.txt)
	jrc_eurovoc <-
	read_delim(
	"C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-acquis-eurovoc-descriptors.txt",
	"\t",
	escape_double = FALSE,
	col_names = c("file_id", "X2", "eurovocs"),
	col_types = cols(X2 = col_skip()),
	trim_ws = TRUE
	)

	jrc_eurovocs_tidy <-
	jrc_eurovoc %>%
	tidyr::separate_rows(eurovocs, sep = " ")

	#fichier contenant le label des codes eurovoc en anglais (http://eurovoc.europa.eu/drupal/?q=fr/download/list_pt&cl=en)
	listPt <-
	read_excel("C:/Users/ettor/Desktop/Eurovoc/listPt EurovocEN.xls")


	#jointure entre eurovoc_tidy et la liste des labels
	jrc_eurovocs_tidy <-
	jrc_eurovocs_tidy %>%
	left_join(listPt, by = c("eurovocs" = "ID"))
	View(jrc_eurovocs_tidy)

	#concatenation des lables dans une seule cellule, pour plus de lisibilité
	library(data.table)
	final_concat <-
	setDT(jrc_eurovocs_tidy)[, .(concat_eurovocs = paste(EN, collapse = " \|\| ")), by = .(file_id)]

	#on réunit le tout
	final_merge <-
	dfrm %>%
	left_join(final_concat) %>%
	na.omit()

	data.table::fwrite(final_merge,"final_merge.csv")