On utilise l'offre Cloud free de Neo4j Aura pour héberger l'instance du graphe.
La méthodologie adoptée consiste à d'abord faire tourner les requêtes Cypher dans une DB Neo4j locale, puis à transférer les données dans l'instance Cloud. Pour cela, 2 possibilités :
- utiliser les fonctionnalités d'export/import csv (export du graphe complet en csv -> dépôt du csv dans un emplacment accessible sur le web (un repo Github public par ex) -> import par requêtes Cypher depuis l'instance Cloud). Le code de l'ensemble des requêtes se trouvent à la fin de ce Gist.
- utiliser la libraire push-to-cloud.jar depuis la DB locale pour pusher directement les données (mais perso j'y suis pas arrivée)
Modalité d'alimentation : DB locale -> chargement des données par requêtes -> connexion à la BD distante sur Aura -> versemnt des données
Fichier de ppn unicas fourni par le CR 66
Contraintes d'unicité
CREATE CONSTRAINT ON (s:SudocUnica) ASSERT s.ppn IS UNIQUE
CREATE CONSTRAINT ON (s:SudocNotUnica) ASSERT s.ppn IS UNIQUE
CREATE CONSTRAINT ON (b:BnfPresselocale) ASSERT b.ark IS UNIQUE
CREATE CONSTRAINT ON (b:Bib) ASSERT b.rcr IS UNIQUE
CREATE CONSTRAINT ON (d:Dpt) ASSERT d.number IS UNIQUE
Départements (avec DOM-TOM)
CREATE
(:Dpt {name:"Bouches-du-Rhône",number:"13"}),
(:Dpt {name:"Vaucluse",number:"84"}),
(:Dpt {name:"Alpes-de-Haute-Provence",number:"04"}),
(:Dpt {name:"Hautes-Alpes",number:"05"}),
(:Dpt {name:"Guadeloupe",number:"971"}),
(:Dpt {name:"Martinique",number:"972"}),
(:Dpt {name:"Guyanne",number:"973"}),
(:Dpt {name:"La Réunion",number:"974"}),
(:Dpt {name:"Saint-Pierre-et-Miquelon",number:"975"}),
(:Dpt {name:"Mayotte",number:"976"}),
(:Dpt {name:"Saint-Barthélemy",number:"977"}),
(:Dpt {name:"Saint-Martin",number:"978"}),
(:Dpt {name:"Wallis-et-Futuna",number:"986"}),
(:Dpt {name:"Polynésie française",number:"987"}),
(:Dpt {name:"Nouvelle-Calédonie",number:"988"})
Bibliothèques (Idref)
WITH "https://www.idref.fr/services/iln2rcr/229" AS url
call apoc.load.xml(url,"/sudoc/query/result") yield value as libs UNWIND libs._children as bib WITH [attr IN bib._children WHERE attr._type IN ['rcr','shortname','latitude','longitude'] | attr._text] as data
CREATE (:Bib {rcr:data[0],name:data[1],latitude:data[2],longitude:data[3]})
Relations bibs -> dpts
MATCH (b:Bib),(d:Dpt) WHERE b.rcr STARTS WITH d.number CREATE (b)-[r:LOCATED]->(d)
Chargement initial
LOAD CSV WITH HEADERS FROM
'unicas_cr66.csv'
AS line FIELDTERMINATOR ';' UNWIND line.ppn as ppn CREATE (s:SudocUnica {ppn:ppn})
ou si on dépose le fichier dans un dépôt Git accessible par http(s)
LOAD CSV WITH HEADERS FROM
'https://raw.githubusercontent.com/gegedenice/divers-files/master/unicas_cr66.csv'
AS line FIELDTERMINATOR ';' UNWIND line.ppn as ppn CREATE (s:SudocUnica {ppn:ppn})
Métadonnées biblio Sudoc
MATCH (s:SudocUnica) UNWIND s.ppn as ppn call apoc.load.xml("https://www.sudoc.fr/"+ppn+".xml",null,{failOnError:false}) yield value as record
UNWIND record._children as fields
WITH s,fields,[attr IN fields._children WHERE attr.code IN ['a'] | attr._text] as data WHERE fields.tag IN ["011","200","309"] CALL apoc.create.setProperty(s, CASE fields.tag
WHEN "011" THEN "issn"
WHEN "200" THEN "titre"
WHEN "309" THEN "controle"
END, data[0]) YIELD node return node
Relations unicas -> bibs
MATCH (s:SudocUnica) UNWIND s.ppn as ppn call apoc.load.xml("https://www.sudoc.fr/services/multiwhere/"+ppn,"/sudoc/query/result/library/rcr",{failOnError:false}) yield value as item MATCH (b:Bib {rcr:item._text}) CREATE (s)-[:OWNED_BY]->(b)
Propriétés état de collection
MATCH (s:SudocUnica)-[rel:OWNED_BY]-(b) UNWIND s.ppn as ppn call apoc.load.xml("https://www.sudoc.fr/"+ppn+".xml",null,{failOnError:false}) yield value as record
UNWIND record._children as fields WITH rel,fields,[attr IN fields._children WHERE attr.code = 'r'| attr._text] as data WHERE fields.tag = "955" SET rel.etat_de_collection=data[0]
Bouches-du-Rhône (13) : 2230 titres
Vaucluse (84) : 502 titres
Alpes-de-Hautes-Provence (04) : 265
Guadeloupe (971) : 104 titres
Martinique (972): 102 titres
Guyanne (973) : 42 titres
La Réunion (974) : 148 titres
Mayotte (976) : 141 titres
Hautes-Alpes (05) : 0 titre
Saint-Pierre-et-Miquelon (975) :o titre
Saint-Barthélemy (977) : 0 titre
Saint-Martin (978) : 0 titre
Wallis-et-Futuna (986) : 0 titre
Polynésie française (987) : 0 titre
Nouvelle-Calédonie (988) : o titre
Bouches-du-Rhône : 1er lot
CALL apoc.load.html("http://presselocaleancienne.bnf.fr/cherche?&av=true&equRech1=mot&mot1=&typeRechCritere1=all&relation=et&equRech2=mot&mot2=&typeRechCritere2=all&type=tout&territoire=d13&siecle=&annee=&anneeFac=&page=1&taille=745&nombrePages=3",{a:"a"}) YIELD value UNWIND value.a as record WITH record MATCH (d:Dpt) WHERE d.number="13" WITH d,record WHERE record.attributes.href CONTAINS "/ark:/" MATCH (d) MERGE (b:BnfPresselocale {ark:record.attributes.href}) MERGE (b)-[rel:IS_ABOUT]->(d)
Bouches-du-Rhône : 2ème lot
CALL apoc.load.html("http://presselocaleancienne.bnf.fr/cherche?&av=true&equRech1=mot&mot1=&typeRechCritere1=all&relation=et&equRech2=mot&mot2=&typeRechCritere2=all&type=tout&territoire=d13&siecle=&annee=&anneeFac=&page=2&taille=745&nombrePages=3",{a:"a"}) YIELD value UNWIND value.a as record WITH record MATCH (d:Dpt) WHERE d.number="13" WITH d,record WHERE record.attributes.href CONTAINS "/ark:/" MATCH (d) MERGE (b:BnfPresselocale {ark:record.attributes.href}) MERGE (b)-[rel:IS_ABOUT]->(d)
Bouches-du-Rhône : 3ème lot
CALL apoc.load.html("http://presselocaleancienne.bnf.fr/cherche?&av=true&equRech1=mot&mot1=&typeRechCritere1=all&relation=et&equRech2=mot&mot2=&typeRechCritere2=all&type=tout&territoire=d13&siecle=&annee=&anneeFac=&page=3&taille=745&nombrePages=3",{a:"a"}) YIELD value UNWIND value.a as record WITH record MATCH (d:Dpt) WHERE d.number="13" WITH d,record WHERE record.attributes.href CONTAINS "/ark:/" MATCH (d) MERGE (b:BnfPresselocale {ark:record.attributes.href}) MERGE (b)-[rel:IS_ABOUT]->(d)
Vaucluse & Alpes-de-Haute-Provence
UNWIND ["84","04"] as dep CALL apoc.load.html("http://presselocaleancienne.bnf.fr/cherche?&av=true&equRech1=mot&mot1=&typeRechCritere1=all&relation=et&equRech2=mot&mot2=&typeRechCritere2=all&type=tout&territoire=d"+dep+"&siecle=&annee=&anneeFac=&page=1&taille=600",{a:"a"}) YIELD value UNWIND value.a as record WITH dep as dpt,record MATCH (d:Dpt) WHERE d.number=dpt WITH d,record WHERE record.attributes.href CONTAINS "/ark:/" MATCH (d) MERGE (b:BnfPresselocale {ark:record.attributes.href}) MERGE (b)-[rel:IS_ABOUT]->(d)
DOM-TOM
UNWIND ["971","972","973","974","976"] as dep CALL apoc.load.html("http://presselocaleancienne.bnf.fr/cherche?&av=true&equRech1=mot&mot1=&typeRechCritere1=all&relation=et&equRech2=mot&mot2=&typeRechCritere2=all&type=tout&territoire=d"+dep+"&siecle=&annee=&anneeFac=&page=1&taille=150",{a:"a"}) YIELD value UNWIND value.a as record WITH dep as dpt,record MATCH (d:Dpt) WHERE d.number=dpt WITH d,record WHERE record.attributes.href CONTAINS "/ark:/" MATCH (d) MERGE (b:BnfPresselocale {ark:record.attributes.href}) MERGE (b)-[rel:IS_ABOUT]->(d)
Vérification que les doubles liens presselocale -> dpts aient bien fonctionné
MATCH (b:BnfPresselocale)-[i:IS_ABOUT]->(d:Dpt) WITH b,count(i) as rel_cnt WHERE rel_cnt > 1 RETURN *
MATCH (b:BnfPresselocale) UNWIND b.ark as ark call apoc.load.xml("http://catalogue.bnf.fr/api/SRU?version=1.2&operation=searchRetrieve&query=bib.persistentid%20adj%20%22"+replace(ark,'/ark:/','ark:/')+"%22&recordSchema=unimarcxchange",null,{failOnError:false}) yield value as record UNWIND record._children[3]._children[0]._children[2]._children[0]._children as fields WITH b,fields,[attr IN fields._children WHERE attr.code IN ['a','d'] | attr._text] as data WHERE fields.tag IN ["011","200","210"] CALL apoc.create.setProperty(b, CASE fields.tag
WHEN "011" THEN "issn"
WHEN "200" THEN "titre"
WHEN "210" THEN "edition"
END, apoc.text.join(data, '|')) YIELD node return node
Vérification que tous les neouds presselocale aient été balayés
MATCH (p:BnfPresselocale) WHERE not exists(p.titre) return p
MATCH (b:BnfPresselocale) UNWIND b.ark as ark
call apoc.load.xml("http://catalogue.bnf.fr/api/SRU?version=1.2&operation=searchRetrieve&query=bib.persistentid%20adj%20%22"+replace(ark,'/ark:/','ark:/')+"%22&recordSchema=unimarcxchange",null,{failOnError:false}) yield value as record UNWIND record._children[3]._children[0]._children[2]._children[0]._children as fields WITH b,fields,[attr IN fields._children WHERE attr.code IN ['d','u'] | attr._text] as data WHERE fields.tag IN ["325"]
CREATE (b)-[:HAS_VERSION]->(n:Numerisation {etab:data[1],url:data[0]})
MATCH (s:SudocUnica),(b:BnfPresselocale) WHERE EXISTS(s.issn) AND EXISTS(b.issn) AND s.issn=b.issn CREATE (s)-[:SAME_AS]->(b)
Ajout noeuds & relations bibs <- sudoc non unicas -> presselocale (pour les titres de presse locale qui ont une notice équivalente dans le Sudoc sans que cette notice soit un unica)
Alignement notices Sudoc/BnF
MATCH (b:BnfPresselocale) WHERE NOT (b)<-[:SAME_AS]-() AND EXISTS(b.issn) UNWIND b.issn as issn CALL apoc.load.xml("http://www.sudoc.fr/services/issn2ppn/"+issn,"/sudoc/query/result",{failOnError:false}) YIELD value
UNWIND value._children[0]._text AS ppn with b,ppn,issn MATCH (b) WHERE b.issn = issn MERGE (s:SudocNotUnica {issn:issn,ppn:ppn,titre:b.titre}) MERGE (s)-[r:SAME_AS]->(b)
Localisation si bib du réseau du CR
MATCH (bnf:BnfPresselocale)<-[:SAME_AS]-(s:SudocNotUnica) UNWIND s.ppn as ppn CALL apoc.load.xml("http://www.sudoc.fr/services/multiwhere/"+ppn,"/sudoc/query/result",{failOnError:false}) YIELD value AS value WITH s,value UNWIND value._children as lib MATCH (b:Bib) WHERE lib._children[0]._text = b.rcr CREATE (s)-[r:OWNED_BY]->(b)
MATCH (n) SET n.degre=size( (n)-[]-() ) RETURN *
CALL apoc.graph.fromDB('cr_66', {}) YIELD graph
CALL apoc.export.csv.graph(
graph,
'exportedGraph_cr66.csv',
null
) YIELD file, nodes, relationships
RETURN file, nodes, relationships
Dépôt du csv dans https://github.com/gegedenice/divers-files.
On peut donc ré-importer les données dans une autre instance Neo4j pa rle protocole http https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":Dpt" CREATE (p:Dpt {initId:line._id,number:line.number,name:line.name,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":Bib" CREATE (p:Bib {initId:line._id,name:line.name,rcr:line.rcr,latitude:line.latitude,longitude:line.longitude,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":SudocUnica" CREATE (p:SudocUnica {initId:line._id,ppn:line.ppn,issn:line.issn,titre:line.titre,controle:line.controle,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":SudocNotUnica" CREATE (p:SudocNotUnica {initId:line._id,ppn:line.ppn,issn:line.issn,titre:line.titre,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":BnfPresselocale" CREATE (p:BnfPresselocale {initId:line._id,ark:line.ark,issn:line.issn,titre:line.titre,edition:line.edition,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE EXISTS(line._id) AND line._labels = ":Numerisation" CREATE (p:Numerisation {initId:line._id,url:line.url,etab:line.etab,degre:line.degre}) return p
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE line._type = "LOCATED" MATCH (b:Bib),(d:Dpt) WHERE b.initId=line._start AND d.initId=line._end CREATE (b)-[:LOCATED]->(d) return *
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE line._type = "OWNED_BY" MATCH (s),(b:Bib) WHERE s.initId=line._start AND b.initId=line._end CREATE (s)-[:OWNED_BY {etat_de_collection:line.etat_de_collection}]->(b) return *
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE line._type = "IS_ABOUT" MATCH (b:BnfPresselocale),(d:Dpt) WHERE b.initId=line._start AND d.initId=line._end CREATE (b)-[:IS_ABOUT]->(d) return *
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE line._type = "SAME_AS" MATCH (b:BnfPresselocale),(s) WHERE s.initId=line._start AND b.initId=line._end CREATE (s)-[:SAME_AS]->(b) return *
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/gegedenice/divers-files/master/exportedGraph_cr66.csv" AS line FIELDTERMINATOR ',' WITH line WHERE line._type = "HAS_VERSION" MATCH (b:BnfPresselocale),(n:Numerisation) WHERE b.initId=line._start AND n.initId=line._end CREATE (b)-[:HAS_VERSION]->(n) return *
MATCH (s:SudocUnica) where s.issn = "" REMOVE s.issn return *
MATCH (s:BnfPresselocale) where s.issn = "" REMOVE s.issn return *
MATCH (s:SudocUnica) where s.controle = "" REMOVE s.controle return *