Skip to content

Instantly share code, notes, and snippets.

@bdelbosc
Created June 28, 2013 08:43
Show Gist options
  • Save bdelbosc/5883371 to your computer and use it in GitHub Desktop.
Save bdelbosc/5883371 to your computer and use it in GitHub Desktop.
Dummy script that generates a Nuxeo SQL dump usable for mass import; the Nuxeo documents are created from DBpedia abstracts.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract dbpedia summaries and export Nuxeo SQL dump.
It requires dbpediakit https://github.com/ogrisel/dbpediakit.
"""
import sys
import random
import itertools
import csv
from datetime import datetime
from dbpediakit import fetch, extract_text
# Size of the generated data set: number of Folder documents and number of
# File documents.  Other sizes kept for reference (commented out):
#   NB_FOLDER = 50000         NB_FILE = 1000000
#   NB_FOLDER = 1000 * 1000   NB_FILE = 100 * 1000 * 1000
NB_FOLDER = 1000
NB_FILE = 5000
def getParentForDoc(doc_num):
    """Return ``doc_num`` modulo ``NB_FOLDER``.

    The result is used as a folder number, so documents are spread over the
    folders in round-robin order.
    """
    folder_num = doc_num % NB_FOLDER
    return folder_num
def getParentForFolder(i):
    """Return the number of the parent folder of folder ``i``.

    Folder 0 is the root and has no parent (``None``).  Folders 1 to 9 are
    children of folder 0.  For any other folder the parent number is ``i``
    with its last decimal digit dropped (e.g. 123 -> 12).
    """
    if i == 0:
        return None
    # Dropping the last decimal digit is an integer division by ten.
    return 0 if i < 10 else i // 10
def genHierarchyFolder(fid):
    """Print the tab-separated ``hierarchy`` row of folder number ``fid``.

    The row holds: folder id, parent folder id (the NULL marker for the
    root), name, isproperty flag, primary type and two NULL version fields.
    """
    null = '\\N'  # NULL marker, written as backslash + N
    folder_id = 'ffffffff-ffff-ffff-0000-%12.12d'
    parent = getParentForFolder(fid)
    parent_id = null if parent is None else folder_id % parent
    fields = (
        folder_id % fid,
        parent_id,
        'folder-%4.4d' % fid,
        'f',        # isproperty
        'Folder',   # primarytype
        null,       # no version numbers on folders
        null,
    )
    print('\t'.join(fields))
def genHierarchyDoc(fid, did):
    """Print the two ``hierarchy`` rows of one File document.

    The first row is the document itself (id built from ``did``, child of
    folder ``fid``, named ``file-<did // 2>``).  The second row is its
    ``content`` property node, whose id is built from ``did + 1`` and whose
    parent is the document.

    :param fid: number of the parent folder.
    :param did: number of the document.
    """
    minor = '0'
    major = '1'
    # NOTE(review): the values are written in the order (minor, major) while
    # the COPY header printed by genHierarchy lists (majorversion,
    # minorversion), so majorversion receives '0' and minorversion '1'.
    # Confirm that version 0.1 is what is wanted.
    print("dddddddd-dddd-dddd-0000-%12.12d\tffffffff-ffff-ffff-0000-%12.12d\tfile-%10.10d\t%s\t%s\t%s\t%s" % (
        did, fid, did // 2, 'f', 'File', minor, major))
    # The name column is the bare word "content": no backslash in front of
    # it, since a backslash starts an escape sequence in COPY text format.
    print("cccccccc-cccc-cccc-0000-%12.12d\tdddddddd-dddd-dddd-0000-%12.12d\t%s\t%s\t%s\t%s\t%s" % (
        did + 1, did, 'content', 't', 'content', minor, major))
def genHierarchy():
    """Print the COPY section of the ``hierarchy`` table.

    Every folder row is followed by the rows of its documents.  Each folder
    receives ``NB_FILE // NB_FOLDER`` documents; document numbers are even
    and increase across folders (0, 2, 4, ...).

    NOTE(review): when NB_FILE is not a multiple of NB_FOLDER, fewer than
    NB_FILE documents get a hierarchy row here.
    """
    print("COPY hierarchy (id, parentid, name, isproperty, primarytype, majorversion, minorversion) FROM stdin;")
    next_did = 0
    for fid in xrange(NB_FOLDER):
        genHierarchyFolder(fid)
        for _ in xrange(NB_FILE // NB_FOLDER):
            genHierarchyDoc(fid, next_did)
            next_did += 2
    # End-of-data marker of the COPY section.
    print("\\.")
def clean_text(text):
    """Return ``text`` as a UTF-8 encoded byte string without tabs.

    Byte strings are first decoded as UTF-8 (undecodable bytes are dropped),
    then the text is encoded back to UTF-8 and every tab is replaced by a
    space.
    """
    # Under Python 2 ``bytes`` is the same type as ``str``.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    encoded = text.encode('utf-8', 'ignore')
    return encoded.replace(b'\t', b' ')
def get_db_article_iterator(a, b, c):
    """Interleave three iterables: yield a[0], b[0], c[0], a[1], b[1], ...

    Iteration stops as soon as one of the three iterables is exhausted; an
    incomplete round is not yielded.
    """
    sources = [iter(a), iter(b), iter(c)]
    while True:
        try:
            # Take one item from each source, in order.
            batch = [next(src) for src in sources]
        except StopIteration:
            return
        for article in batch:
            yield article
def genDublincore():
    """Print the COPY section (CSV format) of the ``dublincore`` table.

    One row is written per document, filled from the next article of the
    module-level ``dbpedia`` iterator, then one row per folder with a
    generated title/description and a randomly chosen language.
    """
    print("COPY dublincore (id, creator, source, created, description, language, title) FROM stdin WITH CSV;")
    source = 'gendata'
    # Same timestamp for every row, cut to millisecond precision.
    created = datetime.utcnow().isoformat(' ')[:23]
    languages = ['fr', 'de', 'en']
    write_row = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC).writerow

    for index in xrange(NB_FILE):
        did = 2 * index  # document numbers are the even numbers
        article = next(dbpedia)
        creator = 'user' + str(did % 40 + 1)
        description = clean_text(article.text) + " DID_%d" % did
        title = clean_text(article.title)
        lang = article.lang
        doc_id = "dddddddd-dddd-dddd-0000-%12.12d" % did
        write_row((doc_id, creator, source, created, description, lang, title))

    for fid in xrange(NB_FOLDER):
        creator = 'user' + str(fid % 40 + 1)
        description = "description %d" % fid
        title = "title %d" % fid
        folder_id = "ffffffff-ffff-ffff-0000-%12.12d" % fid
        write_row((folder_id, creator, source, created, description,
                   random.choice(languages), title))

    # End-of-data marker of the COPY section.
    print("\\.")
def genContent():
    """Print the COPY section of the ``content`` table.

    One row per document, keyed by the id of its content node (odd
    numbers).  Every row carries the same length, data key and mime type;
    the two fields in between are written as NULL markers.
    """
    row = ("cccccccc-cccc-cccc-0000-%12.12d\tfile-%d.odt\t5980"
           "\tfecb537c49dc544e28b425e0b1c3e06b\t\\N\t\\N"
           "\tapplication/vnd.oasis.opendocument.text")
    print("COPY content FROM stdin;")
    for index in xrange(NB_FILE):
        # Content node numbers are the odd numbers 1, 3, 5, ...
        print(row % (2 * index + 1, index))
    print("\\.")
def genUid():
    """Print the COPY section of the ``uid`` table.

    One row per document, then one row per folder; the second field of
    every row is the NULL marker.
    """
    doc_row = "dddddddd-dddd-dddd-0000-%12.12d\t\\N"
    folder_row = "ffffffff-ffff-ffff-0000-%12.12d\t\\N"
    print("COPY uid FROM stdin;")
    for index in xrange(NB_FILE):
        print(doc_row % (2 * index))
    for fid in xrange(NB_FOLDER):
        print(folder_row % fid)
    print("\\.")
def genMisc():
    """Print the COPY section of the ``misc`` table.

    One row per document, then one row per folder; each row is the id
    followed by the constant values ``default`` and ``project``.
    """
    suffix = "\tdefault\tproject"
    print("COPY misc FROM stdin;")
    for index in xrange(NB_FILE):
        print("dddddddd-dddd-dddd-0000-%12.12d" % (2 * index) + suffix)
    for fid in xrange(NB_FOLDER):
        print("ffffffff-ffff-ffff-0000-%12.12d" % fid + suffix)
    print("\\.")
def genContributor():
    """Print the COPY section of the ``dc_contributors`` table.

    One row per document, then one row per folder.  Each row holds the id,
    position 0 and a user name ``user1`` .. ``user40`` derived from the
    document or folder number.
    """
    print("COPY dc_contributors FROM stdin;")
    for did in xrange(0, NB_FILE * 2, 2):
        user = 'user' + str(did % 40 + 1)
        print("dddddddd-dddd-dddd-0000-%12.12d\t0\t%s" % (did, user))
    for fid in xrange(NB_FOLDER):
        # Derive the user from the folder's own number so that it matches
        # the creator written for the same folder by genDublincore, and so
        # that this loop does not depend on the document loop having run.
        user = 'user' + str(fid % 40 + 1)
        print("ffffffff-ffff-ffff-0000-%12.12d\t0\t%s" % (fid, user))
    # End-of-data marker of the COPY section.
    print("\\.")
def genFile():
    """Print the COPY section of the ``file`` table.

    One row per document: its id and the file name ``file-<n>.odt`` where
    ``n`` is half the document number.
    """
    row = "dddddddd-dddd-dddd-0000-%12.12d\tfile-%d.odt"
    print("COPY file FROM stdin;")
    for index in xrange(NB_FILE):
        # Document numbers are even, so document 2 * index is file <index>.
        print(row % (2 * index, index))
    print("\\.")
def genCommon():
    """Print the COPY section of the ``common`` table.

    One row per document (odt icon), then one row per folder (folder icon);
    the last two fields of every row are NULL markers.
    """
    nulls = "\t\\N\t\\N"
    print("COPY common FROM stdin;")
    for index in xrange(NB_FILE):
        print("dddddddd-dddd-dddd-0000-%12.12d\t/icons/odt.png" % (2 * index) + nulls)
    for fid in xrange(NB_FOLDER):
        print("ffffffff-ffff-ffff-0000-%12.12d\t/icons/folder.gif" % fid + nulls)
    print("\\.")
def printDDL():
    """Print the CREATE TABLE statements of ``hierarchy`` and ``dublincore``."""
    ddl = """CREATE TABLE hierarchy (
id character varying(36) NOT NULL,
parentid character varying(36),
pos integer,
name character varying,
isproperty boolean,
primarytype character varying(250),
mixintypes character varying(250)[],
ischeckedin boolean,
baseversionid character varying(36),
majorversion bigint,
minorversion bigint,
isversion boolean
);
CREATE TABLE dublincore (
id character varying(36) NOT NULL,
creator character varying,
nature character varying,
source character varying,
created timestamp without time zone,
description character varying,
rights character varying,
valid timestamp without time zone,
format character varying,
issued timestamp without time zone,
modified timestamp without time zone,
language character varying,
coverage character varying,
expired timestamp without time zone,
lastcontributor character varying,
title character varying
);
"""
    print(ddl)
def printDDL2():
    """Print the CREATE TABLE statements of ``content``, ``uid``,
    ``dc_contributors`` and ``file``."""
    ddl = """CREATE TABLE content (
id character varying(36) NOT NULL,
name character varying,
length bigint,
data character varying(40),
"encoding" character varying,
digest character varying,
"mime-type" character varying
);
CREATE TABLE uid (
id character varying(36) NOT NULL,
uid character varying
);
CREATE TABLE dc_contributors (
id character varying(36) NOT NULL,
pos integer,
item character varying
);
CREATE TABLE file (
id character varying(36) NOT NULL,
filename character varying
);
"""
    print(ddl)
def printDDL3():
    """Print the CREATE TABLE statements of ``misc`` and ``common``."""
    ddl = """CREATE TABLE misc (
id character varying(36) NOT NULL,
lifecyclepolicy character varying(250),
lifecyclestate character varying(250)
);
CREATE TABLE common (
id character varying(36) NOT NULL,
icon character varying,
"icon-expanded" character varying,
size bigint
);
"""
    print(ddl)
def removeAll():
    """Print a DELETE statement for the ``hierarchy`` rows whose id starts
    with ``ffffff`` (the prefix of the generated folder ids)."""
    statement = "delete from hierarchy where id like 'ffffff%';"
    print(statement)
def imerge(items):
    """Yield all elements of each iterable in ``items``, one iterable after
    the other (a lazy concatenation, not an interleaving)."""
    for element in itertools.chain.from_iterable(items):
        yield element
# When true, the CREATE TABLE statements are printed before each group of
# COPY sections.
DDL = False

# One long-abstracts source per language (fetched through dbpediakit),
# turned into article iterators and read one language after the other.
dbp_files = [fetch('long_abstracts', lang=lang) for lang in ['fr', 'de', 'en']]
dbp_iters = [iter(extract_text(dbp_file)) for dbp_file in dbp_files]
dbpedia = imerge(dbp_iters)

# Group 1: hierarchy and dublincore.
if DDL:
    printDDL()
genHierarchy()
genDublincore()

# Group 2: content, uid, dc_contributors and file.
if DDL:
    printDDL2()
genContent()
genUid()
genContributor()
genFile()

# Group 3: misc and common.
if DDL:
    printDDL3()
genMisc()
genCommon()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment