Created
June 28, 2013 08:43
-
-
Save bdelbosc/5883371 to your computer and use it in GitHub Desktop.
Dummy script that generates a Nuxeo SQL dump for mass import, creating Nuxeo documents from DBpedia content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
Extract dbpedia summaries and export Nuxeo SQL dump. | |
It requires dbpediakit https://github.com/ogrisel/dbpediakit. | |
""" | |
import sys | |
import random | |
import itertools | |
import csv | |
from datetime import datetime | |
from dbpediakit import fetch, extract_text | |
# Number of Folder and File documents to generate.
# Larger presets, kept for reference:
#   NB_FOLDER = 50000,       NB_FILE = 1000000
#   NB_FOLDER = 1000 * 1000, NB_FILE = 100 * 1000 * 1000
NB_FOLDER = 1000
NB_FILE = 5000
def getParentForDoc(doc_num):
    """Return the folder number that document ``doc_num`` maps to.

    The folder is the document number taken modulo the module-level
    ``NB_FOLDER`` folder count.
    """
    folder_count = NB_FOLDER
    return doc_num % folder_count
def getParentForFolder(i):
    """Return the parent folder number of folder ``i``.

    Folder 0 is the root and has no parent (``None`` is returned).
    Any other folder below 10 hangs directly under folder 0.  For larger
    numbers the parent is ``i`` with its last decimal digit removed
    (for example 123 -> 12).
    """
    if i == 0:
        return None
    # Removing the last decimal digit is a floor division by ten.
    return 0 if i < 10 else i // 10
def genHierarchyFolder(fid):
    """Print the tab-separated ``hierarchy`` row for folder number ``fid``.

    Fields, in order: folder id, parent folder id (the COPY NULL marker
    for the root folder), name, isproperty flag, primary type and two
    NULL version fields.
    """
    null = '\\N'
    parent = getParentForFolder(fid)
    if parent is None:
        parent_id = null
    else:
        parent_id = 'ffffffff-ffff-ffff-0000-%12.12d' % parent
    row = (
        "ffffffff-ffff-ffff-0000-%12.12d" % fid,
        parent_id,
        "folder-%4.4d" % fid,
        'f',
        'Folder',
        null,
        null,
    )
    print("\t".join(row))
def genHierarchyDoc(fid, did):
    """Print the two ``hierarchy`` rows describing document ``did``.

    The first row is the File document itself, filed under folder ``fid``.
    The second row is its ``content`` property node: its id is built from
    ``did + 1`` and its parent is the document.

    fid -- number of the folder the document belongs to.
    did -- document number; the file name uses ``did // 2``.
    """
    # NOTE(review): the values are emitted as (minor, major) while the COPY
    # header printed by genHierarchy lists (majorversion, minorversion).
    # The order is kept unchanged here; confirm which version is intended.
    minor = '0'
    major = '1'
    doc_id = "dddddddd-dddd-dddd-0000-%12.12d" % did

    document_row = (
        doc_id,
        "ffffffff-ffff-ffff-0000-%12.12d" % fid,
        # Explicit floor division: same result on Python 2, and no float
        # sneaks in on Python 3.
        "file-%10.10d" % (did // 2),
        'f',
        'File',
        minor,
        major,
    )
    print("\t".join(document_row))

    # The node name used to be written as "\content" (stray backslash in
    # the format string); emit the plain name instead of relying on COPY
    # dropping the unknown escape.
    content_row = (
        "cccccccc-cccc-cccc-0000-%12.12d" % (did + 1),
        doc_id,
        'content',
        't',
        'content',
        minor,
        major,
    )
    print("\t".join(content_row))
def genHierarchy():
    """Print the COPY section that fills the ``hierarchy`` table.

    Emits the COPY header, then for every folder its own row followed by
    the rows of the documents filed in it, and finally the end-of-data
    marker.  Document numbers are consecutive even numbers starting at 0.

    All ``NB_FILE`` documents get a row: when ``NB_FILE`` is not a multiple
    of ``NB_FOLDER`` the first folders each take one extra document, so the
    ids produced here match the ones the other tables emit.
    """
    print("COPY hierarchy (id, parentid, name, isproperty, primarytype, majorversion, minorversion) FROM stdin;")
    # Guard the division so an empty folder set does not raise.
    if NB_FOLDER:
        per_folder, extra = divmod(NB_FILE, NB_FOLDER)
    else:
        per_folder, extra = 0, 0
    doc_index = 0
    for fid in xrange(NB_FOLDER):
        genHierarchyFolder(fid)
        # The first `extra` folders absorb the remainder, one document each.
        doc_count = per_folder + (1 if fid < extra else 0)
        for _ in xrange(doc_count):
            genHierarchyDoc(fid, doc_index * 2)
            doc_index += 1
    print("\\.")
def clean_text(text):
    """Return ``text`` UTF-8 encoded, with every tab replaced by a space.

    A ``str`` argument is first decoded as UTF-8, silently dropping bytes
    that cannot be decoded; the value is then encoded back to UTF-8.
    """
    decoded = text.decode('utf-8', 'ignore') if isinstance(text, str) else text
    encoded = decoded.encode('utf-8', 'ignore')
    return encoded.replace('\t', ' ')
def get_db_article_iterator(a, b, c):
    """Interleave three iterables, yielding a[0], b[0], c[0], a[1], ...

    Iteration stops as soon as any of the three inputs is exhausted.
    """
    # itertools.izip only exists on Python 2; on Python 3 the built-in zip
    # is already lazy, so use it as the fallback.
    lazy_zip = getattr(itertools, 'izip', zip)
    for triple in lazy_zip(a, b, c):
        for item in triple:
            yield item
def genDublincore():
    """Print the CSV-formatted COPY section for the ``dublincore`` table.

    One row is written per document, taking description, language and
    title from the next article of the module-level ``dbpedia`` iterator.
    Then one row is written per folder, with a generated description and
    title and a randomly chosen language.  Every row shares the same
    ``created`` timestamp, taken once from the current UTC time.
    """
    print("COPY dublincore (id, creator, source, created, description, language, title) FROM stdin WITH CSV;")
    origin = 'gendata'
    # Keep the first 23 characters of the timestamp.
    stamp = datetime.utcnow().isoformat(' ')[:23]
    folder_languages = ['fr', 'de', 'en']
    out = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)

    for did in xrange(0, NB_FILE * 2, 2):
        article = dbpedia.next()
        author = 'user' + str(did % 40 + 1)
        # The DID_<n> suffix tags each description with its document number.
        summary = clean_text(article.text) + " DID_%d" % did
        heading = clean_text(article.title)
        out.writerow((
            "dddddddd-dddd-dddd-0000-%12.12d" % did,
            author, origin, stamp, summary, article.lang, heading,
        ))

    for fid in xrange(NB_FOLDER):
        author = 'user' + str(fid % 40 + 1)
        out.writerow((
            "ffffffff-ffff-ffff-0000-%12.12d" % fid,
            author, origin, stamp,
            "description %d" % fid,
            random.choice(folder_languages),
            "title %d" % fid,
        ))
    print("\\.")
def genContent():
    """Print the COPY section for the ``content`` table.

    One row is written per document; the row id is the odd number that
    follows the document's even number, and the file name carries the
    document's ordinal.  Length, data and mime type are fixed values.
    """
    print("COPY content FROM stdin;")
    row = "cccccccc-cccc-cccc-0000-%12.12d\tfile-%d.odt\t5980\tfecb537c49dc544e28b425e0b1c3e06b\t\\N\t\\N\tapplication/vnd.oasis.opendocument.text"
    for ordinal in xrange(NB_FILE):
        # Content ids are the odd numbers 1, 3, 5, ...
        print(row % (ordinal * 2 + 1, ordinal))
    print("\\.")
def genUid():
    """Print the COPY section for the ``uid`` table.

    Every document and then every folder gets one row whose second field
    is the COPY NULL marker.
    """
    print("COPY uid FROM stdin;")
    sections = (
        ("dddddddd-dddd-dddd-0000-%12.12d\t\\N", xrange(0, NB_FILE * 2, 2)),
        ("ffffffff-ffff-ffff-0000-%12.12d\t\\N", xrange(NB_FOLDER)),
    )
    for template, numbers in sections:
        for number in numbers:
            print(template % number)
    print("\\.")
def genMisc():
    """Print the COPY section for the ``misc`` table.

    Every document and then every folder gets one row with the fixed
    values ``default`` and ``project``.
    """
    print("COPY misc FROM stdin;")
    sections = (
        ("dddddddd-dddd-dddd-0000-%12.12d\tdefault\tproject", xrange(0, NB_FILE * 2, 2)),
        ("ffffffff-ffff-ffff-0000-%12.12d\tdefault\tproject", xrange(NB_FOLDER)),
    )
    for template, numbers in sections:
        for number in numbers:
            print(template % number)
    print("\\.")
def genContributor():
    """Print the COPY section for the ``dc_contributors`` table.

    Every document and then every folder gets a single contributor row at
    position 0.  The contributor is ``user<N>`` with N cycling from 1 to 40,
    derived from the row's own document or folder number.
    """
    print("COPY dc_contributors FROM stdin;")
    for did in xrange(0, NB_FILE * 2, 2):
        user = 'user' + str(did % 40 + 1)
        print("dddddddd-dddd-dddd-0000-%12.12d\t0\t%s" % (did, user))
    for fid in xrange(NB_FOLDER):
        # Use the folder's own number: the previous code reused the
        # leftover ``did`` of the loop above, giving every folder the same
        # user (and failing when there are no documents at all).
        user = 'user' + str(fid % 40 + 1)
        print("ffffffff-ffff-ffff-0000-%12.12d\t0\t%s" % (fid, user))
    print("\\.")
def genFile():
    """Print the COPY section for the ``file`` table.

    One row per document: the document id (even numbers from 0) and a
    file name built from the document's ordinal.
    """
    print("COPY file FROM stdin;")
    for ordinal in xrange(NB_FILE):
        # Document ids are the even numbers 0, 2, 4, ...
        print("dddddddd-dddd-dddd-0000-%12.12d\tfile-%d.odt" % (ordinal * 2, ordinal))
    print("\\.")
def genCommon():
    """Print the COPY section for the ``common`` table.

    Every document row carries the odt icon and every folder row the
    folder icon; the two remaining fields are the COPY NULL marker.
    """
    print("COPY common FROM stdin;")
    sections = (
        ("dddddddd-dddd-dddd-0000-%12.12d\t/icons/odt.png\t\\N\t\\N", xrange(0, NB_FILE * 2, 2)),
        ("ffffffff-ffff-ffff-0000-%12.12d\t/icons/folder.gif\t\\N\t\\N", xrange(NB_FOLDER)),
    )
    for template, numbers in sections:
        for number in numbers:
            print(template % number)
    print("\\.")
def printDDL():
    """Print the CREATE TABLE statements for ``hierarchy`` and ``dublincore``."""
    ddl = """CREATE TABLE hierarchy (
id character varying(36) NOT NULL,
parentid character varying(36),
pos integer,
name character varying,
isproperty boolean,
primarytype character varying(250),
mixintypes character varying(250)[],
ischeckedin boolean,
baseversionid character varying(36),
majorversion bigint,
minorversion bigint,
isversion boolean
);
CREATE TABLE dublincore (
id character varying(36) NOT NULL,
creator character varying,
nature character varying,
source character varying,
created timestamp without time zone,
description character varying,
rights character varying,
valid timestamp without time zone,
format character varying,
issued timestamp without time zone,
modified timestamp without time zone,
language character varying,
coverage character varying,
expired timestamp without time zone,
lastcontributor character varying,
title character varying
);
"""
    print(ddl)
def printDDL2():
    """Print the CREATE TABLE statements for ``content``, ``uid``,
    ``dc_contributors`` and ``file``."""
    ddl = """CREATE TABLE content (
id character varying(36) NOT NULL,
name character varying,
length bigint,
data character varying(40),
"encoding" character varying,
digest character varying,
"mime-type" character varying
);
CREATE TABLE uid (
id character varying(36) NOT NULL,
uid character varying
);
CREATE TABLE dc_contributors (
id character varying(36) NOT NULL,
pos integer,
item character varying
);
CREATE TABLE file (
id character varying(36) NOT NULL,
filename character varying
);
"""
    print(ddl)
def printDDL3():
    """Print the CREATE TABLE statements for ``misc`` and ``common``."""
    ddl = """CREATE TABLE misc (
id character varying(36) NOT NULL,
lifecyclepolicy character varying(250),
lifecyclestate character varying(250)
);
CREATE TABLE common (
id character varying(36) NOT NULL,
icon character varying,
"icon-expanded" character varying,
size bigint
);
"""
    print(ddl)
def removeAll():
    """Print the SQL statement deleting hierarchy rows whose id starts with 'ffffff'."""
    statement = "delete from hierarchy where id like 'ffffff%';"
    print(statement)
def imerge(items):
    """Yield every element of each iterable in ``items``, one iterable
    after the other (the inputs are concatenated, not interleaved)."""
    for element in itertools.chain.from_iterable(items):
        yield element
# When true, the CREATE TABLE statements are printed ahead of each group of
# COPY sections.
DDL = False

# Fetch the long-abstracts dump for each language, then chain the article
# iterators one after the other into the stream read by genDublincore.
dbp_files = [fetch('long_abstracts', lang=lang) for lang in ['fr', 'de', 'en']]
dbp_iters = [iter(extract_text(dbp_file)) for dbp_file in dbp_files]
dbpedia = imerge(dbp_iters)

# Each entry pairs a DDL printer with the generators of the tables it
# declares; comment an entry out to skip that part of the dump.
for print_ddl, generators in (
        (printDDL, (genHierarchy, genDublincore)),
        (printDDL2, (genContent, genUid, genContributor, genFile)),
        (printDDL3, (genMisc, genCommon)),
):
    if DDL:
        print_ddl()
    for generate in generators:
        generate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment