Skip to content

Instantly share code, notes, and snippets.

@u1735067
Created January 20, 2016 00:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save u1735067/b06cec878f44f9a2d403 to your computer and use it in GitHub Desktop.
Save u1735067/b06cec878f44f9a2d403 to your computer and use it in GitHub Desktop.
ArangoDB : test d'import (unitaire et en masse, beaucoup plus rapide)
#!python3
import sys, glob, os, re, datetime, time, json
import traceback
import arango
from arango import Arango
# Client for the local ArangoDB server and handle on the 'jeu' database.
client = Arango(host="localhost", port=8529)
db = client.db('jeu')

# Document types handled by this script; one collection is created per type.
types = [
    'combinaisons',
    'masse-enjeu',
    'participants',
    'rapports-definitifs',
    'courses',
    'tirelire',
    'reunion',
    'pronostics-detailles',
    'performances-detaillees',
]

for t in types:
    try:
        db.create_collection(t)
    except Exception:
        # Best effort: a failure here is ignored (presumably the collection
        # already exists from a previous run -- TODO confirm and narrow to
        # the driver's specific exception). `Exception` rather than a bare
        # `except` so that Ctrl-C / SystemExit are no longer swallowed.
        continue

# Debug leftover: restrict the run to a single day.
#files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
# File names look like '<YYYYMMDD>_[R<reunion>-][C<course>-]<type>.json'.
# 'key' captures everything before the type, including its trailing separator.
# Raw strings: '\S' and '\.' are regex escapes, not Python string escapes.
file_mask = re.compile(
    r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)'
    r'(?P<type>\S+)\.json'
)


def process_file(file, data):
    """Turn the parsed JSON content of one data file into a document to import.

    The date, meeting ("reunion") number, race ("course") number and document
    type are read from the file name; depending on the type, the payload is
    unwrapped/wrapped and enriched with those values.

    Args:
        file: Path of the data file; only its base name is inspected.
        data: Object decoded from the file's JSON. For dict payloads the
            object is modified in place.

    Returns:
        The document, with 'dateYmd' (int) and '_key' (file-name prefix
        without its trailing separator) set.

    Raises:
        ValueError: If the file name does not match ``file_mask``, if the
            date part is not a valid calendar date, or if the type is unknown.
    """
    res = file_mask.match(os.path.basename(file))
    if res is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError('unexpected file name: ' + str(file))

    dateYmd = res.group('date')

    # The key group always ends with the separator that precedes the type.
    key = res.group('key')
    if key[-1:] in ('-', '_'):
        key = key[:-1]

    reunion = res.group('reunion')
    if reunion is not None:
        reunion = int(reunion)
    course = res.group('course')
    if course is not None:
        course = int(course)
    doc_type = res.group('type')

    # Epoch milliseconds, as a string, for midnight of the file date.
    # time.mktime interprets the date in the machine's local time zone.
    # (strftime('%s') is not available on Windows, hence this form.)
    day = datetime.datetime.strptime(dateYmd, '%Y%m%d')
    timestamp = str(int(time.mktime(day.timetuple()))) + '000'

    if doc_type == 'courses':
        data = data['programme']
    elif doc_type == 'reunion':
        data['date'] = data['dateReunion']
        data['numeroReunion'] = reunion
    elif doc_type in ('participants', 'pronostics-detailles'):
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000  # 1 h in ms; hard-coded (presumably CET -- TODO confirm)
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'masse-enjeu':
        # Payload is wrapped so the metadata can sit next to it.
        data = {'enjeu': data}
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'rapports-definitifs':
        data = {'rapports': data}
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'combinaisons':
        data['date'] = data['dateProgramme']
    elif doc_type == 'performances-detaillees':
        data['date'] = timestamp
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'tirelire':
        pass  # stored as is
    else:
        raise ValueError('type inconnu')

    data['dateYmd'] = int(dateYmd)
    data['_key'] = key
    return data
# Bulk import: for each document type, read the matching files, convert them
# with process_file() and send them to the collection in batches.

commit_interval = 5000  # number of documents per import request
i = 0                   # files handled so far, all types together
nb_files = len(glob.glob('data/*.json'))


def _percent(done, total):
    """Return done/total as an integer percentage; 100 when there is nothing to do."""
    return int(done / total * 100) if total else 100


def _progress(doc_type, done, total, done_all, total_all):
    """Build the progress line for one type and for the whole run."""
    return ('Done ' + doc_type + ' : ' + str(done) + '/' + str(total)
            + ' (' + str(_percent(done, total)) + '%) / '
            + str(done_all) + '/' + str(total_all)
            + ' (' + str(_percent(done_all, total_all)) + '%)')


for doc_type in types:
    j = 0  # files handled so far for this type
    collection = db.col(doc_type)
    files = sorted(glob.glob('data/*' + doc_type + '.json'), key=os.path.basename)
    docs = []
    for file in files:
        try:
            with open(file, 'r', encoding="utf-8") as f:
                data = json.load(f)
            docs.append(process_file(file, data))
            print('Read file ' + file)
            # Send once a full batch has accumulated. (The previous test on
            # the not-yet-incremented counter sent a 1-document batch first.)
            if len(docs) >= commit_interval:
                # Detach the batch first so a failed request is not re-sent
                # together with the following documents.
                batch, docs = docs, []
                print('Read ' + str(len(batch)) + ' files, sending')
                # NOTE(review): the result (details=True) is discarded, so
                # per-document import errors are not reported -- confirm this
                # is intended.
                collection.import_documents(batch, complete=False, details=True)
        except arango.exceptions.DocumentCreateError as e:
            if e.args and e.args[0] == 'cannot create document, unique constraint violated':
                print('Failure with ' + file + ' : already added')
            else:
                print('/!\\ Failure with ' + file + ' : ')
                raise
        except Exception:
            print('/!\\ Failure with ' + file + ' : ')
            raise
        i += 1
        j += 1
        if j % 500 == 0:
            print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)

    # End of this type: final progress line, then flush the remaining documents.
    print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)
    if docs:
        collection.import_documents(docs, complete=False, details=True)
'''
FOR i IN reunion
FILTER i.meteo != null
RETURN {'date': i.dateYmd, 'meteo': i.meteo}
https://www.arangodb.com/download/
! https://github.com/joowani/python-arango
https://github.com/saeschdivara/ArangoPy
https://docs.arangodb.com/cookbook/XCopyInstallWindows.html
http://127.0.0.1:8529/_db/jeu/_admin/aardvark/standalone.html#collection/combinaisons/20141123_R2-C8
http://api.mongodb.org/python/current/api/pymongo/database.html
https://stackoverflow.com/questions/15478127/remove-final-character-from-string-python
https://stackoverflow.com/questions/19801727/convert-datetime-to-unix-timestamp-and-convert-it-back-in-python
https://stackoverflow.com/questions/23086383/how-to-test-nonetype-in-python
https://docs.arangodb.com/HttpBulkImports/index.html
https://www.arangodb.com/2012/09/bulk-insert-benchmark-tool/
http://vschart.com/compare/arangodb/vs/mongodb/vs/couchbase
https://docs.arangodb.com/IndexHandling/Geo.html
http://fr.slideshare.net/arangodb/introduction-to-column-oriented-databases
https://dzone.com/articles/introducing-arangodb
https://dzone.com/articles/practical-tips-to-reduce-sql-server-database-table
http://blog.sqlauthority.com/2015/11/30/sql-server-practical-tips-to-reduce-sql-server-database-table-size-experts-opinion/
'''
#!python3
import sys, glob, os, re, datetime, time, json
import traceback
import arango
from arango import Arango
# Client for the local ArangoDB server and handle on the 'jeu' database.
client = Arango(host="localhost", port=8529)
db = client.db('jeu')

# One collection is created per document type.
for t in ['combinaisons', 'masse-enjeu', 'participants', 'rapports-definitifs',
          'courses', 'tirelire', 'reunion', 'pronostics-detailles',
          'performances-detaillees']:
    try:
        db.create_collection(t)
    except Exception:
        # Best effort: a failure here is ignored (presumably the collection
        # already exists from a previous run -- TODO confirm and narrow to
        # the driver's specific exception). `Exception` rather than a bare
        # `except` so that Ctrl-C / SystemExit are no longer swallowed.
        continue

# Every entry of the data directory, ordered by base name.
files = sorted(glob.glob('data/*'), key=os.path.basename)
# Debug leftover: restrict the run to a single day.
#files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
# File names look like '<YYYYMMDD>_[R<reunion>-][C<course>-]<type>.json'.
# 'key' captures everything before the type, including its trailing separator.
# Raw strings: '\S' and '\.' are regex escapes, not Python string escapes.
file_mask = re.compile(
    r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)'
    r'(?P<type>\S+)\.json'
)


def process_file(file, data):
    """Convert the parsed JSON content of one data file and insert it.

    The date, meeting ("reunion") number, race ("course") number and document
    type are read from the file name; depending on the type, the payload is
    unwrapped/wrapped and enriched with those values. The resulting document
    is created in the collection named after the type, through the
    module-level ``db`` handle.

    Args:
        file: Path of the data file; only its base name is inspected.
        data: Object decoded from the file's JSON. For dict payloads the
            object is modified in place.

    Raises:
        ValueError: If the file name does not match ``file_mask``, if the
            date part is not a valid calendar date, or if the type is unknown.
        Whatever ``create_document`` raises is propagated unchanged.
    """
    res = file_mask.match(os.path.basename(file))
    if res is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError('unexpected file name: ' + str(file))

    dateYmd = res.group('date')

    # The key group always ends with the separator that precedes the type.
    key = res.group('key')
    if key[-1:] in ('-', '_'):
        key = key[:-1]

    reunion = res.group('reunion')
    if reunion is not None:
        reunion = int(reunion)
    course = res.group('course')
    if course is not None:
        course = int(course)
    doc_type = res.group('type')

    # Epoch milliseconds, as a string, for midnight of the file date.
    # time.mktime interprets the date in the machine's local time zone.
    # (strftime('%s') is not available on Windows, hence this form.)
    day = datetime.datetime.strptime(dateYmd, '%Y%m%d')
    timestamp = str(int(time.mktime(day.timetuple()))) + '000'

    if doc_type == 'courses':
        data = data['programme']
    elif doc_type == 'reunion':
        data['date'] = data['dateReunion']
        data['numeroReunion'] = reunion
    elif doc_type in ('participants', 'pronostics-detailles'):
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000  # 1 h in ms; hard-coded (presumably CET -- TODO confirm)
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'masse-enjeu':
        # Payload is wrapped so the metadata can sit next to it.
        data = {'enjeu': data}
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'rapports-definitifs':
        data = {'rapports': data}
        data['date'] = timestamp
        data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'combinaisons':
        data['date'] = data['dateProgramme']
    elif doc_type == 'performances-detaillees':
        data['date'] = timestamp
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    elif doc_type == 'tirelire':
        pass  # stored as is
    else:
        raise ValueError('type inconnu')

    data['dateYmd'] = dateYmd  # kept as the 8-digit string from the file name
    data['_key'] = key

    collection = db.col(doc_type)
    collection.create_document(data)
# Per-document import: each file is read, converted and inserted on its own
# by process_file().
i = 0  # files handled so far
for file in files:
    try:
        with open(file, 'r', encoding="utf-8") as f:
            data = json.load(f)
        process_file(file, data)
        print('Done with ' + file)
    except arango.exceptions.DocumentCreateError as e:
        # A document with the same key is reported and skipped; anything
        # else aborts the run.
        if e.args and e.args[0] == 'cannot create document, unique constraint violated':
            print('Failure with ' + file + ' : already added')
        else:
            print('/!\\ Failure with ' + file + ' : ')
            raise
    except Exception:
        print('/!\\ Failure with ' + file + ' : ')
        raise
    i += 1
    if i % 500 == 0:
        print('Done: ' + str(i) + '/' + str(len(files))
              + ' (' + str(int(i / len(files) * 100)) + '%)', file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment