Skip to content

Instantly share code, notes, and snippets.

@netj
Forked from quickbreach/mysqlToJson.py
Last active August 11, 2021 23:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save netj/96560d35c6e617fc1b3f21da43f06b2c to your computer and use it in GitHub Desktop.
Save netj/96560d35c6e617fc1b3f21da43f06b2c to your computer and use it in GitHub Desktop.
Convert mysqldump to JSON, optional upload to elastic
import re
import csv
import json
import sys
import requests
from requests.auth import HTTPBasicAuth
import hashlib
'''
Usage: 'python3 mysqlToJson.py <targetFile.sql>
About:
Parses an '.sql' file from mysqldump and converts it to JSON for elastic or other document DBS.
The mysqlFileToIter function parses the sql file into into python objects & arrays, which can be converted into JSON with json.dumps(arrayOfObjects)
Each resulting object represents a row from a table in the original SQL.
mysqlFileToIter adds the fields 'sqltojson_table_name' and 'sqltojson_source_file' to each document. keep or delete them as you wish, ie.
del(document['sqltojson_table_name'])
del(document['sqltojson_source_file'])
'''
# The brains of the operation
def mysqlFileToIter(sqlFile):
tac = dict()
currentTable = ''
inTableStructure = False
#
for line in open(sqlFile, "r"):
#
if re.search('^CREATE TABLE `\S.+`', line) != None:
inTableStructure = True
currentTable = re.search('^CREATE TABLE `\S.+`', line).group(0).replace('CREATE TABLE ', '').replace('`', '')
tac[currentTable] = []
continue
#
if inTableStructure:
#
if line.find('PRIMARY KEY') > -1 or line.find(' KEY `') > -1:
continue
# Create a map of each table name and it's associated columns. Each column name is preceeded by two spaces
if re.search('^\ \ `\S.+`', line) != None:
columnName = re.search('^\ \ `\S.+`', line).group(0).strip().replace('`', '')
tac[currentTable].append(columnName)
# This tells us that we are no longer in table description
if re.search('^\)', line) != None:
inTableStructure = False
continue
#
if re.search('^INSERT INTO `' + currentTable + '` VALUES', line) != None:
line = line[line.find("("):]
entriesArray = line.split("),(")
# Remove the semicolon at the end of the last entry
if (entriesArray[-1][-1] == ';'):
entriesArray[-1] = entriesArray[-1][:-1]
# iterate through the table rows
for entry in entriesArray:
try:
tmp = dict()
# Carve apart the row with the CSV parser
cvp = list(csv.reader([entry], quotechar="'", delimiter=',', escapechar='\\'))[0]
# Create
for x in range(0, len(cvp)):
columnName = tac[currentTable][x]
tmp[columnName] = cvp[x]
tmp['sqltojson_table_name'] = currentTable.replace(");", "")
tmp['sqltojson_source_file'] = sqlFile
yield tmp
except Exception as e:
print("Exception in line-49 loop: " + str(e), file=sys.stderr)
raise
# Takes in the array from sqlFileToArray, then converts each document in it to JSON and posts it to elastic
def sample_postToElastic(document):
indexName = ""
headers = {
'Content-Type': 'application/json'
}
# Choose your own adventure: you pick what index the data goes into. I'm recreating the speration of tables here
indexName = document['sqltojson_table_name']
# Helps avoid duplicate entries, especially if you run the job multiple times
s = hashlib.sha1()
s.update(json.dumps(document).encode('utf-8'))
docId = s.hexdigest()
# Posts the document to an index with the same name as the original table
resp = requests.post('http://127.0.0.1:9200/' + indexName + '/_doc/' + str(docId), auth = HTTPBasicAuth('username', 'password'), headers = headers, data = json.dumps(document))
if(resp.status_code != 200 and resp.status_code != 201):
# Something went wrong
print("Elastic returned non-successful response")
print(resp.__dict__)
exit(-1)
if len(sys.argv) < 2:
print("Usage: python3 mysqlToJson.py <targetFile.sql>")
exit(0)
# Convert the sql file to an array of JSON documents. Each will have an added 'sqltojson_table_name' field to identify which "table" it belongs to
convertedDb = mysqlFileToIter(sys.argv[1])
# Add each document to elastic. modify the sample_postToElastic to fit your needs
for document in convertedDb:
'''
# Modify the object as you wish here, ie add an '_id' field
s = hashlib.sha1()
s.update(json.dumps(document).encode('utf-8'))
document['_id'] = s.hexdigest()
'''
# Prints out the converted document
print(json.dumps(document))
# Add the document to your db
# sample_postToElastic(document)
# Modify sample_postToElastic for your elastic stack, or use it as guide for other NoSQL document DBs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment