Last active
August 29, 2015 14:12
-
-
Save rail44/37ea52adfbfeeb255e8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# coding: utf-8 | |
class LtsvConverter: | |
"""The class to create parsed view of table that has single column with LTSV. | |
This excepts that all lines have same order of fields. | |
client: client of BigQuery-Python | |
""" | |
def __init__(self, client, dataset='', table='', field='f0'): | |
self.client = client | |
self.project = client.project_id | |
self.dataset = dataset | |
self.table = table | |
self.field = field | |
def convert(self): | |
query = self._get_query() | |
body = { | |
'tableReference': { | |
'tableId': '{}_parsed'.format(self.table), | |
'projectId': self.project, | |
'datasetId': self.dataset | |
}, | |
"view": { | |
"query": query | |
} | |
} | |
self.client.bigquery.tables().insert(body=body, datasetId=self.dataset, projectId=self.project).execute() | |
def _get_line(self): | |
job_id, _results = self.client.query('SELECT {} FROM [{}.{}] LIMIT 1'.format(self.field, self.dataset, self.table)) | |
complete, row_count = self.client.check_job(job_id) | |
results = self.client.get_query_rows(job_id) | |
return results[0][self.field] | |
def _get_keys(self): | |
line = self._get_line() | |
return [kv.split(':')[0] for kv in [f for f in line.split("\t")]] | |
def _get_query(self): | |
keys = self._get_keys() | |
selects = ', '.join(['NTH(2, SPLIT({}, ":")) {}'.format(k, k) for k in keys[1:]]) | |
fields = ',\n'.join(['LEAD(fields, {}) OVER (PARTITION BY f0 ORDER BY pos) {}'.format(i+1, k) for i, k in enumerate(keys[1:])]) | |
return ''' | |
SELECT NTH(2, SPLIT(fields, ":")) {}, {} FROM ( | |
SELECT | |
fields, | |
pos, | |
{} | |
FROM FLATTEN ((SELECT f0, fields, POSITION(fields) pos FROM ( | |
SELECT SPLIT({}, '\\t') fields, {} f0 FROM [{}.{}]) | |
), fields) GROUP BY f0, fields, pos | |
) where pos=1; | |
'''.format(keys[0], selects, fields, self.field, self.field, self.dataset, self.table) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment