Skip to content

Instantly share code, notes, and snippets.

@durandom
Created June 26, 2020 09:14
Show Gist options
  • Save durandom/f8c2dc12a487a0224a9d41f088ed261f to your computer and use it in GitHub Desktop.
Save durandom/f8c2dc12a487a0224a9d41f088ed261f to your computer and use it in GitHub Desktop.
infer hive schema from parquet file
def hive_schema(filename):
nl = ',\n'
diag_parq = fastparquet.ParquetFile(filename)
data = diag_parq.schema.text
schema = filter(None, data.split('\n')[1:])
schema = [re.sub(r'^[^a-z]+', ' ', l) for l in schema]
schema = [re.sub(r':', '', l) for l in schema]
schema = [re.sub(r'BYTE_ARRAY.*', 'STRING', l) for l in schema]
schema = [re.sub(r'INT64, TIMESTAMP_MICROS.*', 'TIMESTAMP', l) for l in schema]
schema = [re.sub(r'INT64, TIMESTAMP_MILLIS.*', 'TIMESTAMP', l) for l in schema]
schema = [re.sub(r'INT64, TIME_MICROS.*', 'BIGINT', l) for l in schema]
schema = [re.sub(r'INT64.*', 'BIGINT', l) for l in schema]
schema = [re.sub(r'DOUBLE.*', 'DOUBLE', l) for l in schema]
schema = [re.sub(r'BOOLEAN.*', 'BOOLEAN', l) for l in schema]
table = re.sub(r'([^.]+)\..*', r'\1', filename)
template = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS <TABLE_NAME>.{table} (
{nl.join(schema)}
)
STORED AS PARQUET
LOCATION 's3a://<SECRET_KEY>:<SECRET>@<BUCKET>/<PATH>/';
"""
return template
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment