Skip to content

Instantly share code, notes, and snippets.

@rasmi
Created January 11, 2019 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rasmi/f1651b34c7cc620d4718044be8029a25 to your computer and use it in GitHub Desktop.
Save rasmi/f1651b34c7cc620d4718044be8029a25 to your computer and use it in GitHub Desktop.
# Copyright 2018 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Create BigQuery schema for GAIA data types."""
def infer_types_from_webpage():
"""Infer GAIA schema (fields, datatypes, and descriptions) from DR2 webpage.
Returns:
A dict mapping from field name to a tuple of (BQ dtype, description).
"""
import requests
from lxml import html
PAGE_URL = (
'https://gea.esac.esa.int/archive/documentation/GDR2/Gaia_archive/'
'chap_datamodel/sec_dm_main_tables/ssec_dm_gaia_source.html')
page = requests.get(PAGE_URL)
root = html.document_fromstring(page.text)
content = root.find_class('ltx_page_content')[0]
paragraphs = content.cssselect('p.ltx_p')
text_elements = [element.text_content() for element in paragraphs]
field_descriptions_list = [text for text in text_elements if ' : ' in text]
field_descriptions = {field: description
for field, description in (
field_description.split(' : ')
for field_description in field_descriptions_list)}
BQ_TYPE_PATTERNS = {
'(float': 'FLOAT64',
'(double': 'FLOAT64',
'(short': 'INT64',
'(int': 'INT64',
'(long': 'INT64',
'(byte': 'INT64',
'(string': 'STRING',
'(boolean': 'BOOL'
}
def find_bq_type(text):
"""Check if the text contains a data type that maps to a BQ data type.
Args:
text: Input text to search for data types.
Returns:
A string of the BQ data type, if it exists in the input text.
"""
for pattern, bq_type in BQ_TYPE_PATTERNS.items():
if pattern in text:
return bq_type
type_and_description = {
field: (find_bq_type(description), description)
for field, description in field_descriptions.items()}
return type_and_description
def infer_types_from_csv(filename):
"""Infer GAIA schema (fields and datatypes) from a sample CSV file.
Args:
filename: Filename of a sample CSV file of GAIA data to infer types from.
Returns:
A dict mapping from field name to a tuple of (BQ dtype, description).
In this case, description is the empty string ''.
"""
import numpy as np
import pandas as pd
data = pd.read_csv(filename)
NP_BQ_TYPES = {
np.dtype('float64'): 'FLOAT64',
np.dtype('int64'): 'INT64',
np.dtype('object'): 'STRING',
np.dtype('bool'): 'BOOL'
}
type_and_description = {
field: (NP_BQ_TYPES[dtype], '') for field, dtype in data.dtypes.items()}
return type_and_description
def to_bq_schema(type_and_description):
"""Convert a dict of field names, dtypes, and descriptions to BQ schema json.
Args:
type_and_description: A dict mapping from field name
to a tuple of (BQ dtype, description).
Returns:
A list of dicts of the form {'name', 'type', 'mode', 'description'}.
"""
MODE = 'NULLABLE'
bq_schema = [
{'name': field, 'type': dtype, 'mode': MODE, 'description': description}
for field, (dtype, description) in type_and_description.items()]
return bq_schema
if __name__ == '__main__':
import json
types = infer_types_from_webpage()
bq_schema = to_bq_schema(types)
with open('schema.json', 'w') as output:
output.write(json.dumps(bq_schema))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment