@brandomr
Created July 13, 2023 13:54
MIT data profiling to TDS
import io
import pandas
import requests
##########################
# generate dataset metadata
# I assume this will come from the HMI during user upload
##########################
dataset = {
    "username": "Adam Smith",
    "name": "COVID-19 Forecast Hub Ground Truth Data",
    "description": "COVID-19 case incidents, hospitalization incidents and cumulative deaths provided by COVID-19 Forecast Hub.",
    "file_names": [
        "forecast_hub_demo_data.csv"
    ],
    "source": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
    "dataset_url": "https://github.com/reichlab/covid19-forecast-hub/"
}
# set openai key
with open('openai_key', 'r') as f:
    openai_key = f.read().strip()
# Assume you've got forecast_hub_demo_data.csv (my example file) loaded as a pandas
# dataframe (one way to do that is sketched below).
# The MIT extraction service works best with just the top few lines of a dataset;
# 3 or 4 lines seems to be the sweet spot.
# We're not sure how Uncharted will provide the CSV, but if it's large we don't want
# to read in the whole thing, so you may have to tweak this for efficiency.
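# A minimal sketch of loading just a sample, assuming the file is on local disk
# (read_csv's nrows argument reads only the first few rows):
df = pandas.read_csv('forecast_hub_demo_data.csv', nrows=4)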
buffer = io.StringIO()
df.head(4).to_csv(buffer, index=False)  # serialize only the sample rows
file_sample = buffer.getvalue()
# assume you've got some text document (e.g. about the data)
doc = 'my description about my data goes here'
######################################################
# Now we do the actual profiling!
######################################################
# Here we perform our first call to the MIT service
mit_url = 'http://100.26.10.46'
resp = requests.post(
    url=f"{mit_url}/annotation/link_dataset_col_to_dkg",
    params={"csv_str": file_sample, "doc": doc, "gpt_key": openai_key},
)
mit_groundings = resp.json()
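# Note: the shape of mit_groundings is inferred from how it's used below, not
# from a documented schema. It appears to be a dict keyed by column name, where
# each value carries a 'dkg_groundings' list of [identifier, label] pairs.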
# here we perform our 2nd call to the MIT service
resp = requests.post(
    url=f"{mit_url}/annotation/upload_file_extract/?gpt_key={openai_key}",
    files={"file": file_sample},
)
mit_annotations = {a['name']: a for a in resp.json()}
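# Likewise inferred from usage: each annotation record is assumed to include a
# 'name' (the column name) and a 'text_annotations' list of description strings.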
#######################################
# processing the results from MIT into the format
# expected by TDS
#######################################
columns = []
for c in df.columns:
    annotations = mit_annotations.get(c, {}).get("text_annotations", [])
    # Skip any single empty strings that are sometimes returned and drop extra
    # items that are sometimes included (usually the string 'class')
    groundings = {
        g[0]: g[1]
        for g in mit_groundings.get(c, {}).get('dkg_groundings', [])
        if g and isinstance(g, list)
    }
    col = {
        "name": c,
        "data_type": "float",  # hardcoded for this demo dataset
        "description": annotations[0].strip() if annotations else "",
        "annotations": [],
        "metadata": {},
        "grounding": {
            "identifiers": groundings,
        },
    }
    columns.append(col)
dataset['columns'] = columns
dataset['metadata'] = {
    "documents": [
        {
            "url": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
            "title": "README: Ground truth data for the COVID-19 Forecast Hub"
        }
    ]
}
#######################################
# adding dataset to TDS after profiling
#######################################
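# NOTE: tds_url isn't defined above; set it to your TDS instance. The value
# here is only a placeholder assumption.
tds_url = 'http://localhost:8000'  # placeholder, replace with a real TDS endpoint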
resp = requests.post(f"{tds_url}/datasets", json=dataset)
dataset_id = resp.json()['id']
# Let's get the pre-signed upload URL
query = {'filename': dataset['file_names'][0]}
resp = requests.get(f"{tds_url}/datasets/{dataset_id}/upload-url", params=query)
upload_url = resp.json()['url']
# now let's upload it
with open('forecast_hub_demo_data.csv', 'rb') as file:
    resp = requests.put(upload_url, data=file)
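# Optional sanity check; assumes the pre-signed upload responds with a
# standard HTTP status code on success.
resp.raise_for_status()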