Skip to content

Instantly share code, notes, and snippets.

@aivuk
Created May 29, 2022 15:32
Show Gist options
  • Save aivuk/4b95c226b680264f0efae62aef0093da to your computer and use it in GitHub Desktop.
Save aivuk/4b95c226b680264f0efae62aef0093da to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from frictionless import Detector
import line_profiler
# Shared LineProfiler instance: functions are registered on it and the
# driver at the bottom of the script is wrapped with it.
lp = line_profiler.LineProfiler()
def test_schema_from_synthetic_sparse_sample(confidence):
    """Detect a schema from a synthetic sample and check it against ``confidence``.

    A three-column sample is generated in which each column holds values of
    its nominal type for a fraction ``conf`` of the rows and values of another
    type for the remaining rows.  The schema detected with
    ``Detector(field_confidence=confidence)`` is asserted to report a column's
    nominal type when ``conf >= confidence`` and ``"any"`` otherwise.

    ``detector.detect_schema`` is registered on the module-level ``lp``
    profiler before it is called.

    Args:
        confidence: Value passed to ``Detector`` as ``field_confidence`` and
            used as the threshold in the expected schema.

    Raises:
        AssertionError: If the detected schema differs from the expected one.
    """
    # For each type (integer, number, string) there is an example value of
    # the type ("is") and an example value of another type ("not").
    type_sample = {
        "integer": {"is": 1, "not": "string"},
        "number": {"is": 3.14, "not": "string"},
        "string": {"is": "string", "not": 1},
    }
    # Columns with their type and the fraction of rows holding that type.
    columns = [
        {"type": "integer", "conf": 0.7},
        {"type": "number", "conf": 1},
        {"type": "string", "conf": 1},
    ]

    def generate_rows(num_rows=100000, columns=None):
        """Build ``num_rows`` rows with one cell per entry of ``columns``.

        For each column, rows whose index is below ``num_rows * conf`` get
        the "is" example of the column type; the rest get the "not" example.
        """
        # None instead of a shared mutable [] default.
        columns = [] if columns is None else columns
        # Per-column row-index threshold separating "is" rows from "not" rows.
        num_per_type = [num_rows * c["conf"] for c in columns]
        return [
            [
                type_sample[col["type"]]["is" if i < limit else "not"]
                for col, limit in zip(columns, num_per_type)
            ]
            for i in range(num_rows)
        ]

    sample = generate_rows(columns=columns)
    detector = Detector(field_confidence=confidence)
    # Derive the labels from the columns so the two cannot drift apart.
    labels = [f"field{i}" for i in range(1, len(columns) + 1)]
    lp.add_function(detector.detect_schema)
    schema = detector.detect_schema(sample, labels=labels)
    assert schema == {
        "fields": [
            {
                "name": label,
                "type": col["type"] if col["conf"] >= confidence else "any",
            }
            for label, col in zip(labels, columns)
        ],
    }
def wrapper_function():
    """Run the synthetic sparse-sample schema check with confidence 0.71."""
    confidence = 0.71
    test_schema_from_synthetic_sparse_sample(confidence)
# Wrap the driver with the profiler, run it once, then print the
# statistics collected by the profiler.
wrapper = lp(wrapper_function)
wrapper()
lp.print_stats()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from frictionless import Detector
def test_schema_from_synthetic_sparse_sample(confidence):
    """Detect a schema from a synthetic sample and check it against ``confidence``.

    A three-column sample is generated in which each column holds values of
    its nominal type for a fraction ``conf`` of the rows and values of another
    type for the remaining rows.  The schema detected with
    ``Detector(field_confidence=confidence)`` is asserted to report a column's
    nominal type when ``conf >= confidence`` and ``"any"`` otherwise.

    Args:
        confidence: Value passed to ``Detector`` as ``field_confidence`` and
            used as the threshold in the expected schema.

    Raises:
        AssertionError: If the detected schema differs from the expected one.
    """
    # For each type (integer, number, string) there is an example value of
    # the type ("is") and an example value of another type ("not").
    type_sample = {
        "integer": {"is": 1, "not": "string"},
        "number": {"is": 3.14, "not": "string"},
        "string": {"is": "string", "not": 1},
    }
    # Columns with their type and the fraction of rows holding that type.
    columns = [
        {"type": "integer", "conf": 0.7},
        {"type": "number", "conf": 1},
        {"type": "string", "conf": 1},
    ]

    def generate_rows(num_rows=100000, columns=None):
        """Build ``num_rows`` rows with one cell per entry of ``columns``.

        For each column, rows whose index is below ``num_rows * conf`` get
        the "is" example of the column type; the rest get the "not" example.
        """
        # None instead of a shared mutable [] default.
        columns = [] if columns is None else columns
        # Per-column row-index threshold separating "is" rows from "not" rows.
        num_per_type = [num_rows * c["conf"] for c in columns]
        return [
            [
                type_sample[col["type"]]["is" if i < limit else "not"]
                for col, limit in zip(columns, num_per_type)
            ]
            for i in range(num_rows)
        ]

    sample = generate_rows(columns=columns)
    detector = Detector(field_confidence=confidence)
    # Derive the labels from the columns so the two cannot drift apart.
    labels = [f"field{i}" for i in range(1, len(columns) + 1)]
    schema = detector.detect_schema(sample, labels=labels)
    assert schema == {
        "fields": [
            {
                "name": label,
                "type": col["type"] if col["conf"] >= confidence else "any",
            }
            for label, col in zip(labels, columns)
        ],
    }
# Run the check once at import/script time with confidence=0.71.
test_schema_from_synthetic_sparse_sample(0.71)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment