Created
May 29, 2022 15:32
-
-
Save aivuk/4b95c226b680264f0efae62aef0093da to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from frictionless import Detector
import line_profiler

# Module-level line profiler used by the code below.
lp = line_profiler.LineProfiler()
def test_schema_from_synthetic_sparse_sample(confidence):
    """Detect a schema from a synthetic sample and check the field types.

    Builds a three-column sample (integer, number, string). Each column
    holds a value of its declared type for the first ``conf`` fraction of
    the rows and a value of another type for the remaining rows. The sample
    is passed to ``Detector.detect_schema`` and the result is compared with
    the expected schema.

    Args:
        confidence: Value passed to ``Detector(field_confidence=...)``. A
            column is expected to keep its declared type only when its
            ``conf`` is at least this value; otherwise ``"any"`` is expected.

    Raises:
        AssertionError: If the detected schema differs from the expected one.
    """
    # For each type (integer, number, string) there is an example of
    # the type ("is") and an example of another type ("not").
    type_sample = {
        "integer": {"is": 1, "not": "string"},
        "number": {"is": 3.14, "not": "string"},
        "string": {"is": "string", "not": 1},
    }

    # Columns with their type and the fraction of rows holding that type.
    columns = [
        {"type": "integer", "conf": 0.7},
        {"type": "number", "conf": 1},
        {"type": "string", "conf": 1},
    ]

    def generate_rows(num_rows=100000, columns=()):
        """Return ``num_rows`` rows with one cell per entry in ``columns``."""
        # The default is an immutable tuple: a ``[]`` default would be one
        # list object shared by every call.
        # Row index below which each column still holds its own type.
        num_per_type = [num_rows * col["conf"] for col in columns]
        return [
            [
                type_sample[col["type"]]["is" if i < limit else "not"]
                for col, limit in zip(columns, num_per_type)
            ]
            for i in range(num_rows)
        ]

    sample = generate_rows(columns=columns)
    detector = Detector(field_confidence=confidence)
    # Derive the labels from the column list so the two cannot drift apart.
    labels = [f"field{i}" for i in range(1, len(columns) + 1)]
    # Register detect_schema with the module-level profiler ``lp``.
    lp.add_function(detector.detect_schema)
    schema = detector.detect_schema(sample, labels=labels)
    assert schema == {
        "fields": [
            {
                "name": label,
                "type": col["type"] if col["conf"] >= confidence else "any",
            }
            for label, col in zip(labels, columns)
        ],
    }
def wrapper_function():
    """Run the schema-detection test with a confidence of 0.71.

    Zero-argument entry point, so the call can be wrapped by the profiler.
    """
    test_schema_from_synthetic_sparse_sample(0.71)
# Wrap the entry point with the profiler, run it, then print the stats.
wrapper = lp(wrapper_function)
wrapper()
lp.print_stats()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from frictionless import Detector
def test_schema_from_synthetic_sparse_sample(confidence):
    """Detect a schema from a synthetic sample and check the field types.

    Builds a three-column sample (integer, number, string). Each column
    holds a value of its declared type for the first ``conf`` fraction of
    the rows and a value of another type for the remaining rows. The sample
    is passed to ``Detector.detect_schema`` and the result is compared with
    the expected schema.

    Args:
        confidence: Value passed to ``Detector(field_confidence=...)``. A
            column is expected to keep its declared type only when its
            ``conf`` is at least this value; otherwise ``"any"`` is expected.

    Raises:
        AssertionError: If the detected schema differs from the expected one.
    """
    # For each type (integer, number, string) there is an example of
    # the type ("is") and an example of another type ("not").
    type_sample = {
        "integer": {"is": 1, "not": "string"},
        "number": {"is": 3.14, "not": "string"},
        "string": {"is": "string", "not": 1},
    }

    # Columns with their type and the fraction of rows holding that type.
    columns = [
        {"type": "integer", "conf": 0.7},
        {"type": "number", "conf": 1},
        {"type": "string", "conf": 1},
    ]

    def generate_rows(num_rows=100000, columns=()):
        """Return ``num_rows`` rows with one cell per entry in ``columns``."""
        # The default is an immutable tuple: a ``[]`` default would be one
        # list object shared by every call.
        # Row index below which each column still holds its own type.
        num_per_type = [num_rows * col["conf"] for col in columns]
        return [
            [
                type_sample[col["type"]]["is" if i < limit else "not"]
                for col, limit in zip(columns, num_per_type)
            ]
            for i in range(num_rows)
        ]

    sample = generate_rows(columns=columns)
    detector = Detector(field_confidence=confidence)
    # Derive the labels from the column list so the two cannot drift apart.
    labels = [f"field{i}" for i in range(1, len(columns) + 1)]
    schema = detector.detect_schema(sample, labels=labels)
    assert schema == {
        "fields": [
            {
                "name": label,
                "type": col["type"] if col["conf"] >= confidence else "any",
            }
            for label, col in zip(labels, columns)
        ],
    }
# Run the check directly when the file is executed as a script.
test_schema_from_synthetic_sparse_sample(0.71)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment