Last active
December 16, 2023 15:18
-
-
Save nilayparikh/848f3519f216f19f5b3c31bae08fe8c9 to your computer and use it in GitHub Desktop.
Data Quality by Design: A Comprehensive Approach from Ingestion to Value Generation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime, timedelta

import pandas as pd
# Sample market options data
# Three option quotes for one symbol. Dates and timestamps are kept as plain
# strings; validate_recency() parses bidask_time with strptime.
options_data = pd.DataFrame({
    "symbol": ["XYZ", "XYZ", "XYZ"],
    "expiration": ["2023-01-27", "2023-02-17", "2023-01-20"],
    "strike": [35.00, 40.00, 45.00],
    "bid": [1.05, 2.15, 0.95],
    "ask": [1.15, 2.25, 1.05],
    "bidask_time": [
        "2023-01-01 09:30:10",
        "2023-01-01 09:31:05",
        "2023-01-01 09:32:01",
    ],
})
# Define validation rules
# Largest acceptable (ask - bid) before a quote is flagged.
MAX_BID_ASK_SPREAD = 0.15
# Oldest acceptable quote age before a quote is flagged as stale.
MAX_PRICE_TIME_STALE = 300  # seconds
# Rule-based validation check
def validate_spread(row, max_spread=None):
    """Check that a quote's bid-ask spread is well-formed and within bounds.

    Args:
        row: Object exposing numeric ``bid`` and ``ask`` attributes, such as
            a DataFrame row.
        max_spread: Largest acceptable ``ask - bid``. When omitted, the
            module-level ``MAX_BID_ASK_SPREAD`` is used.

    Returns:
        ``(True, None)`` when the quote passes, otherwise ``(False, reason)``
        where ``reason`` is a human-readable message.
    """
    import math

    if max_spread is None:
        max_spread = MAX_BID_ASK_SPREAD

    spread = row.ask - row.bid

    # NaN compares False against every threshold, so without this guard a
    # quote with a missing bid or ask would be reported as valid.
    if math.isnan(spread):
        return False, "Spread is undefined (missing bid or ask)"
    # A crossed quote gives a negative spread, which can never exceed a
    # positive threshold and would otherwise slip through.
    if spread < 0:
        return False, "Ask is below bid (crossed quote)"
    if spread > max_spread:
        return False, f"Spread exceeds threshold {max_spread}"
    return True, None
def validate_recency(row, now=None, max_age_seconds=None):
    """Check that a quote's timestamp is not older than the allowed age.

    Args:
        row: Object exposing ``bidask_time`` as a ``"%Y-%m-%d %H:%M:%S"``
            string.
        now: Reference time to measure the quote age against. Defaults to
            ``datetime.now()``; pass an explicit value for reproducible runs.
        max_age_seconds: Maximum allowed age in seconds. When omitted, the
            module-level ``MAX_PRICE_TIME_STALE`` is used.

    Returns:
        ``(True, None)`` when the quote is fresh enough, otherwise
        ``(False, reason)``.

    Raises:
        ValueError: If ``row.bidask_time`` does not match the expected format.
    """
    if now is None:
        now = datetime.now()
    if max_age_seconds is None:
        max_age_seconds = MAX_PRICE_TIME_STALE

    quoted_at = datetime.strptime(row.bidask_time, "%Y-%m-%d %H:%M:%S")
    # `datetime` in this file is the class, not the module, so the limit is
    # built from the separately imported `timedelta`.
    max_delta = timedelta(seconds=max_age_seconds)
    if now - quoted_at > max_delta:
        return False, f"Exceeds time threshold {max_age_seconds} seconds"
    return True, None
# Run both checks on every quote and report each failure; a single quote can
# fail both. itertuples() yields lightweight namedtuples whose fields are read
# by attribute, which is all the validators need, and avoids building a Series
# per row as iterrows() does.
for row in options_data.itertuples():
    index = row.Index

    spread_valid, spread_msg = validate_spread(row)
    if not spread_valid:
        print(f"Index {index} failed spread validation: {spread_msg}")

    recency_valid, recency_msg = validate_recency(row)
    if not recency_valid:
        print(f"Index {index} failed recency validation: {recency_msg}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from scipy import stats | |
# Sample historical bid-ask spread data: 365 normally distributed spreads,
# one per consecutive calendar day starting 2020-01-01.
spread_data = pd.DataFrame(
    np.random.normal(loc=0.05, scale=0.01, size=(365, 1)),
    columns=['spread'],
)
spread_data['date'] = pd.date_range(start='1/1/2020', periods=365)

# Calculate summary statistics by day.
# Aggregations are named by string: recent pandas versions warn when NumPy
# callables such as np.min are passed to agg().
daily_stats = spread_data.groupby('date').agg({
    'spread': ['min', 'max', 'mean', 'std']
})
print(daily_stats.head())

# This gives us baselines like:
#               spread
#                  min       max      mean       std
# date
# 2020-01-01  0.028315  0.061907  0.049962  0.005974
# 2020-01-02  0.028597  0.063787  0.049265  0.005467
# 2020-01-03  0.033288  0.063906  0.050097  0.005025
# 2020-01-04  0.034413  0.065865  0.050150  0.006619
# 2020-01-05  0.031862  0.062944  0.049715  0.005151
# NOTE(review): the sample frame above holds one row per date, so for it
# min, max and mean coincide and std is NaN. The illustrative output
# presumably came from data with several quotes per day - confirm.

# We can then use anomaly detection models like Z-scores to catch outliers.
# The baseline is the mean and standard deviation of the whole sample.
spread = spread_data['spread']
mean = spread.mean()
std = spread.std()
z = (spread - mean) / std
anomaly_threshold = 3
# Boolean mask, True where a spread lies more than three standard deviations
# from the sample mean.
anomalies = abs(z) > anomaly_threshold
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment