Skip to content

Instantly share code, notes, and snippets.

@nilayparikh
Last active December 16, 2023 15:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nilayparikh/848f3519f216f19f5b3c31bae08fe8c9 to your computer and use it in GitHub Desktop.
Save nilayparikh/848f3519f216f19f5b3c31bae08fe8c9 to your computer and use it in GitHub Desktop.
Data Quality by Design: A Comprehensive Approach from Ingestion to Value Generation
import pandas as pd
from datetime import datetime
# Sample market options data: one tuple per quoted contract, in the column
# order given by `columns` below.
options_data = pd.DataFrame(
    [
        ("XYZ", "2023-01-27", 35.00, 1.05, 1.15, "2023-01-01 09:30:10"),
        ("XYZ", "2023-02-17", 40.00, 2.15, 2.25, "2023-01-01 09:31:05"),
        ("XYZ", "2023-01-20", 45.00, 0.95, 1.05, "2023-01-01 09:32:01"),
    ],
    columns=["symbol", "expiration", "strike", "bid", "ask", "bidask_time"],
)

# Thresholds used by the validation rules.
MAX_BID_ASK_SPREAD = 0.15   # widest acceptable ask - bid
MAX_PRICE_TIME_STALE = 300  # oldest acceptable quote age, in seconds
# Rule-based validation check
def validate_spread(row, *, max_spread=None):
    """Check that a quote's bid-ask spread does not exceed the allowed maximum.

    Args:
        row: Any object exposing numeric ``bid`` and ``ask`` attributes
            (for example a row yielded by ``DataFrame.iterrows()``).
        max_spread: Largest acceptable ``ask - bid``. When omitted, the
            module-level ``MAX_BID_ASK_SPREAD`` is used.

    Returns:
        ``(True, None)`` when the spread is within the limit, otherwise
        ``(False, message)`` where ``message`` names the threshold.
    """
    limit = MAX_BID_ASK_SPREAD if max_spread is None else max_spread
    # Prices are binary floats, so a spread that is nominally equal to the
    # limit (e.g. 1.30 - 1.15 against 0.15) can evaluate a hair above it.
    # The small tolerance keeps rounding noise from failing such quotes.
    tolerance = 1e-9
    if row.ask - row.bid > limit + tolerance:
        return False, f"Spread exceeds threshold {limit}"
    return True, None
def validate_recency(row, *, now=None, max_stale_seconds=None):
    """Check that a quote's timestamp is not older than the staleness limit.

    Args:
        row: Any object exposing a ``bidask_time`` attribute, a string in
            ``"%Y-%m-%d %H:%M:%S"`` format.
        now: Naive ``datetime`` to measure the quote's age against. When
            omitted, ``datetime.now()`` is used.
        max_stale_seconds: Largest acceptable age in seconds. When omitted,
            the module-level ``MAX_PRICE_TIME_STALE`` is used.

    Returns:
        ``(True, None)`` when the quote is at most the limit old, otherwise
        ``(False, message)`` where ``message`` names the threshold.

    Raises:
        ValueError: If ``bidask_time`` does not match the expected format.
    """
    limit = MAX_PRICE_TIME_STALE if max_stale_seconds is None else max_stale_seconds
    reference = datetime.now() if now is None else now
    quoted_at = datetime.strptime(row.bidask_time, "%Y-%m-%d %H:%M:%S")
    # `datetime` in this file is the class (from datetime import datetime),
    # which has no `timedelta` attribute, so the age is compared as a plain
    # number of seconds instead of building a timedelta for the limit.
    age_seconds = (reference - quoted_at).total_seconds()
    if age_seconds > limit:
        return False, f"Exceeds time threshold {limit} seconds"
    return True, None
# Run both rules against every quote and report each failure separately, so a
# quote that breaks both rules produces two messages.
for index, row in options_data.iterrows():
    spread_valid, spread_msg = validate_spread(row)
    if not spread_valid:
        print(f"Index {index} failed spread validation: {spread_msg}")

    recency_valid, recency_msg = validate_recency(row)
    if not recency_valid:
        print(f"Index {index} failed recency validation: {recency_msg}")
import pandas as pd
import numpy as np
from scipy import stats
# Sample historical bid-ask spread data.
# Several observations are generated per calendar day: with a single
# observation per day the per-day standard deviation is undefined (NaN) and
# min, max and mean all collapse to the same value.
N_DAYS = 365
OBSERVATIONS_PER_DAY = 100
spread_data = pd.DataFrame(
    np.random.normal(loc=0.05, scale=0.01, size=(N_DAYS * OBSERVATIONS_PER_DAY, 1)),
    columns=['spread'],
)
spread_data['date'] = pd.date_range(start='1/1/2020', periods=N_DAYS).repeat(
    OBSERVATIONS_PER_DAY
)

# Calculate summary statistics by day.
# String names are used instead of the NumPy callables so the resulting
# columns are labelled min / max / mean / std and pandas does not warn about
# callables being passed to `agg`.
daily_stats = spread_data.groupby('date').agg({
    'spread': ['min', 'max', 'mean', 'std']
})
print(daily_stats.head())
# This gives us baselines like the following (illustrative only: the sample
# data is random, so the exact values differ on every run):
#               spread
#                  min       max      mean       std
# date
# 2020-01-01  0.028315  0.061907  0.049962  0.005974
# 2020-01-02  0.028597  0.063787  0.049265  0.005467
# 2020-01-03  0.033288  0.063906  0.050097  0.005025
# 2020-01-04  0.034413  0.065865  0.050150  0.006619
# 2020-01-05  0.031862  0.062944  0.049715  0.005151
# We can then use anomaly detection models like Z-scores to catch outliers.
# Each spread is standardized against the mean and standard deviation of the
# whole history; observations more than `anomaly_threshold` standard
# deviations from the mean are flagged.
spread = spread_data['spread']
mean = spread.mean()
std = spread.std()
z = (spread - mean) / std
anomaly_threshold = 3
anomalies = abs(z) > anomaly_threshold  # boolean Series, one flag per row
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment