# nilayparikh/rule_base_validation.py

## rule_base_validation.py
import pandas as pd
from datetime import datetime

# Sample option quotes, written as one record per contract.
options_data = pd.DataFrame([
    {
        "symbol": "XYZ",
        "expiration": "2023-01-27",
        "strike": 35.00,
        "bid": 1.05,
        "ask": 1.15,
        "bidask_time": "2023-01-01 09:30:10",
    },
    {
        "symbol": "XYZ",
        "expiration": "2023-02-17",
        "strike": 40.00,
        "bid": 2.15,
        "ask": 2.25,
        "bidask_time": "2023-01-01 09:31:05",
    },
    {
        "symbol": "XYZ",
        "expiration": "2023-01-20",
        "strike": 45.00,
        "bid": 0.95,
        "ask": 1.05,
        "bidask_time": "2023-01-01 09:32:01",
    },
])

# Thresholds consumed by the validation rules.
MAX_PRICE_TIME_STALE = 300  # oldest acceptable quote age, in seconds
MAX_BID_ASK_SPREAD = 0.15  # widest acceptable ask - bid difference

# Rule-based validation check

def validate_spread(row, max_spread=None):
    """Check that a quote's bid-ask spread is within the allowed maximum.

    Args:
        row: Quote exposing numeric ``bid`` and ``ask`` attributes (for
            example a row yielded by ``DataFrame.iterrows()``).
        max_spread: Largest acceptable ``ask - bid``. Defaults to the
            module-level ``MAX_BID_ASK_SPREAD``.

    Returns:
        ``(True, None)`` when the spread is acceptable, otherwise
        ``(False, message)`` describing the breach.
    """
    if max_spread is None:
        max_spread = MAX_BID_ASK_SPREAD

    # Binary floats cannot represent most decimal prices exactly
    # (1.10 - 0.95 evaluates to 0.15000000000000013), so allow a tiny
    # tolerance to keep a spread sitting exactly on the limit valid.
    tolerance = 1e-9
    if row.ask - row.bid > max_spread + tolerance:
        return False, f"Spread exceeds threshold {max_spread}"

    return True, None

def validate_recency(row, now=None, max_age_seconds=None):
    """Check that a quote's bid/ask timestamp is recent enough.

    Args:
        row: Quote exposing a ``bidask_time`` string formatted as
            ``YYYY-MM-DD HH:MM:SS``.
        now: Reference time the quote's age is measured against.
            Defaults to ``datetime.now()``.
        max_age_seconds: Oldest acceptable quote age in seconds. Defaults
            to the module-level ``MAX_PRICE_TIME_STALE``.

    Returns:
        ``(True, None)`` when the quote is fresh enough, otherwise
        ``(False, message)`` describing the breach.

    Raises:
        ValueError: If ``row.bidask_time`` does not match the expected
            format.
    """
    ts = datetime.strptime(row.bidask_time, "%Y-%m-%d %H:%M:%S")

    if now is None:
        now = datetime.now()
    if max_age_seconds is None:
        max_age_seconds = MAX_PRICE_TIME_STALE

    # ``datetime`` is the class here (``from datetime import datetime``) and
    # has no ``timedelta`` attribute, so compare the age in seconds directly.
    age_seconds = (now - ts).total_seconds()
    if age_seconds > max_age_seconds:
        return False, f"Exceeds time threshold {max_age_seconds} seconds"

    return True, None


# Run every quote through each rule in turn and print a line for each failure.
# Both validators return an ``(is_valid, message)`` pair.
for index, row in options_data.iterrows():

    # Rule 1: bid-ask spread.
    spread_valid, spread_msg = validate_spread(row)

    if not spread_valid:
        print(f"Index {index} failed spread validation: {spread_msg}")

    # Rule 2: quote recency; checked even when the spread rule failed.
    recency_valid, recency_msg = validate_recency(row)

    if not recency_valid:
        print(f"Index {index} failed recency validation: {recency_msg}")

## statistical_checks.py
import pandas as pd
import numpy as np
from scipy import stats

# Sample historical bid-ask spread data. Several observations are generated
# per calendar day so the per-day statistics below are meaningful: with a
# single observation per day the sample standard deviation is NaN and
# min, max and mean all collapse to the same value.
NUM_DAYS = 365
OBSERVATIONS_PER_DAY = 50

spread_data = pd.DataFrame(
    np.random.normal(loc=0.05, scale=0.01, size=(NUM_DAYS * OBSERVATIONS_PER_DAY, 1)),
    columns=['spread'],
)

# Repeat each date so consecutive blocks of observations share a day.
spread_data['date'] = pd.date_range(start='1/1/2020', periods=NUM_DAYS).repeat(OBSERVATIONS_PER_DAY)

# Calculate summary statistics by day. String aggregation names are used
# because passing NumPy callables to ``agg`` is deprecated in pandas.
daily_stats = spread_data.groupby('date').agg({
    'spread': ['min', 'max', 'mean', 'std']
})

print(daily_stats.head())

# This gives us baselines like:

#              spread
#                 min      max      mean       std
# date
# 2020-01-01  0.028315  0.061907  0.049962  0.005974
# 2020-01-02  0.028597  0.063787  0.049265  0.005467
# 2020-01-03  0.033288  0.063906  0.050097  0.005025
# 2020-01-04  0.034413  0.065865  0.050150  0.006619
# 2020-01-05  0.031862  0.062944  0.049715  0.005151

# We can then use anomaly detection models like Z-scores to catch outliers:

# Z-score of every observed spread against the mean and sample standard
# deviation of the whole historical series.
spread = spread_data['spread']
mean = spread.mean()
std = spread.std()
z = (spread - mean) / std

# Flag observations lying more than `anomaly_threshold` standard deviations
# away from the mean.
anomaly_threshold = 3
anomalies = abs(z) > anomaly_threshold
	import pandas as pd
	from datetime import datetime

	# Sample option quotes, written as one record per contract.
	options_data = pd.DataFrame([
	    {
	        "symbol": "XYZ",
	        "expiration": "2023-01-27",
	        "strike": 35.00,
	        "bid": 1.05,
	        "ask": 1.15,
	        "bidask_time": "2023-01-01 09:30:10",
	    },
	    {
	        "symbol": "XYZ",
	        "expiration": "2023-02-17",
	        "strike": 40.00,
	        "bid": 2.15,
	        "ask": 2.25,
	        "bidask_time": "2023-01-01 09:31:05",
	    },
	    {
	        "symbol": "XYZ",
	        "expiration": "2023-01-20",
	        "strike": 45.00,
	        "bid": 0.95,
	        "ask": 1.05,
	        "bidask_time": "2023-01-01 09:32:01",
	    },
	])

	# Thresholds consumed by the validation rules.
	MAX_PRICE_TIME_STALE = 300  # oldest acceptable quote age, in seconds
	MAX_BID_ASK_SPREAD = 0.15  # widest acceptable ask - bid difference

	# Rule-based validation check

	def validate_spread(row, max_spread=None):
	    """Check that a quote's bid-ask spread is within the allowed maximum.

	    Args:
	        row: Quote exposing numeric ``bid`` and ``ask`` attributes (for
	            example a row yielded by ``DataFrame.iterrows()``).
	        max_spread: Largest acceptable ``ask - bid``. Defaults to the
	            module-level ``MAX_BID_ASK_SPREAD``.

	    Returns:
	        ``(True, None)`` when the spread is acceptable, otherwise
	        ``(False, message)`` describing the breach.
	    """
	    if max_spread is None:
	        max_spread = MAX_BID_ASK_SPREAD

	    # Binary floats cannot represent most decimal prices exactly
	    # (1.10 - 0.95 evaluates to 0.15000000000000013), so allow a tiny
	    # tolerance to keep a spread sitting exactly on the limit valid.
	    tolerance = 1e-9
	    if row.ask - row.bid > max_spread + tolerance:
	        return False, f"Spread exceeds threshold {max_spread}"

	    return True, None

	def validate_recency(row, now=None, max_age_seconds=None):
	    """Check that a quote's bid/ask timestamp is recent enough.

	    Args:
	        row: Quote exposing a ``bidask_time`` string formatted as
	            ``YYYY-MM-DD HH:MM:SS``.
	        now: Reference time the quote's age is measured against.
	            Defaults to ``datetime.now()``.
	        max_age_seconds: Oldest acceptable quote age in seconds. Defaults
	            to the module-level ``MAX_PRICE_TIME_STALE``.

	    Returns:
	        ``(True, None)`` when the quote is fresh enough, otherwise
	        ``(False, message)`` describing the breach.

	    Raises:
	        ValueError: If ``row.bidask_time`` does not match the expected
	            format.
	    """
	    ts = datetime.strptime(row.bidask_time, "%Y-%m-%d %H:%M:%S")

	    if now is None:
	        now = datetime.now()
	    if max_age_seconds is None:
	        max_age_seconds = MAX_PRICE_TIME_STALE

	    # ``datetime`` is the class here (``from datetime import datetime``) and
	    # has no ``timedelta`` attribute, so compare the age in seconds directly.
	    age_seconds = (now - ts).total_seconds()
	    if age_seconds > max_age_seconds:
	        return False, f"Exceeds time threshold {max_age_seconds} seconds"

	    return True, None


	# Run every quote through each rule in turn and print a line for each
	# failure. Both validators return an ``(is_valid, message)`` pair.
	for index, row in options_data.iterrows():

	    # Rule 1: bid-ask spread.
	    spread_valid, spread_msg = validate_spread(row)

	    if not spread_valid:
	        print(f"Index {index} failed spread validation: {spread_msg}")

	    # Rule 2: quote recency; checked even when the spread rule failed.
	    recency_valid, recency_msg = validate_recency(row)

	    if not recency_valid:
	        print(f"Index {index} failed recency validation: {recency_msg}")
	import pandas as pd
	import numpy as np
	from scipy import stats

	# Sample historical bid-ask spread data. Several observations are generated
	# per calendar day so the per-day statistics below are meaningful: with a
	# single observation per day the sample standard deviation is NaN and
	# min, max and mean all collapse to the same value.
	NUM_DAYS = 365
	OBSERVATIONS_PER_DAY = 50

	spread_data = pd.DataFrame(
	    np.random.normal(loc=0.05, scale=0.01, size=(NUM_DAYS * OBSERVATIONS_PER_DAY, 1)),
	    columns=['spread'],
	)

	# Repeat each date so consecutive blocks of observations share a day.
	spread_data['date'] = pd.date_range(start='1/1/2020', periods=NUM_DAYS).repeat(OBSERVATIONS_PER_DAY)

	# Calculate summary statistics by day. String aggregation names are used
	# because passing NumPy callables to ``agg`` is deprecated in pandas.
	daily_stats = spread_data.groupby('date').agg({
	    'spread': ['min', 'max', 'mean', 'std']
	})

	print(daily_stats.head())

	# This gives us baselines like:

	# spread
	# min max mean std
	# date
	# 2020-01-01 0.028315 0.061907 0.049962 0.005974
	# 2020-01-02 0.028597 0.063787 0.049265 0.005467
	# 2020-01-03 0.033288 0.063906 0.050097 0.005025
	# 2020-01-04 0.034413 0.065865 0.050150 0.006619
	# 2020-01-05 0.031862 0.062944 0.049715 0.005151

	# We can then use anomaly detection models like Z-scores to catch outliers:

	# Z-score of every observed spread against the mean and sample standard
	# deviation of the whole historical series.
	spread = spread_data['spread']
	mean = spread.mean()
	std = spread.std()
	z = (spread - mean) / std

	# Flag observations lying more than `anomaly_threshold` standard deviations
	# away from the mean.
	anomaly_threshold = 3
	anomalies = abs(z) > anomaly_threshold