Created
April 18, 2023 18:43
-
-
Save julie-mills/e3060b687c8a2a8b5abe13a2ceb261e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from datetime import timedelta | |
from feast import Entity, FeatureView, Field, PushSource, ValueType, FileSource | |
from feast.types import String, Int64 | |
import time | |
def sanitize_and_write_to_parquet(
    csv_path: str,
    parquet_path: str,
    *,
    lookback_seconds: float = 5000,
    now: "float | None" = None,
) -> pd.DataFrame:
    """Convert a CSV's logical timestamps to recent datetimes and save as Parquet.

    The ``timestamp`` column of this dataset holds logical offsets rather than
    real datetimes. For the demo, each offset is re-based onto a recent wall
    clock time (``now - lookback_seconds + offset``, in epoch seconds) so that
    the rows carry real datetimes that are easy to materialize.

    Args:
        csv_path: CSV file to read; it must contain a ``timestamp`` column
            whose values can be cast to float.
        parquet_path: Destination of the Parquet file.
        lookback_seconds: How far before ``now`` a logical timestamp of 0 is
            placed. Defaults to 5000.
        now: Reference time in epoch seconds. Defaults to the current time;
            pass a fixed value to get reproducible output.

    Returns:
        The loaded frame with ``timestamp`` replaced by datetime values.
    """
    if now is None:
        now = time.time()

    df = pd.read_csv(csv_path)
    epoch_seconds = now - lookback_seconds + df["timestamp"].astype(float)
    df["timestamp"] = pd.to_datetime(epoch_seconds, unit="s")

    # Store timestamps at millisecond resolution; permitting truncation keeps
    # the write from failing when a value has sub-millisecond precision.
    df.to_parquet(
        path=parquet_path,
        allow_truncated_timestamps=True,
        coerce_timestamps="ms",
    )
    return df
# Input CSV for the demo and the Parquet file derived from it.
csv_file_path = "data/labelled_2021may-ip-10-100-1-186.csv"
# Replace only the final extension and keep the dot: splitting on the first
# "." and appending "parquet" produced a name with no ".parquet" suffix.
parquet_path = csv_file_path.rsplit(".", 1)[0] + ".parquet"
# Runs at import time: rewrites the timestamps and writes the Parquet file
# that the file source defined in this module points at.
df = sanitize_and_write_to_parquet(csv_file_path, parquet_path)
# Entity identifying a Linux user; rows are joined on the INT64 "userId" column.
user_entity = Entity(
    name="user",
    join_keys=["userId"],
    value_type=ValueType.INT64,
    description="Linux user id.",
)
# File-backed source reading the Parquet file at `parquet_path`; event times
# come from its "timestamp" column.
network_stats_source = FileSource(
    path=parquet_path,
    timestamp_field="timestamp",
    name="security_honeypot_data_source",
)
# Push source that uses the Parquet file source as its batch source.
# Push source reference: https://docs.feast.dev/reference/data-sources/push
push_source = PushSource(
    batch_source=network_stats_source,
    name="anomaly_stats_push_source",
)
# Feature view exposing per-user process/event fields from the push source,
# with a 50-hour TTL and online serving enabled.
process_feature_view = FeatureView(
    name="kernel_activity_features",
    source=push_source,
    entities=[user_entity],
    schema=[
        Field(name="processName", dtype=String),
        Field(name="processId", dtype=Int64),
        Field(name="eventName", dtype=String),
    ],
    ttl=timedelta(hours=50),
    online=True,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment