Skip to content

Instantly share code, notes, and snippets.

@julie-mills
Created April 18, 2023 18:43
Show Gist options
  • Save julie-mills/e3060b687c8a2a8b5abe13a2ceb261e5 to your computer and use it in GitHub Desktop.
Save julie-mills/e3060b687c8a2a8b5abe13a2ceb261e5 to your computer and use it in GitHub Desktop.
import pandas as pd
from datetime import timedelta
from feast import Entity, FeatureView, Field, PushSource, ValueType, FileSource
from feast.types import String, Int64
import time
def sanitize_and_write_to_parquet(csv_path: str, parquet_path: str):
    """Rewrite the CSV's logical timestamps as recent datetimes and save as parquet.

    The ``timestamp`` column of this dataset holds logical values that do not
    correspond to real datetimes. For the demo they are shifted so that they
    land shortly before "now", which makes them easy for Feast to materialize.

    Args:
        csv_path: Path of the CSV file to read; it must have a ``timestamp``
            column whose values can be cast to float.
        parquet_path: Path the converted data is written to.

    Returns:
        The DataFrame that was written, with ``timestamp`` replaced by
        datetime values.
    """
    frame = pd.read_csv(csv_path)

    # Anchor the logical timestamps 5000 seconds before the current wall-clock
    # time, treating each original value as an offset in seconds.
    anchor_seconds = time.time() - 5000
    shifted_seconds = anchor_seconds + frame['timestamp'].astype(float)
    frame['timestamp'] = pd.to_datetime(shifted_seconds, unit='s')

    # Store timestamps at millisecond resolution, allowing the finer digits to
    # be truncated instead of raising.
    frame.to_parquet(
        path=parquet_path,
        allow_truncated_timestamps=True,
        coerce_timestamps='ms',
    )
    return frame
# Raw CSV shipped with the demo.
csv_file_path = "data/labelled_2021may-ip-10-100-1-186.csv"
# Replace only the trailing extension: rsplit(".", 1) leaves any earlier dots in
# the path alone, and the explicit "." gives "<stem>.parquet" instead of the
# extension-less "<stem>parquet" the previous expression produced.
parquet_path = csv_file_path.rsplit(".", 1)[0] + ".parquet"
# Convert at import time so the parquet file exists before the FileSource
# defined below points at it.
df = sanitize_and_write_to_parquet(csv_file_path, parquet_path)
# Feast entity named "user", joined on the "userId" column (INT64 values).
user_entity = Entity(
    name="user",
    join_keys=["userId"],
    value_type=ValueType.INT64,
    description="Linux user id.",
)
# File-backed source that reads the parquet file at ``parquet_path`` and uses
# its "timestamp" column as the timestamp field.
network_stats_source = FileSource(
    path=parquet_path,
    timestamp_field="timestamp",
    name="security_honeypot_data_source",
)
# Push source layered on top of the file source above, which serves as its
# batch source.
# Read more about push sources here: https://docs.feast.dev/reference/data-sources/push
push_source = PushSource(
    batch_source=network_stats_source,
    name="anomaly_stats_push_source",
)
# Feature view exposing per-user process/event fields from the push source.
# It is enabled for online serving and has a 50-hour TTL.
process_feature_view = FeatureView(
    name="kernel_activity_features",
    source=push_source,
    entities=[user_entity],
    schema=[
        Field(name="processName", dtype=String),
        Field(name="processId", dtype=Int64),
        Field(name="eventName", dtype=String),
    ],
    ttl=timedelta(hours=50),
    online=True,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment