This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate an automated EDA report for the training data.
# minimal=True skips the expensive correlation/interaction computations,
# which matters on a dataset of this size.
profile = ProfileReport(train, minimal=True, title="SF Crime Data Set Profile")
# Render the report inline as an iframe in the notebook output cell.
profile.to_notebook_iframe()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize how well the engineered RoadProba feature separates the two
# labels by overlaying a density estimate per class.
plt.figure(figsize=(12, 8))
# sns.distplot(..., hist=False) was deprecated in seaborn 0.11 and removed
# in 0.14; sns.kdeplot is the direct replacement for the density-only plot.
sns.kdeplot(train.loc[~train["TargetedCategory"], "RoadProba"], label="Not Targeted")
sns.kdeplot(train.loc[train["TargetedCategory"], "RoadProba"], label="Targeted")
plt.legend()
_ = plt.title("Label Separation for Log Road Probabilities")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# With the street probabilities computed, assign one value per sample.
# Intersection addresses ("STREET A / STREET B") receive the mean of the
# probabilities of both streets.
def assign_street_probabilities(address, probabilities):
    """Return the mean street probability for *address*.

    An address names either a single street or an intersection joined by
    " / "; each component is normalized via clean_road before lookup.
    """
    roads = address.split(" / ")
    values = [probabilities[clean_road(road)] for road in roads]
    return np.mean(values)
# Encode every address as its (mean) log street probability, then drop the
# raw Address column now that it has been converted to a numeric feature.
train["RoadProba"] = train["Address"].map(partial(assign_street_probabilities, probabilities=log_probas))
train.drop("Address", axis=1, inplace=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Finalize the log road probability feature: turn raw street occurrence
# counts into log relative frequencies, i.e. ln(count / total).
street_counts = pd.Series(counts)
relative_freq = street_counts / street_counts.sum()
log_probas = np.log(relative_freq)

# Inspect the distribution of the log road probabilities in the data.
# NOTE(review): sns.displot is figure-level and creates its own figure, so
# the preceding plt.figure figsize likely has no effect — confirm intent.
plt.figure(figsize=(10, 10))
sns.displot(log_probas.values)
plt.xlabel('ln(P(road))')
plt.ylabel('P(x)')
_ = plt.title("Distribution of Log Probas for Street Occurrence", fontdict={'fontsize': 16})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Two boolean address features: "block" style addresses ("100 Block of X")
# versus intersections, which contain " / " between two street names.
train['IsOnBlock'] = train['Address'].str.contains('block', case=False)
# NOTE(review): case=False is a no-op for the '/' pattern, but harmless.
train['IsAtIntersection'] = train['Address'].str.contains('/', case=False)
def clean_road(text):
    """Strip block designators (e.g. "1500 Block of ") from a street string.

    Intersection strings without a block prefix are returned unchanged.
    """
    block_prefix = re.compile(r"[0-9]+ [bB]lock of ")
    return block_prefix.sub("", text)
def make_counts(values): | |
counts = Counter() | |
for value in values: | |
cur_counts = list(map(clean_road, value.split(" / "))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove columns that are not used as model features.
train.drop(columns=["DayOfWeek", "Resolution", "Descript"], inplace=True)

# Build a binary label from the crime categories we want to target,
# then drop the original multi-class Category column.
targeted_cats = ['LARCENY/THEFT']
train["TargetedCategory"] = train.Category.isin(targeted_cats)
train.drop(columns="Category", inplace=True)
print(f"The {len(targeted_cats)} targeted categories occur in {100. * train.TargetedCategory.mean():.2f}% of the samples.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixed seed for any stochastic steps downstream.
random_state = 42
# Load the training split (synced locally from the S3 bucket), drop exact
# duplicate rows, and re-index so row positions are contiguous again.
train = pd.read_csv("/datasets/s3-data-bucket/train.csv")
train.drop_duplicates(inplace=True)
train.reset_index(inplace=True, drop=True)
print(f"Loaded the dataset of {train.shape[1]}-D features")
# The test split is loaded only to report its size and is freed right after.
test = pd.read_csv("/datasets/s3-data-bucket/test.csv", index_col='Id')
print(f"# train examples: {len(train)}\n# test examples: {len(test)}")
del test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from datetime import datetime | |
from collections import Counter | |
from functools import partial | |
import urllib3.request | |
import warnings | |
warnings.simplefilter(action='ignore') | |
import joblib | |
import sklearn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
# Report where the notebook's data lives: the attached S3 bucket is
# mirrored into a local directory when the notebook starts.
print(f"The contents of the S3 bucket connected to this notebook have been automatically transferred locally.\n"
f"S3 Bucket: s3://{os.environ['S3_BUCKET']}\n"
f"Local directory: /datasets/s3-data-bucket\n"
f"\nContents:")
# IPython shell magic (only valid inside a notebook cell): list the files.
!ls /datasets/s3-data-bucket
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Titanic survival example using danfo.js and TensorFlow.js; see
// https://danfo.jsdata.org/examples/titanic-survival-prediction-using-danfo.js-and-tensorflow.js
const dfd = require("danfojs-node")
const tf = require("@tensorflow/tfjs-node")
async function load_process_data() { | |
let df = await dfd.read_csv("https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv") | |
//A feature engineering: Extract all titles from names columns | |
let title = df['Name'].apply((x) => { return x.split(".")[0] }).values |