Skip to content

Instantly share code, notes, and snippets.

View aagnone3's full-sized avatar
😸
so make it

Anthony Agnone aagnone3

😸
so make it
View GitHub Profile
@aagnone3
aagnone3 / sf_crime_9.py
Created October 30, 2020 00:18
sf_crime_9.py
# Build a lightweight (minimal=True) pandas-profiling report of the training
# frame and render it inline in the notebook.
profile = ProfileReport(train, title="SF Crime Data Set Profile", minimal=True)
profile.to_notebook_iframe()
@aagnone3
aagnone3 / sf_crime_8.py
Created October 30, 2020 00:16
sf_crime_8.py
# Compare the RoadProba distribution across the two label groups; visible
# separation suggests the feature carries signal for the targeted class.
plt.figure(figsize=(12, 8))
# sns.distplot(..., hist=False) was deprecated in seaborn 0.11 and removed in
# 0.14; sns.kdeplot is its direct replacement for the KDE-only case.
sns.kdeplot(train.loc[~train["TargetedCategory"], "RoadProba"], label="Not Targeted")
sns.kdeplot(train.loc[train["TargetedCategory"], "RoadProba"], label="Targeted")
plt.legend()
_ = plt.title("Label Separation for Log Road Probabilities")
@aagnone3
aagnone3 / sf_crime_7.py
Created October 30, 2020 00:16
sf_crime_7.py
# With the street probabilities computed, assign one value per sample.
# As noted earlier, samples at street corners ("A / B" addresses) receive the
# mean of the two streets' probabilities.
def assign_street_probabilities(address, probabilities):
    """Return the mean street probability for *address*.

    Plain addresses map to their single street's probability; intersection
    addresses (joined with " / ") average the probabilities of both streets.
    """
    roads = address.split(" / ")
    values = [probabilities[clean_road(road)] for road in roads]
    return np.mean(values)
# Materialize the per-sample log road probability, then retire the raw
# Address column — it has now been fully converted into numeric features.
assigner = partial(assign_street_probabilities, probabilities=log_probas)
train["RoadProba"] = train["Address"].map(assigner)
train.drop(columns="Address", inplace=True)
@aagnone3
aagnone3 / sf_crime_6.py
Created October 30, 2020 00:16
sf_crime_6.py
# Finalize the log road-probability feature: normalize raw street counts into
# probabilities, then take logs for a better-behaved numeric range.
pd_counts = pd.Series(counts)
log_probas = np.log(pd_counts / pd_counts.sum())
# Have a look at the distribution of log road probabilities in the data.
# NOTE: sns.displot is a *figure-level* function — it creates its own figure,
# which would abandon the plt.figure(figsize=...) below as an empty stray
# figure. sns.histplot is the axes-level equivalent and draws on the current
# axes, so the explicit figure size actually applies.
plt.figure(figsize=(10, 10))
sns.histplot(log_probas.values)
plt.xlabel('ln(P(road))')
plt.ylabel('P(x)')
_ = plt.title("Distribution of Log Probas for Street Occurrence", fontdict={'fontsize': 16})
@aagnone3
aagnone3 / sf_crime_5.py
Created October 30, 2020 00:15
sf_crime_5.py
# Two binary features derived from the raw address text:
#   IsOnBlock        — "1500 Block of ..." style addresses
#   IsAtIntersection — "A ST / B ST" style corner addresses
address_text = train["Address"].str
train["IsOnBlock"] = address_text.contains("block", case=False)
train["IsAtIntersection"] = address_text.contains("/", case=False)
def clean_road(text):
    """Strip a leading "<number> Block of " / "<number> block of " prefix,
    leaving just the bare street name; other text is returned unchanged."""
    block_prefix = re.compile(r"[0-9]+ [bB]lock of ")
    return block_prefix.sub("", text)
def make_counts(values):
    """Tally street occurrences across address strings.

    Each address is split on " / " (intersections yield two streets) and each
    part is normalized via clean_road before counting.
    """
    counts = Counter()
    for value in values:
        cur_counts = list(map(clean_road, value.split(" / ")))
        # NOTE(review): the gist preview truncates this function here — the
        # code that folds cur_counts into `counts` (and the return statement)
        # is not visible in this snippet; confirm against the full gist.
@aagnone3
aagnone3 / sf_crime_4.py
Created October 30, 2020 00:15
sf_crime_4.py
# Drop columns that will not be used as model features.
train.drop(["DayOfWeek", "Resolution", "Descript"], axis=1, inplace=True)

# Binarize the label: flag only the crime categories we want to detect, then
# retire the raw multi-class Category column.
targeted_cats = ['LARCENY/THEFT']
train["TargetedCategory"] = train["Category"].isin(targeted_cats)
train.drop("Category", axis=1, inplace=True)
print(f"The {len(targeted_cats)} targeted categories occur in {100. * train.TargetedCategory.mean():.2f}% of the samples.")
@aagnone3
aagnone3 / sf_crime_3.py
Created October 30, 2020 00:14
sf_crime_3.py
# Fixed seed — presumably reused downstream for reproducible splits/models.
random_state = 42

# Load the training data, then de-duplicate and re-index so row positions
# are contiguous again after the drop.
train = pd.read_csv("/datasets/s3-data-bucket/train.csv")
train.drop_duplicates(inplace=True)
train.reset_index(drop=True, inplace=True)
print(f"Loaded the dataset of {train.shape[1]}-D features")

# The test set is loaded only to report its size, then released immediately.
test = pd.read_csv("/datasets/s3-data-bucket/test.csv", index_col='Id')
print(f"# train examples: {len(train)}\n# test examples: {len(test)}")
del test
@aagnone3
aagnone3 / sf_crime_2.py
Created October 30, 2020 00:13
sf_crime_2.py
import re
from datetime import datetime
from collections import Counter
from functools import partial
import urllib3.request
import warnings
# NOTE(review): this silences *every* warning category globally (simplefilter
# with action='ignore' and no category) — consider narrowing to specific
# warning classes so genuine issues still surface.
warnings.simplefilter(action='ignore')
import joblib
import sklearn
@aagnone3
aagnone3 / sf_crime_1.py
Created October 30, 2020 00:13
sf_crime_1.py
import os
# Report where the notebook's attached S3 bucket contents were synced locally;
# relies on the S3_BUCKET environment variable being set by the platform.
print(f"The contents of the S3 bucket connected to this notebook have been automatically transferred locally.\n"
      f"S3 Bucket: s3://{os.environ['S3_BUCKET']}\n"
      f"Local directory: /datasets/s3-data-bucket\n"
      f"\nContents:")
# IPython shell magic — valid only inside a notebook/IPython session,
# not in a plain Python module.
!ls /datasets/s3-data-bucket
// see more at https://danfo.jsdata.org/examples/titanic-survival-prediction-using-danfo.js-and-tensorflow.js
const dfd = require("danfojs-node")
const tf = require("@tensorflow/tfjs-node")
async function load_process_data() {
let df = await dfd.read_csv("https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv")
//A feature engineering: Extract all titles from names columns
let title = df['Name'].apply((x) => { return x.split(".")[0] }).values