Skip to content

Instantly share code, notes, and snippets.

View pierrelouisbescond's full-sized avatar

Pierre-Louis BESCOND pierrelouisbescond

View GitHub Profile
# Workbook holding the "duplicates" and "analysis" sheets read below.
LABELING_WORKBOOK = "./roman-numerals-labeling-plb-20210830.xlsx"

# Open the workbook once and parse both sheets from the same handle,
# instead of re-opening the file for each sheet.
with pd.ExcelFile(LABELING_WORKBOOK) as workbook:
    # Sheet "duplicates": its "file" column lists the duplicate files to remove.
    duplicates = workbook.parse(sheet_name="duplicates")
    # Sheet "analysis" (columns B:L): rows flagged with to_be_removed == 1
    # are the unreadable files.
    files_analysis = workbook.parse(sheet_name="analysis", usecols="B:L")

duplicates_list = duplicates["file"].tolist()

# Select the flagged rows and the "file" column in a single .loc call
# rather than through chained indexing.
erroneous_list = files_analysis.loc[files_analysis["to_be_removed"] == 1, "file"].tolist()

# Every file to delete: duplicates first, then the unreadable ones.
removal_list = duplicates_list + erroneous_list
print(len(duplicates_list), "duplicates +", len(erroneous_list), "erroneous =", len(removal_list), "pictures to remove.")
# Root directory of the dataset.
DATA_FOLDER = "./data/"

# Sub-folder names passed to FOLDER for the initial summary below.
INITIAL_FOLDERS = [
    "train",
    "val",
    "label_book",
]

# Same list without "label_book".
# NOTE(review): presumably the folders kept after clean-up; confirm against later cells.
FOLDERS = [
    "train",
    "val",
]

# Class labels: the Roman numerals from i to x.
LABELS = [
    "i", "ii", "iii", "iv", "v",
    "vi", "vii", "viii", "ix", "x",
]

# Describe the dataset as it stands over the initial folders.
initial_folder = FOLDER(DATA_FOLDER, INITIAL_FOLDERS, LABELS)
initial_folder.summary()
# Bundles a data folder path with the list of sub-folders and the list of labels.
# NOTE(review): indentation was lost in this snippet and it is truncated right
# after the `for` line of summary(); the loop body is not visible here.
class FOLDER():
# Store the three constructor arguments unchanged on the instance.
def __init__(self, DATA_FOLDER, FOLDERS, LABELS):
self.DATA_FOLDER = DATA_FOLDER
self.FOLDERS = FOLDERS
self.LABELS = LABELS
# Walks over self.FOLDERS; display_ratio defaults to 4.
# NOTE(review): how display_ratio is used cannot be seen from this snippet.
def summary(self, display_ratio=4):
for folder in self.FOLDERS:
@pierrelouisbescond
pierrelouisbescond / dcc_final_dataset_estimate.ipynb
Last active August 23, 2021 10:31
Data-Centric Competition - Final DataSet Estimate
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Report how the model behaves on the training set.
print("TRAIN PERFORMANCE:\n")

# Confusion matrix of the training predictions, wrapped in a DataFrame whose
# rows are labelled as actual classes and columns as predicted classes.
confusion_matrix_train = pd.DataFrame(
    confusion_matrix(y_train, model.predict(X_train)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)
display(confusion_matrix_train)

# Recall on the "Yes" class: the Actual_Yes/Predicted_Yes cell divided by
# the total of the Actual_Yes row.
actual_yes_row = confusion_matrix_train.loc["Actual_Yes"]
recall_resignation_train = actual_yes_row["Predicted_Yes"] / actual_yes_row.sum()

# Score returned by the model on the training data, rounded to 3 decimals.
train_score = round(model.score(X_train, y_train), 3)
print("Train Score: {}".format(train_score))
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from fdasrsf import fPCA, time_warping, fdawarp, fdahpca
# Functional Alignment
# Align time-series: build an fdawarp object from `f` and `time`, call its
# srsf_align() method, then plot the object.
# NOTE(review): `f` and `time` are defined outside this snippet — presumably the
# sampled curves and their common time grid; confirm against the caller.
# NOTE(review): the return value of srsf_align() is discarded, so the alignment
# is presumably stored on warp_f itself — verify in the fdasrsf documentation.
warp_f = time_warping.fdawarp(f, time)
warp_f.srsf_align()
warp_f.plot()
# Functional Principal Components Analysis
# NOTE(review): the snippet ends here; the fPCA step itself is not visible.
import pandas as pd
import numpy as np
# Load the semicolon-separated CSV, keeping only the useful columns
# (positions 0, 1 and 4), and shorten two column names to simplify syntax.
# source: https://www.data.gouv.fr/fr/datasets/temperature-quotidienne-departementale-depuis-janvier-2018/
df = pd.read_csv(
    "temperature-quotidienne-departementale.csv",
    sep=";",
    usecols=[0, 1, 4],
).rename(
    columns={"Code INSEE département": "Region", "TMax (°C)": "Temp"},
)
# -*- coding: utf-8 -*-
# We start with the import of standard ML libraries
import pandas as pd
import numpy as np
import math
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor