Skip to content

Instantly share code, notes, and snippets.

View nkthiebaut's full-sized avatar

Nicolas nkthiebaut

View GitHub Profile
@nkthiebaut
nkthiebaut / pyspark_dataframe_register.py
Created June 28, 2019 18:44
Instantiate a Spark Session, register a DataFrame, and query it (Spark 2.0+).
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame as SparkDataFrame
# Spark configuration for the session: request 4 GB of driver memory.
# SparkConf.set returns the conf itself, so it can be chained on construction.
sc = SparkConf().set("spark.driver.memory", "4g")

# Create (or reuse) a session running locally with 4 worker threads.
ss = (
    SparkSession.builder
    .master("local[4]")
    .config(conf=sc)
    .getOrCreate()
)

# Build a small two-column DataFrame and register it as the temporary view
# "players" so it can be referenced by name from Spark SQL.
df = ss.createDataFrame(
    [(1, "kevin"), (2, "steph")],
    schema=["id", "name"],
)
df.createOrReplaceTempView("players")
@nkthiebaut
nkthiebaut / test_rollbar_setup.py
Created July 1, 2019 23:34
Test Rollbar configuration
from unittest.mock import patch
import rollbar
@patch("rollbar.log.exception")
def test_rollbar_connection(log_exception_mock):
"""
Test sending an exception to Rollbar. The Rollbar Python SDK reports requests
errors through exceptions logs. Here we check that the exception function
of the logging module has NOT been called after reporting an exception to Rollbar.
@nkthiebaut
nkthiebaut / xkcd-trend.py
Last active July 2, 2019 23:56
Get and plot the volume of Google Scholar search results for different keywords, using the XKCD plot style. Results come from direct queries to the Google Scholar APIs.
"""Based on https://github.com/Pold87/academic-keyword-occurrence"""
import re
import urllib
from functools import partial
from typing import Iterable
from urllib.parse import urlencode
from urllib.request import Request, build_opener
import matplotlib
import matplotlib.pyplot as plt
@nkthiebaut
nkthiebaut / plot_top_k_accuracies
Created July 14, 2019 01:34
Plot top k accuracies
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Switch Matplotlib to its XKCD (hand-drawn sketch) style for subsequent plots.
plt.xkcd()
# Load the digits dataset directly as a (features, labels) pair.
X, y = load_digits(return_X_y=True)
# Split into train and test subsets with train_test_split's default settings;
# no random_state is passed, so the split is not reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)
@nkthiebaut
nkthiebaut / soc
Created July 14, 2019 05:35
Codes to names mapping for the O*NET-SOC job titles classification (https://www.onetcenter.org/taxonomy.html)
SOC_MAJOR_GROUPS = {
"11": "Management Occupations",
"13": "Business and Financial Operations Occupations",
"15": "Computer and Mathematical Occupations",
"17": "Architecture and Engineering Occupations",
"19": "Life, Physical, and Social Science Occupations",
"21": "Community and Social Service Occupations",
"23": "Legal Occupations",
"25": "Education, Training, and Library Occupations",
"27": "Arts, Design, Entertainment, Sports, and Media Occupations",
@nkthiebaut
nkthiebaut / commit.APPLESCRIPT
Created July 24, 2019 17:27
Commit applescript
-- Stage, commit and push every change in the notes folder to GitHub.
-- Prompts for the GitHub password with a hidden-input dialog, then runs the
-- git commands in a single shell invocation. `do shell script` raises an
-- error on a non-zero exit status, so the spoken confirmation is only
-- reached when the whole command chain succeeded.
on run
	-- Use a dedicated variable instead of `result`, which AppleScript itself
	-- overwrites with the value of the most recently executed statement.
	set githubPassword to text returned of (display dialog "Enter github password:" default answer "" with hidden answer)
	set remoteURL to "https://nkthiebaut:" & githubPassword & "@github.com/nkthiebaut/notes.git"
	-- `quoted form of` shell-escapes the URL so that spaces, quotes, `;`, `&`,
	-- `$` etc. in the password cannot break or inject into the command line.
	-- NOTE(review): characters that are reserved in URLs (e.g. `@`, `/`, `:`)
	-- would still need percent-encoding; that is not handled here.
	do shell script "cd /Users/nicolas/Google\\ Drive/notes && git add . && git commit -m new_commit && git push " & quoted form of remoteURL
	say "Notes successfully committed and pushed"
end run
@nkthiebaut
nkthiebaut / hyperopt.py
Created December 30, 2019 19:35
Hyperopt usage example with sklearn
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
def hyperopt_train_test(params):
t = params['type']
del params['type']
if t == 'gb':
clf = GradientBoostingClassifier(**params)
@nkthiebaut
nkthiebaut / plot_graph.py
Created January 19, 2020 21:06
Plot a graph with Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
def plot_graph(df, directed=False):
graph_engine = nx.DiGraph() if directed else None
G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr=True, create_using=graph_engine)
costs = G.edges.data('cost')
@nkthiebaut
nkthiebaut / pandas_json_to_dataframe.py
Created March 23, 2020 20:13
Pandas: convert a column of JSON strings to a DataFrame
# inspired by https://stackoverflow.com/a/50658993/5174617
import pandas as pd
import json
# Sample input: one column whose cells are JSON object strings.
# Each cell must be a standalone JSON document. The rows previously started
# with "0 " / "1 " (a pasted Series index), which makes json.loads raise
# JSONDecodeError ("Extra data") before anything is expanded.
df = pd.DataFrame(
    [['{"a":"1","b":"2","c":"3"}'], ['{"a" :"4","b":"5","c":"6"}']],
    columns=['json'],
)
# Parse every JSON string into a dict, then expand each dict's keys into
# columns (one output row per input row, same index as df).
exploded_df = df['json'].apply(json.loads).apply(pd.Series)
# Official doc: https://docs.python.org/3/howto/logging-cookbook.html
import os
import logging
from logging import StreamHandler
from logging import Formatter
LOGS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../logs')
if not os.path.exists(LOGS_DIR):