Alex Spanos alejio

## jupyter_presentation_compile
jupyter-nbconvert --to slides MySlideshow.ipynb --reveal-prefix=reveal.js

## git_commands.txt
# Stage every change (new, modified and deleted files) and commit in one step
git add -A && git commit -m "Your Message"
# Push the branch and set origin/<branch> as its upstream
git push -u origin <branch>
# Undo the last commit; the default (mixed) reset keeps its changes in the working tree
git reset HEAD~
# Stop tracking a file without deleting it from disk (a path argument is required)
git rm --cached <file>

## joining_df.py
# Assumed imports:

from functools import reduce  # reduce is not a builtin in Python 3

import pandas as pd

# John Galt's answer is basically a reduce operation. With more than a handful
# of dataframes, put them in a list like this (generated via list
# comprehensions or loops or whatnot):
dfs = [df0, df1, df2, dfN]

# Assuming they all share a common column, like 'name' in the example, fold
# the list with successive pairwise merges on that column:
df_final = reduce(lambda left, right: pd.merge(left, right, on='name'), dfs)
# That way the code works with whatever number of dataframes you want to merge.

## sqlite.R
library("RSQLite")
# Connect to the SQLite database file
con <- dbConnect(RSQLite::SQLite(), dbname = "database.sqlite")
# Get the names of all tables in the database
alltables <- dbListTables(con)
# Read the whole Country table into a data frame
df.country <- dbGetQuery(con, "select * from Country")
# Release the connection once the data has been read
dbDisconnect(con)

## sqlite.py
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 120)
%matplotlib inline

## bnlearn_tan_example.R
library(data.table)
library(bnlearn)

# Read the pipe-delimited file; verbose=TRUE prints fread's parsing diagnostics
df <- fread("file.csv", sep="|", verbose=TRUE)

# Names of columns 3 to 24. The second argument of colnames() is do.NULL, not
# a column selector, so the subset is taken by indexing the returned names.
cols <- colnames(df)[3:24]
df <- as.data.frame(df)
# Convert every column to a factor in place. lapply() keeps each column a
# factor, whereas apply() returns a character matrix that data.frame() leaves
# as character columns under R >= 4.0 (stringsAsFactors defaults to FALSE).
df[] <- lapply(df, as.factor)

## supervised_learner.py
def supervised_learner(df, clf, train_list, testsize=0.3, predictors=None, target=None):
    """Set up the inputs for a supervised-learning run on ``df``.

    Only the start of this function is visible in this excerpt: it imports
    its dependencies and resolves the column defaults.

    Args:
        df: DataFrame-like object exposing ``.columns``.
        clf: not used in the visible part (presumably a scikit-learn
            estimator -- TODO confirm).
        train_list: not used in the visible part.
        testsize: defaults to 0.3; not used in the visible part (presumably
            the test fraction passed to ``train_test_split`` -- TODO confirm).
        predictors: columns to use as features. ``None`` (the default) means
            every column of ``df`` except the first.
        target: column to predict. ``None`` (the default) means the last
            column of ``df``.
    """
    ### Import packages
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # scikit-learn < 0.18 only ships the old module
        from sklearn.cross_validation import train_test_split
    from sklearn import linear_model
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    import pandas as pd
    import numpy as np
    from sklearn.metrics import roc_curve

    # Defaults that depend on ``df`` must be resolved per call: an expression
    # in the signature is evaluated once, at definition time, where the
    # ``df`` parameter does not exist yet.
    if predictors is None:
        predictors = df.columns[1:]
    if target is None:
        target = df.columns[-1]
    # NOTE(review): the remainder of the body is not visible in this excerpt.

## df_get_dummies.py
def df_get_dummies(df, drops, cats, pkey=None, binary=1):
    """Prepare ``df`` for dummy encoding.

    Only the start of this function is visible in this excerpt: it drops the
    columns named in ``drops`` from a local copy of ``df``. The caller's
    frame is not modified, because ``DataFrame.drop`` returns a new frame.

    Args:
        df: pandas DataFrame to process.
        drops: column label or list of labels to remove.
        cats: not used in the visible part (presumably the categorical
            columns to encode -- TODO confirm).
        pkey: not used in the visible part.
        binary: not used in the visible part.
    """

    ### Import packages
    import pandas as pd
    import numpy as np
    ###

    # The following columns aren't used in modelling so drop them. The axis
    # is passed by keyword because pandas 2.0 no longer accepts it positionally.
    df = df.drop(drops, axis=1)
    # Ensure BPE and LE are categories before getting dummies
    # NOTE(review): the remainder of the body is not visible in this excerpt.

## df_sample_within_pkey.py
# Thin out over-represented keys: for every key in a_list, a group with more
# than a_number rows keeps each row independently with probability prob_keep;
# smaller groups are kept whole. The result is collected in df_out.
df_temp = pd.DataFrame(columns=df.columns)
sampled_groups = []
for elem in a_list:
    df_temp = df[df.pkey == elem]
    if len(df_temp) > a_number:
        # One Bernoulli(prob_keep) draw per row. A boolean mask avoids looking
        # up missing (None) labels with .loc, and unlike a trailing dropna()
        # it never discards kept rows that merely contain NaN values.
        keep_mask = np.random.binomial(1, prob_keep, size=len(df_temp)) == 1
        df_temp = df_temp[keep_mask]
    sampled_groups.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# also avoids re-copying the accumulated frame on every iteration.
if sampled_groups:
    df_out = pd.concat(sampled_groups, ignore_index=True)
else:
    df_out = pd.DataFrame(columns=df.columns)

## df_month_difference.py
# Inspired from here http://stackoverflow.com/questions/7015587/python-difference-of-2-datetimes-in-months
from datetime import datetime, timedelta
from calendar import monthrange
def monthdelta(df):
    d1 = df[0]
    d2 = df[1]
    delta = 0
    while True:
        mdays = monthrange(d1.year, d1.month)[1]
        d1 += timedelta(days=mdays)
	# Add and commit all
	git add -A && git commit -m "Your Message"
	# Push to branch
	git push -u origin <branch>
	# undo commit
	git reset HEAD~
	# remove tracked file
	git rm --cached
	#Assumed imports:

	import pandas as pd
	#John Galt's answer is basically a reduce operation. If I have more than a handful of dataframes, I'd put them in a list like this (generated via list comprehensions or loops or whatnot):

	dfs = [df0, df1, df2, dfN]
	#Assuming they have some common column, like name in your example, I'd do the following:

	df_final = reduce(lambda left,right: pd.merge(left,right,on='name'), dfs)
	#That way, your code should work with whatever number of dataframes you want to merge.
	library("RSQLite")
	# connect to the sqlite file
	con = dbConnect(RSQLite::SQLite(), dbname="database.sqlite")
	# get a list of all tables
	alltables = dbListTables(con)
	# get tables as df
	df.country = dbGetQuery( con,'select * from Country' )
	import pandas as pd
	from sqlalchemy import create_engine # database connection
	import datetime as dt
	from IPython.display import display
	import matplotlib
	import numpy as np
	import matplotlib.pyplot as plt
	pd.set_option('display.max_columns', 120)
	%matplotlib inline
	library(data.table)
	library(bnlearn)

	df <- fread("file.csv", sep="\|", verbose=TRUE)

	cols <- colnames(df, 3:24)
	df <- as.data.frame(df)
	df_temp <- data.frame(apply(df, 2, as.factor))
	df <- df_temp
	rm(df_temp)
	def supervised_learner(df, clf, train_list, testsize = 0.3, predictors=df.columns[1:], target=df.columns[-1]):
	### Import packages
	from sklearn.cross_validation import train_test_split
	from sklearn import linear_model
	from sklearn.linear_model import SGDClassifier
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import classification_report
	import pandas as pd
	import numpy as np
	from sklearn.metrics import roc_curve
	def df_get_dummies(df, drops, cats, pkey=None, binary=1):

	### Import packages
	import pandas as pd
	import numpy as np
	###

	# The following columns aren't used in modelling so drop
	df = df.drop(drops, 1)
	# Ensure BPE and LE are categories before getting dummies
	df_out = pd.DataFrame(columns=df.columns)
	df_temp = pd.DataFrame(columns=df.columns)
	for elem in a_list:
	df_temp = df[df.pkey==elem]
	if len(df_temp) > a_number:
	df_temp_ind = df_temp.index.map(lambda x: x if np.random.binomial(1, prob_keep)==1 else None)
	df_temp = df_temp.loc[df_temp_ind,].dropna()
	else:
	pass
	df_out = df_out.append(df_temp, ignore_index=True)
	# Inspired from here http://stackoverflow.com/questions/7015587/python-difference-of-2-datetimes-in-months
	from datetime import datetime, timedelta
	from calendar import monthrange
	def monthdelta(df):
	d1 = df[0]
	d2 = df[1]
	delta = 0
	while True:
	mdays = monthrange(d1.year, d1.month)[1]
	d1 += timedelta(days=mdays)