Pier lppier

## countxlsx.py
import os

total_qns = 0
rootdir = '.'

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        path = os.path.join(subdir, file)
        print(path)
        if path.endswith('.xlsx'):

## cloudera_python36_install
> yum install centos-release-scl
> yum info rh-python36
> scl enable rh-python36 bash
> python --version

Run PySpark2

> pyspark2

Python 3.6.3 (default, Apr 26 2018, 13:16:02)

## detect_percentage_english.py
import string
import urllib.request
from nltk.corpus import words

# Lookup set holding every character of string.punctuation.
punctuation = set(string.punctuation)


def remove_punc(str):
    """Return the input with all punctuation characters dropped.

    Each item of ``str`` is kept only when it is not a member of the
    module-level ``punctuation`` set; the kept items are joined back
    together in their original order.
    """
    kept = [ch for ch in str if ch not in punctuation]
    return ''.join(kept)

total_count = 0

## frac-diff_sk
"""
Python code for fractional differencing of pandas time series
illustrating the concepts of the article "Preserving Memory in Stationary Time Series"
by Simon Kuttruf

While this code is dedicated to the public domain for use without permission, the author disclaims any liability in connection with the use of this code.
"""

import numpy as np
import pandas as pd

## write_to_cassandra.scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{BooleanType, FloatType, IntegerType, LongType, StringType, StructField, StructType}
import org.apache.spark.sql.{SaveMode, SparkSession}

object BatchPredict {

## stopping_criteria.py


def showPlot(points, filename): # pier mod
    """Plot ``points`` as a line chart and save the figure to ``filename``.

    Args:
        points: Sequence of y-values handed to ``Axes.plot``.
        filename: Destination handed to ``Figure.savefig``.
    """
    # One plt.subplots() call creates both the figure and its axes; a separate
    # plt.figure() call would only leave an extra empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)
    fig.savefig(filename)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

## useful_sql.sql
-- Drop the backup table if one already exists
DROP TABLE IF EXISTS [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup];

-- Create a backup: SELECT ... INTO creates the backup table from the source table's rows
SELECT *
INTO [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup]
FROM [WKBRZ].[dbo].[PredictionNetworkBookingPax];

-- Restore from the backup table: its rows are inserted into the source table
INSERT INTO [WKBRZ].[dbo].[PredictionNetworkBookingPax]
SELECT *
FROM [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup];

-- Delete all prediction table contents

## avoid_copywithsetting_error.py
# Warning text this note is about:
#SettingWithCopyWarning:
#A value is trying to be set on a copy of a slice from a DataFrame.
#Try using .loc[row_indexer,col_indexer] = value instead

# assign was introduced in pandas 0.16 to deal with this false positive
# NOTE(review): the assign-based replacement is not shown in this excerpt;
# presumably df_weekly_season = df_weekly_season.assign(market=market) -- confirm.

# Instead of
df_weekly_season.loc[:, 'market'] = market
# or
df_weekly_season['market'] = market

## datetime_conversions.py
import calendar
# Weekday name looked up in calendar.day_name, which is indexed with MON == 0.
df_weekly_season['DOW_ENG'] = weekly_season['ds'].apply(lambda x: calendar.day_name[pd.to_datetime(x).weekday()])
# Two alternative numberings for 'DOW': the second assignment overwrites the
# first, so keep only the convention that is needed.
df_weekly_season['DOW'] = weekly_season['ds'].apply(lambda x: int(pd.to_datetime(x).strftime('%w')))  # SUN == 0
df_weekly_season['DOW'] = weekly_season['ds'].apply(lambda x: int(pd.to_datetime(x).weekday()))  # MON == 0
# NOTE(review): values are read from `weekly_season` but written to
# `df_weekly_season` -- confirm that both names are intended.

## pretty_seaborn_bar_chart.py
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style

# Use matplotlib's 'ggplot' style sheet for everything drawn below.
style.use('ggplot')

# A single 20x7 axes to draw on.
f, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 7), sharex=True)

# One row per (key, value) pair of monthly_growth_dict, ordered by the value
# column from largest to smallest.
df_mth_plot = pd.DataFrame(
    list(monthly_growth_dict.items()),
    columns=['Market', 'Average Monthly Growth Rate %'],
).sort_values('Average Monthly Growth Rate %', ascending=False)
	import os

	total_qns = 0
	rootdir = '.'

	for subdir, dirs, files in os.walk(rootdir):
	for file in files:
	path = os.path.join(subdir, file)
	print(path)
	if path.endswith('.xlsx'):
	> yum install centos-release-scl
	> yum info rh-python36
	> scl enable rh-python36 bash
	> python --version

	Run PySpark2

	> pyspark2

	Python 3.6.3 (default, Apr 26 2018, 13:16:02)
	import string
	import urllib.request
	from nltk.corpus import words

	# Lookup set holding every character of string.punctuation.
	punctuation = set(string.punctuation)

	def remove_punc(str):
	    """Return the input with all punctuation characters dropped.

	    Each item of ``str`` is kept only when it is not a member of the
	    ``punctuation`` set defined just above.
	    """
	    return ''.join(c for c in str if c not in punctuation)

	total_count = 0
	"""
	Python code for fractional differencing of pandas time series
	illustrating the concepts of the article "Preserving Memory in Stationary Time Series"
	by Simon Kuttruf

	While this code is dedicated to the public domain for use without permission, the author disclaims any liability in connection with the use of this code.
	"""

	import numpy as np
	import pandas as pd
	import org.apache.spark.{SparkConf, SparkContext}
	import org.apache.spark.ml.Pipeline
	import org.apache.spark.ml.evaluation.RegressionEvaluator
	import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
	import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
	import org.apache.spark.sql.functions.lit
	import org.apache.spark.sql.types.{BooleanType, FloatType, IntegerType, LongType, StringType, StructField, StructType}
	import org.apache.spark.sql.{SaveMode, SparkSession}

	object BatchPredict {


	def showPlot(points, filename): # pier mod
	    """Plot ``points`` as a line chart and save the figure to ``filename``.

	    Args:
	        points: Sequence of y-values handed to ``Axes.plot``.
	        filename: Destination handed to ``Figure.savefig``.
	    """
	    # One plt.subplots() call creates both the figure and its axes; a
	    # separate plt.figure() call would only leave an empty figure open.
	    fig, ax = plt.subplots()
	    # this locator puts ticks at regular intervals
	    loc = ticker.MultipleLocator(base=0.2)
	    ax.yaxis.set_major_locator(loc)
	    ax.plot(points)
	    fig.savefig(filename)
	    # Release the figure so repeated calls do not accumulate open figures.
	    plt.close(fig)
	-- Drop the backup table if one already exists
	DROP TABLE IF EXISTS [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup];

	-- Create a backup: SELECT ... INTO creates the backup table from the source table's rows
	SELECT *
	INTO [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup]
	FROM [WKBRZ].[dbo].[PredictionNetworkBookingPax];

	-- Restore from the backup table: its rows are inserted into the source table
	INSERT INTO [WKBRZ].[dbo].[PredictionNetworkBookingPax]
	SELECT *
	FROM [WKBRZ].[dbo].[PredictionNetworkBookingPax_Backup];

	-- Delete all prediction table contents
	# Warning text this note is about:
	#SettingWithCopyWarning:
	#A value is trying to be set on a copy of a slice from a DataFrame.
	#Try using .loc[row_indexer,col_indexer] = value instead

	# assign was introduced in pandas 0.16 to deal with this false positive
	# NOTE(review): the assign-based replacement is not shown in this excerpt;
	# presumably df_weekly_season = df_weekly_season.assign(market=market) -- confirm.

	# Instead of
	df_weekly_season.loc[:, 'market'] = market
	# or
	df_weekly_season['market'] = market
	import calendar
	# Weekday name looked up in calendar.day_name, which is indexed with MON == 0.
	df_weekly_season['DOW_ENG'] = weekly_season['ds'].apply(lambda x: calendar.day_name[pd.to_datetime(x).weekday()])
	# Two alternative numberings for 'DOW': the second assignment overwrites the
	# first, so keep only the convention that is needed.
	df_weekly_season['DOW'] = weekly_season['ds'].apply(lambda x: int(pd.to_datetime(x).strftime('%w')))  # SUN == 0
	df_weekly_season['DOW'] = weekly_season['ds'].apply(lambda x: int(pd.to_datetime(x).weekday()))  # MON == 0
	# NOTE(review): values are read from `weekly_season` but written to
	# `df_weekly_season` -- confirm that both names are intended.
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import matplotlib.style as style

	# Use matplotlib's 'ggplot' style sheet for everything drawn below.
	style.use('ggplot')

	# A single 20x7 axes to draw on.
	f, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 7), sharex=True)

	# One row per (key, value) pair of monthly_growth_dict, ordered by the
	# value column from largest to smallest.
	df_mth_plot = pd.DataFrame(
	    list(monthly_growth_dict.items()),
	    columns=['Market', 'Average Monthly Growth Rate %'],
	).sort_values('Average Monthly Growth Rate %', ascending=False)