Pradeep Singh mepsrajput

## Dockerfile
# This basically installs some dependencies, adds two SQL scripts and runs a provided SH script.

FROM apache/airflow:2.0.0-python3.7
# Switch to root for the package installation below.
USER root
# INSTALL TOOLS
# Every install needs -y: an image build cannot answer apt's confirmation
# prompt, so an install without it aborts the build. The package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update \
&& apt-get install -y libaio-dev \
&& apt-get install -y postgresql-client \
&& rm -rf /var/lib/apt/lists/*
RUN mkdir extra
# Drop back to the airflow user once the root-only steps are done.
USER airflow

## sas-proc-means.sas
/* Summarise SASHELP.CARS with PROC MEANS using its default settings
   (no VAR statement, no statistic keywords). */
TITLE "Simple proc means";

proc means data=SASHELP.CARS;
run;

title "Select the required variables & drop the labels";
/* Select the variables & drop the labels  */
PROC MEANS DATA=SASHELP.CARS nolabels;
var

## multiple-freq.py
from IPython.display import display

def multiFreq(dataset, variable_list):
    for i in variable_list:
        datax = dataset[f'{i}'].value_counts()
        datay = pd.DataFrame({
            f'{i}': datax.index,
            'Frequency': datax.values,
            'Percent': ((datax.values/datax.values.sum())*100).round(2),
            'Cumulative Frequency': datax.values.cumsum(),

## proc-freq-multiple.sas
/* One-way frequency tables for two variables (GENRE, then MOVIE)
   requested in a single TABLES statement. */
PROC FREQ DATA=hgrosser;
	TABLES GENRE MOVIE;
RUN;

## proc_freq_with_missing.py
# Frequency table for the GENRE column. dropna=False keeps missing values
# as a category of their own instead of discarding them.
datax = data['GENRE'].value_counts(dropna=False)

# Shared pieces of the table: per-category counts, their grand total and
# the running total.
_counts = datax.values
_total = _counts.sum()
_running = _counts.cumsum()

datay = pd.DataFrame({
    'GENRE': datax.index,
    'Frequency': _counts,
    'Percent': (_counts / _total * 100).round(2),
    'Cumulative Frequency': _running,
    'Cumulative Percent': (_running / _total * 100).round(2),
})

# Bare expression so a notebook cell renders the table.
datay

## freq_procedure_with_missing.sas
/* Frequency table for GENRE; the MISSING option treats missing values
   as a level of their own, so they are counted in the table. */
PROC FREQ DATA=Gov_C_SAS;
	TABLES GENRE / MISSING;
RUN;

## simple_freq_procedure.sas
/* Import the CSV */

/* Fileref for the CSV file to load. */
FILENAME Gov_C "/folders/myfolders/Assignments/governors_county.csv";

/* Read the CSV into WORK.Gov_C_SAS, taking variable names from the first
   row (GETNAMES=YES). REPLACE lets this step be re-run in the same session:
   without it PROC IMPORT does not overwrite an existing WORK.Gov_C_SAS and
   the import is cancelled. */
PROC IMPORT DATAFILE=Gov_C DBMS=CSV OUT=WORK.Gov_C_SAS REPLACE;
	GETNAMES=YES;
RUN;

/* freq procedure */
proc freq data=Gov_C_SAS;

## freq_crosstab.py
# Cross-tabulate county (rows) against state (columns). margins=True adds
# the row/column totals, labelled 'Total'.
datab = pd.crosstab(
    index=data.county,
    columns=data.state,
    margins=True,
    margins_name='Total',
)

# Bare expression so a notebook cell renders the table.
datab

## freq_cross_tab.sas
/* Two-way table of county by state showing cell counts only:
   row, column and overall percentages are suppressed. */
PROC FREQ DATA=Gov_C_SAS;
  TABLES county*state / NOROW NOCOL NOPERCENT;
RUN;

## proc_freq_options.py
# Count rows per state, then order the result by state label instead of
# by count.
_state_counts = data['state'].value_counts()
datax = _state_counts.sort_index()

datay = pd.DataFrame(
    {'state': datax.index, 'Frequency': datax.values},
)

# Bare expression so a notebook cell renders the table.
datay
	# This basically installs some dependencies, adds two SQL scripts and runs a provided SH script.

	FROM apache/airflow:2.0.0-python3.7
	# Switch to root for the package installation below.
	USER root
	# INSTALL TOOLS
	# Every install needs -y: an image build cannot answer apt's confirmation
	# prompt, so an install without it aborts the build. The package lists are
	# removed in the same layer so they do not end up in the image.
	RUN apt-get update \
	&& apt-get install -y libaio-dev \
	&& apt-get install -y postgresql-client \
	&& rm -rf /var/lib/apt/lists/*
	RUN mkdir extra
	# Drop back to the airflow user once the root-only steps are done.
	USER airflow
	/* Summarise SASHELP.CARS with PROC MEANS using its default settings
	   (no VAR statement, no statistic keywords). */
	TITLE "Simple proc means";

	proc means data=SASHELP.CARS;
	run;

	title "Select the required variables & drop the labels";
	/* Select the variables & drop the labels */
	PROC MEANS DATA=SASHELP.CARS nolabels;
	var
	from IPython.display import display

	def multiFreq(dataset, variable_list):
	for i in variable_list:
	datax = dataset[f'{i}'].value_counts()
	datay = pd.DataFrame({
	f'{i}': datax.index,
	'Frequency': datax.values,
	'Percent': ((datax.values/datax.values.sum())*100).round(2),
	'Cumulative Frequency': datax.values.cumsum(),
	/* One-way frequency tables for two variables (GENRE, then MOVIE)
	   requested in a single TABLES statement. */
	PROC FREQ DATA=hgrosser;
	TABLES GENRE MOVIE;
	RUN;
	# Frequency table for the GENRE column. dropna=False keeps missing values
	# as a category of their own instead of discarding them.
	datax = data['GENRE'].value_counts(dropna=False)

	# Shared pieces of the table: per-category counts, their grand total and
	# the running total.
	_counts = datax.values
	_total = _counts.sum()
	_running = _counts.cumsum()

	datay = pd.DataFrame({
	    'GENRE': datax.index,
	    'Frequency': _counts,
	    'Percent': (_counts / _total * 100).round(2),
	    'Cumulative Frequency': _running,
	    'Cumulative Percent': (_running / _total * 100).round(2),
	})

	# Bare expression so a notebook cell renders the table.
	datay
	/* Frequency table for GENRE; the MISSING option treats missing values
	   as a level of their own, so they are counted in the table. */
	PROC FREQ DATA=Gov_C_SAS;
	TABLES GENRE / MISSING;
	RUN;
	/* Import the CSV */

	/* Fileref for the CSV file to load. */
	FILENAME Gov_C "/folders/myfolders/Assignments/governors_county.csv";

	/* Read the CSV into WORK.Gov_C_SAS, taking variable names from the first
	   row (GETNAMES=YES). REPLACE lets this step be re-run in the same session:
	   without it PROC IMPORT does not overwrite an existing WORK.Gov_C_SAS and
	   the import is cancelled. */
	PROC IMPORT DATAFILE=Gov_C DBMS=CSV OUT=WORK.Gov_C_SAS REPLACE;
	GETNAMES=YES;
	RUN;

	/* freq procedure */
	proc freq data=Gov_C_SAS;
	# Cross-tabulate county (rows) against state (columns). margins=True adds
	# the row/column totals, labelled 'Total'.
	datab = pd.crosstab(
	    index=data.county,
	    columns=data.state,
	    margins=True,
	    margins_name='Total',
	)

	# Bare expression so a notebook cell renders the table.
	datab
	/* Two-way table of county by state showing cell counts only:
	   row, column and overall percentages are suppressed. */
	PROC FREQ DATA=Gov_C_SAS;
	TABLES county*state / NOROW NOCOL NOPERCENT;
	RUN;
	# Count rows per state, then order the result by state label instead of
	# by count.
	_state_counts = data['state'].value_counts()
	datax = _state_counts.sort_index()

	datay = pd.DataFrame(
	    {'state': datax.index, 'Frequency': datax.values},
	)

	# Bare expression so a notebook cell renders the table.
	datay