Skip to content

Instantly share code, notes, and snippets.

Avatar

Lakshay lakshay-arora

View GitHub Profile
View pipeline_4_pyspark.py
# Stage 1: index the categorical column feature_2 into a numeric column.
stage_1 = StringIndexer(inputCol='feature_2', outputCol='feature_2_index')
# Stage 2: index the categorical column feature_3 into a numeric column.
stage_2 = StringIndexer(inputCol='feature_3', outputCol='feature_3_index')
# Stage 3: one-hot encode the index columns produced by stages 1 and 2.
stage_3 = OneHotEncoderEstimator(
    inputCols=[stage_1.getOutputCol(), stage_2.getOutputCol()],
    outputCols=['feature_2_encoded', 'feature_3_encoded'],
)
# Stage 4: assemble every model input into a single 'features' vector
# for the downstream logistic regression model.
stage_4 = VectorAssembler(
    inputCols=['feature_1', 'feature_2_encoded', 'feature_3_encoded', 'feature_4'],
    outputCol='features',
)
View lazy_2_1.py
# Build an RDD from the text file, split across 4 partitions.
my_text_file = sc.textFile('tokens_spark.txt', minPartitions=4)
# Print the RDD object itself (transformations below are lazy, so
# nothing has been computed yet).
print(my_text_file)
# Lazily lower-case every line of the file.
my_text_file = my_text_file.map(str.lower)
# my_text_file now refers to the updated (transformed) RDD.
View add_sheet.py
# Create a new worksheet titled 'runs', sized 20 rows by 2 columns.
sheet.add_worksheet(title='runs', rows=20, cols=2)
# Worksheets are zero-indexed, so index 1 fetches the second sheet —
# the one just added.
sheet_runs = sheet.get_worksheet(1)
View define_function_af.py
from collections import Counter


def my_function():
    """Count word frequencies in the file named by the Airflow Variable "data_path".

    Returns:
        collections.Counter mapping each whitespace-separated token in the
        file to its number of occurrences.
    """
    # Fetch the file path stored in the Airflow Variable "data_path".
    file_path = Variable.get("data_path")
    # Use a context manager so the file handle is always closed
    # (the original opened the file and never closed it — a leak).
    with open(file_path) as file_:
        data = Counter(file_.read().split())
    # Return the counts so callers can use them (the original computed
    # and discarded the value; returning is backward-compatible).
    return data
View airflow_python_operator.py
### importing the required libraries
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
View airflow_import.py
from datetime import timedelta
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
# Operators; we need this to operate!
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
View dataframe.py
spark.createDataFrame(
[
(1, 'Lakshay'), # create your data here, make sure to be consistent in the types.
(2, 'Aniruddha'),
.
.
.
.
(100, 'Siddhart')
],
View rdd_23.py
# 1. Parallelizing an in-memory data collection:
# distribute a local Python list across the cluster as an RDD
my_list = [1, 2, 3, 4, 5]
my_list_rdd = sc.parallelize(my_list)
## 2. Referencing an external data file:
# build an RDD backed by a text file ("path_of_file" is a placeholder path)
file_rdd = sc.textFile("path_of_file")
View collection_1.py
# create the weekly demand collection
# NOTE(review): `database` is presumably a pymongo Database handle — confirm
# upstream; create_collection will fail if "weekly_demand" already exists.
database.create_collection("weekly_demand")
View 1_ag.py
result_1 = weekly_demand_collection.aggregate([
## stage 1
{
"$match" : {
"center_id" : {
"$eq" : 11
}
}
},
## stage 2