### importing the required libraries
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
# add a sheet with 20 rows and 2 columns
sheet.add_worksheet(rows=20,cols=2,title='runs')
# get the instance of the second sheet
sheet_runs = sheet.get_worksheet(1)
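
# a minimal, hedged sketch of using the new worksheet: append a header row and
# one record, then read everything back (the column names here are assumptions)
sheet_runs.append_row(['run_id', 'status'])
sheet_runs.append_row(['run_1', 'success'])
print(sheet_runs.get_all_values())
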
# importing required libraries
import requests
from bs4 import BeautifulSoup
import os
import time
def get_path(url):
    return "static/URL_" + str(url.replace("/", "_"))
headers = {
    # (assumed) request headers; many sites expect a browser-like User-Agent
    "User-Agent": "Mozilla/5.0"
}
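
# a minimal sketch tying the pieces above together (the URL is a placeholder):
# fetch a page with the headers, parse it, and save the HTML under get_path()
url = "https://example.com/articles"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
os.makedirs("static", exist_ok=True)
with open(get_path("example.com/articles") + ".html", "w") as f:
    f.write(soup.prettify())
time.sleep(1)   # be polite between successive requests
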
# importing the required pipeline stages
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

# define stage 1: transform the column feature_2 to numeric
stage_1 = StringIndexer(inputCol='feature_2', outputCol='feature_2_index')
# define stage 2: transform the column feature_3 to numeric
stage_2 = StringIndexer(inputCol='feature_3', outputCol='feature_3_index')
# define stage 3: one hot encode the numeric versions of feature_2 and feature_3 generated in stages 1 and 2
stage_3 = OneHotEncoderEstimator(inputCols=[stage_1.getOutputCol(), stage_2.getOutputCol()],
                                 outputCols=['feature_2_encoded', 'feature_3_encoded'])
# define stage 4: create a vector of all the features required to train the logistic regression model
stage_4 = VectorAssembler(inputCols=['feature_1', 'feature_2_encoded', 'feature_3_encoded', 'feature_4'],
                          outputCol='features')
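
# a minimal sketch (assuming a training DataFrame `sample_df` with a `label` column)
# of chaining the stages above into a single Pipeline with a logistic regression stage
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

stage_5 = LogisticRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
model = pipeline.fit(sample_df)              # fit all stages on the assumed DataFrame
predictions = model.transform(sample_df)     # adds prediction and probability columns
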
# create an RDD from the text file with a minimum of 4 partitions
my_text_file = sc.textFile('tokens_spark.txt', minPartitions=4)
# RDD Object
print(my_text_file)
# convert to lower case
my_text_file = my_text_file.map(lambda x : x.lower())
# updated RDD object
print(my_text_file)
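
# a minimal sketch extending the RDD above: count word frequencies with the
# classic flatMap / map / reduceByKey pattern (whitespace tokenisation assumed)
word_counts = (my_text_file
               .flatMap(lambda line: line.split())
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
print(word_counts.take(5))
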
# importing required libraries
from collections import Counter
from airflow.models import Variable

# define the python function
def my_function():
    # get the variable value
    file_path = Variable.get("data_path")
    # open the file
    file_ = open(file_path)
    # read the file and calculate the word count
    data = Counter(file_.read().split())
    # make the word counts available to the caller
    return data
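
# a minimal, self-contained sketch (DAG id and schedule are assumptions) of wiring
# my_function into a DAG with the PythonOperator imported in the first snippet
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

dag = DAG(
    dag_id='word_count_dag',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)

word_count_task = PythonOperator(
    task_id='word_count',
    python_callable=my_function,
    dag=dag,
)
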
from datetime import timedelta
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
# Operators; we need this to operate!
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
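
# a minimal sketch continuing the imports above (DAG id, schedule, and command are
# assumptions): instantiate a DAG and attach a single BashOperator task
default_args = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'example_bash_dag',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)

print_date = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)
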
spark.createDataFrame(
    [
        (1, 'Lakshay'),       # create your data here, make sure to be consistent in the types.
        (2, 'Aniruddha'),
        .
        .
        .
        .
        (100, 'Siddhart')
    ],
    ['id', 'name']            # (assumed) column names for the two fields above
)
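
# the call above assumes an existing SparkSession named `spark`; a minimal way to
# create one looks like this (the application name is a placeholder)
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('create_dataframe_example') \
    .getOrCreate()
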
## 1. Parallelizing a data collection
my_list = [1, 2, 3, 4, 5]
my_list_rdd = sc.parallelize(my_list)
## 2. Referencing an external data file
file_rdd = sc.textFile("path_of_file")
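
# a minimal sketch of actions on the two RDDs above: materialise the parallelized
# list and peek at the first lines of the external file ("path_of_file" is a placeholder)
print(my_list_rdd.collect())            # [1, 2, 3, 4, 5]
print(my_list_rdd.getNumPartitions())   # number of partitions Spark used
print(file_rdd.take(5))                 # first 5 lines of the file
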
# create weekly demand collection
database.create_collection("weekly_demand")
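
# a minimal, hedged sketch of where the `database` handle above could come from and
# how the new collection might be used; the connection string, database name, and
# document fields are all placeholders
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
database = client["demand_db"]

# insert one document into the weekly_demand collection and read it back
weekly_demand = database["weekly_demand"]
weekly_demand.insert_one({"item_id": 101, "week": 1, "demand": 250})
print(weekly_demand.find_one({"item_id": 101}))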