Skip to content

Instantly share code, notes, and snippets.

View Wittline's full-sized avatar
:octocat:
Hi there

Ramses Alexander Coraspe Valdez Wittline

:octocat:
Hi there
View GitHub Profile
# DROP TABLES
# Teardown DDL: one DROP statement per table, safe to run when the table is absent.
users_table_drop, departments_table_drop, companies_table_drop, staging_table_drop = (
    f"DROP TABLE IF EXISTS {table}"
    for table in ("users", "departments", "companies", "staging")
)
# CREATE TABLES
staging_table_create = ("""
CREATE TABLE IF NOT EXISTS staging(
@Wittline
Wittline / docker-compose.yaml
Created June 5, 2022 16:11
docker-compose file with "wait-for-it"
version: '3.1'
services:
db:
container_name: pg_container
image: postgres
restart: always
environment:
POSTGRES_USER: "postgres"
POSTGRES_PASSWORD: "pg12345"
-- Template (placeholders in {braces} are filled in by the caller, e.g. via str.format):
-- append the union of two source tables into a single destination table.
-- NOTE(review): no dedup here — UNION ALL keeps rows that appear in both sources.
insert into {project}.{dataset}.{tablefinal}
(id, category, lastdate)
select id, category, lastdate
from {project}.{dataset}.{table1}
union all
select id, category, lastdate
from {project}.{dataset}.{table2}
-- Template (placeholders in {braces} are filled in by the caller):
-- copy one year's worth of rows for a single category from source to destination.
-- NOTE(review): {year} and {category} are substituted as literals — ensure the
-- caller controls these values (string substitution, not bound parameters).
insert into {project}.{dataset}.{tabledestination}
(id, category, lastdate)
select id, category, lastdate
from {project}.{dataset}.{tablesource}
WHERE EXTRACT(YEAR FROM lastdate) = {year} and category = '{category}'
import argparse
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
class SparkTask:
def __init__(self, params):
self.params = params
-- Template: (re)create a plain table with the caller-supplied column list.
-- CREATE OR REPLACE drops any existing table of the same name, including its data.
CREATE OR REPLACE TABLE {project}.{dataset}.{table}
(
{columns}
)
;
-- Template: (re)create a table partitioned and clustered on caller-supplied
-- columns (BigQuery-style PARTITION BY / CLUSTER BY DDL).
-- CREATE OR REPLACE drops any existing table of the same name, including its data.
CREATE OR REPLACE TABLE {project}.{dataset}.{table}
(
{columns}
)
PARTITION BY {partitioncolumn}
CLUSTER BY {clustercolumn}
;
from google.oauth2 import service_account
from google.cloud import dataproc_v1 as dataproc
from google.cloud import storage
import os
class dataproc_create_cluster:
def __init__(self):
self.__credentials = None
"tasks" : [
{
"task_id" : "startup_dataproc_1",
"script" : "gcs.project-pydag.iac_scripts.iac.dataproc_create_cluster",
"params" : "{'cluster_name':'cluster-dataproc-pydag-2022', 'project_name':'atomic-key-348214', 'region':'us-central1', '**GCP_service-account':''}",
"dependencies":[]
},
{
"task_id" : "initial_ingestion_1",
"script" : "gcs.project-pydag.module_name.spark.csv_gcs_to_bq",
import pandas as pd
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import BM25
class MovieRecommender: