Skip to content

Instantly share code, notes, and snippets.

View Wittline's full-sized avatar
:octocat:
Hi there

Ramses Alexander Coraspe Valdez Wittline

:octocat:
Hi there
View GitHub Profile
@Wittline
Wittline / data_receipts.py
Last active April 24, 2021 21:52
Scraper for Uber receipts
from bs4 import BeautifulSoup
import re
import dateutil.parser as parser
from dateutil import tz
from datetime import datetime
import csv
import s3fs
import pickle
from airflow.hooks.base_hook import BaseHook
{
"steps":
[
{
"executor_memory": "18G",
"executor_cores": "4",
"description" : "Reading from dataset data and filtering",
"name": "step_0",
"guiid": "0",
"ActionOnFailure": "CANCEL_AND_WAIT",
{
"InstanceFleets": [
{
"Name": "fleetmaster",
"InstanceFleetType": "MASTER",
"TargetOnDemandCapacity": 1,
"InstanceTypeConfigs": [
{"InstanceType":"m5.xlarge"}
]
},
import pandas as pd
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import BM25
class MovieRecommender:
@Wittline
Wittline / Uber_tracking_expenses.py
Last active September 14, 2021 10:21
Uber_tracking_expenses DAG
import logging
import datetime
from airflow import DAG
from airflow.models import Variable
from airflow.models.connection import Connection
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.postgres_operator import PostgresOperator
CREATE OR REPLACE TABLE {project}.{dataset}.{table}
(
{columns}
)
;
import argparse
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
class SparkTask:
def __init__(self, params):
self.params = params
insert into {project}.{dataset}.{tabledestination}
(id, category, lastdate)
select id, category, lastdate
from {project}.{dataset}.{tablesource}
WHERE EXTRACT(YEAR FROM lastdate) = {year} and category = '{category}'
insert into {project}.{dataset}.{tablefinal}
(id, category, lastdate)
select id, category, lastdate
from {project}.{dataset}.{table1}
union all
select id, category, lastdate
from {project}.{dataset}.{table2}
CREATE OR REPLACE TABLE {project}.{dataset}.{table}
(
{columns}
)
PARTITION BY {partitioncolumn}
CLUSTER BY {clustercolumn}
;