Skip to content

Instantly share code, notes, and snippets.

View vannguyen3007's full-sized avatar
๐Ÿˆ
Focusing

IMei vannguyen3007

๐Ÿˆ
Focusing
View GitHub Profile
@vannguyen3007
vannguyen3007 / gist:548682f55197771371c0457858c30c8e
Created September 18, 2020 10:39
Using S3 AWS to store Apache Airflow logs
[core]
# The folder where your airflow pipelines live, most likely a
# subfolder in a code repository
# This path must be absolute
dags_folder = /usr/local/airflow/dags
# The folder where airflow should store its log files
# This path must be absolute
base_log_folder = /usr/local/airflow/logs
@vannguyen3007
vannguyen3007 / upload_file_to_S3_dag.py
Last active September 18, 2020 12:54
upload_file_to_S3_dag.py
from airflow import DAG
from airflow.operators import DummyOperator, PythonOperator
# Needed by default_args below: without this import the module fails with
# NameError on `datetime` / `timedelta` as soon as it is loaded.
from datetime import datetime, timedelta

# Handed to DAG(...) through its `default_args` parameter.
default_args = {
    'owner': 'arnaud',
    'start_date': datetime(2019, 1, 1),
    # Wait five minutes between retries of a failed task.
    'retry_delay': timedelta(minutes=5),
}
# Using the context manager lets you avoid repeating the dag parameter in each operator
with DAG('S3_dag_test', default_args=default_args, schedule_interval='@once') as dag:
@vannguyen3007
vannguyen3007 / S3_helpers.py
Created September 18, 2020 13:20
S3_helpers.py
import boto3
s3 = boto3.resource('s3')
def upload_file_to_S3(filename, key, bucket_name):
    """Upload a local file to an S3 bucket.

    Args:
        filename: Path of the local file to upload.
        key: Object key to store the file under in the bucket.
        bucket_name: Name of the destination bucket.
    """
    # `s3` is the module-level boto3 S3 resource.
    target_bucket = s3.Bucket(bucket_name)
    target_bucket.upload_file(filename, key)
@vannguyen3007
vannguyen3007 / upload_file_to_S3_dag.py
Created September 18, 2020 13:25
upload_file_to_S3_dag.py
# Task that invokes upload_file_to_S3 with the keyword arguments given in
# op_kwargs; it is attached to `my_dag`.
upload_to_S3_task = PythonOperator(
    task_id='upload_to_S3',
    python_callable=upload_file_to_S3,
    op_kwargs=dict(
        filename='path/to/my_file.csv',
        key='my_S3_file.csv',
        bucket_name='my-S3-bucket',
    ),
    dag=my_dag,
)
@vannguyen3007
vannguyen3007 / logs statements.py
Created September 20, 2020 10:46
Python script that logs statements
import logging
import random
# Send log records to logFile.txt in append mode, formatted as
# "<timestamp> <LEVEL>-<message>" with a second-resolution timestamp.
logging.basicConfig(
    filename="logFile.txt",
    filemode='a',
    format='%(asctime)s %(levelname)s-%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
for i in xrange(0,15):
x=random.randint(0,2)
if(x==0):
@vannguyen3007
vannguyen3007 / Config SQS.json
Created September 20, 2020 12:18
Configure an SQS queue
{
"Version": "2012-10-17",
"Id": "example-ID",
"Statement": [
{
"Sid": "example-statement-ID",
"Effect": "Allow",
"Principal": {
"AWS":"*"
},
@vannguyen3007
vannguyen3007 / logstash-simple.cfg
Last active September 20, 2020 12:56
Details of input, output file.
input{
file{
path => "full/path/to/log_file/location/logFile.txt"
start_position => "beginning"
}
}
filter
{
grok{
match => {"message" => "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:log-level}-%{GREEDYDATA:message}"}
@vannguyen3007
vannguyen3007 / Import.py
Created October 2, 2020 10:46
Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import time
import math
import re
warnings.simplefilter('ignore')
from collections import Counter, defaultdict
@vannguyen3007
vannguyen3007 / scale_data.py
Created October 2, 2020 11:03
Scaling the data
# Build TF-IDF features (unigrams + bigrams) from the 'Text' column.
# The vocabulary and IDF weights are learned from the training split only and
# then reused for the test and cross-validation splits.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=3,
    max_features=100000,
)
train_text_feature_tfidf = tfidf_vectorizer.fit_transform(TrainData['Text'])
test_text_feature_tfidf = tfidf_vectorizer.transform(TestData['Text'])
cv_text_feature_tfidf = tfidf_vectorizer.transform(CV_Data['Text'])

# Scale every split with statistics learned from the training split.
# Fitting a separate scaler on the test/CV data would put each split on a
# different scale and leak evaluation data into the preprocessing step.
# with_mean=False skips centering, which the scaler requires for sparse input.
tfidf_scaler = StandardScaler(with_mean=False)
train_text_feature_tfidf = tfidf_scaler.fit_transform(train_text_feature_tfidf)
test_text_feature_tfidf = tfidf_scaler.transform(test_text_feature_tfidf)
cv_text_feature_tfidf = tfidf_scaler.transform(cv_text_feature_tfidf)
@vannguyen3007
vannguyen3007 / Classifiers.py
Created October 2, 2020 11:16
Stack classifiers
# --- Multinomial Naive Bayes --------------------------------------------
# The base model is fitted on its own so that clf_NB.classes_ is available
# for the log-loss call; the calibrated wrapper is then fitted on the same
# training data.
clf_NB = MultinomialNB(alpha=10**-5)
clf_NB.fit(Train_X, Train_Y)
calib_clf_NB = CalibratedClassifierCV(clf_NB, method="sigmoid")
calib_clf_NB.fit(Train_X, Train_Y)

# Log loss of the calibrated model on the cross-validation split.
nb_cv_proba = calib_clf_NB.predict_proba(CV_X)
nb_cv_loss = log_loss(CV_Y, nb_cv_proba, labels=clf_NB.classes_)
print("Naive Bayes CV Log Loss: " + str(np.round(nb_cv_loss, 4)))

# --- SGD classifier with log loss ---------------------------------------
# NOTE(review): "LR" presumably stands for logistic regression - confirm.
clf_LR = SGDClassifier(loss="log", alpha=10, class_weight="balanced")
clf_LR.fit(Train_X, Train_Y)
calib_clf_LR = CalibratedClassifierCV(clf_LR, method="sigmoid")
calib_clf_LR.fit(Train_X, Train_Y)