Skip to content

Instantly share code, notes, and snippets.

View ishritam's full-sized avatar
🎯
Focusing

Shritam Kumar Mund ishritam

🎯
Focusing
View GitHub Profile
@ishritam
ishritam / zalando_etl.py
Last active June 20, 2021 12:10
Airflow DAG file For Zalando
import json
import requests
import pandas as pd
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
default_args = {
'start_date': datetime(year=2021, month=6, day=20)
@ishritam
ishritam / load.py
Created June 20, 2021 11:25
Zalando ETL Load
def load_data(path: str, ti) -> None:
    """Write the transformed article records pulled from XCom to a CSV file.

    Args:
        path: Destination path (or file-like buffer) for the CSV output.
        ti: Airflow task instance, used to pull the upstream XCom payload.
    """
    # xcom_pull with a *list* of task_ids returns a list of payloads,
    # one per task id — hence the [0] to unwrap the single payload.
    data = ti.xcom_pull(key='transformed_data', task_ids=['transform_data'])
    data_df = pd.DataFrame(data[0])
    # index=False (not the undocumented index=None) suppresses the row index.
    data_df.to_csv(path, index=False)
@ishritam
ishritam / transform.py
Created June 20, 2021 11:22
ETL_zalando_transform
def transform_data(ti) -> None:
data = ti.xcom_pull(key='extracted_data', task_ids=['extract_data'])[0]
transformed_data = []
for item in data:
transformed_data.append({
'sku': item.get("sku", ""),
'Name': item['name'],
'Price': item['price'].get("original"),
'Brand Name': item['brand_name'],
'Thumbnail': f"https://img01.ztat.net/article/{item['media'][0]['path']}",
@ishritam
ishritam / extraction.py
Created June 20, 2021 11:10
Airflow Zalando Extraction
def extract_data(url: str, headers: dict, ti) -> None:
    """Fetch the Zalando article listing and push it to XCom for downstream tasks.

    Args:
        url: Endpoint to fetch the article listing from.
        headers: HTTP request headers (a mapping — the original annotation
            said ``str``, but ``requests`` expects a dict here).
        ti: Airflow task instance used to push the extracted payload.
    """
    # Bounded timeout so a stalled endpoint cannot hang the task forever.
    res = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of choking on a non-JSON error body.
    res.raise_for_status()
    json_data = json.loads(res.content)['articles']
    ti.xcom_push(key='extracted_data', value=json_data)
# coding=UTF-8
import pickle
import nltk
from nltk.corpus import brown
#TextBlob FastNPExtractor + ConllExtractor
# Textblob
from textblob import TextBlob
from textblob.np_extractors import FastNPExtractor
from textblob.np_extractors import ConllExtractor
# NOTE(review): this fragment appears to be a web-scrape of two separate
# gists glued together — the indentation of the try body has been lost in
# extraction, and the `except` body runs Elasticsearch bulk-indexing code
# that references names (`es`, `gen`, `df`) not defined at this point.
# Code lines are kept byte-identical; restore the original gists before use.
try:
# Imports for a Flask + Elasticsearch indexing service.
from flask import app,Flask
from flask_restful import Resource, Api, reqparse
import elasticsearch
from elasticsearch import Elasticsearch
import datetime
import concurrent.futures
import requests
import json
except Exception as e:
# Bulk-indexing call — presumably from a different gist: streams documents
# produced by gen(df) into the `es` client with a 300 s request timeout.
# TODO confirm where `es`, `gen`, and `df` are defined in the original.
from elasticsearch import helpers
res = helpers.bulk(es,gen(df),request_timeout= 300)
import uuid
def gen(df):
for i in df:
yield{
"_index" : "my_med",
"_type" : "_doc",
"_id" : uuid.uuid4(),
"_source" : {
#"name": i.get("name"),
"name":i.get("name"),
setting ={
"mappings" : {
"properties" : {
"Clean_Uses" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
# Load the cleaned dataset and expose it as a list of per-row dicts
# (one dict per CSV row, keyed by column name).
import pandas as pd

clean_frame = pd.read_csv('clean_data.csv')
df = clean_frame.to_dict('records')