
gxercavins / dataflow3.py
Created October 30, 2019 18:20
SO question 58545759
import datetime, re, time
from airflow import models
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator, GoogleCloudBucketHelper
from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook
from airflow.models import BaseOperator
from typing import Dict, List
JOB_NAME='dataflow-python3'
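Only the imports survive in this preview. Given the DataFlowHook import and the question (Airflow 1.10's Dataflow hook shells out to python2), the gist presumably subclasses the hook so the pipeline is launched with python3 instead. A rough sketch of that idea; the internal method names mirror the Airflow 1.10 contrib hook and vary between versions:

from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

class DataFlow3Hook(DataFlowHook):
    """Assumed workaround: identical to DataFlowHook, but launches with python3."""

    def start_python_dataflow(self, job_name, variables, dataflow, py_options,
                              append_job_name=True):
        name = self._build_dataflow_job_name(job_name, append_job_name)
        variables['job_name'] = name

        def label_formatter(labels_dict):
            return ['--labels={}={}'.format(k, v) for k, v in labels_dict.items()]

        # Same as the parent implementation except the interpreter is python3.
        self._start_dataflow(variables, name, ["python3"] + py_options + [dataflow],
                             label_formatter)

A matching operator built on BaseOperator (as the imports above hint) would then call this hook from its execute() method.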
gxercavins / RecursiveGCS.java
Created October 28, 2019 19:46
SO question 58589470
package com.dataflow.samples;
import java.util.ArrayList;
import java.util.List;
import com.google.api.gax.paging.Page;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.BucketInfo;
import com.google.cloud.storage.Storage;
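Only the imports are visible, but Page/Blob/Bucket/Storage suggest the gist walks a bucket with the Cloud Storage client so that objects in every "subdirectory" get picked up. A minimal Python equivalent of that idea (bucket and prefix are placeholders), using google-cloud-storage:

from google.cloud import storage

def list_blobs_recursively(bucket_name, prefix):
    """Return every object under the prefix, however deeply 'nested' its name is."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # Without a delimiter, list_blobs is effectively recursive: GCS object names
    # are flat, so everything under the prefix comes back in one listing.
    return ['gs://{}/{}'.format(bucket_name, blob.name)
            for blob in bucket.list_blobs(prefix=prefix)]

print(list_blobs_recursively('BUCKET_NAME', 'path/to/folder/'))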
gxercavins / CombineFnLogger.java
Created October 7, 2019 19:23
SO question 58268236
package org.apache.beam.examples;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Combine.CombineFn;
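Judging from the imports and the file name, this is a toy CombineFn instrumented with logging to show when each stage runs (the Avro-coded accumulator hinted at by the imports is not reproduced here). A comparable sketch in the Beam Python SDK, computing a logged mean:

import logging
import apache_beam as beam

class LoggingMeanFn(beam.CombineFn):
    """Mean of numeric inputs, logging every CombineFn stage (illustrative only)."""

    def create_accumulator(self):
        logging.info('create_accumulator')
        return 0.0, 0

    def add_input(self, accumulator, element):
        logging.info('add_input: %s', element)
        total, count = accumulator
        return total + element, count + 1

    def merge_accumulators(self, accumulators):
        logging.info('merge_accumulators')
        totals, counts = zip(*accumulators)
        return sum(totals), sum(counts)

    def extract_output(self, accumulator):
        logging.info('extract_output')
        total, count = accumulator
        return total / count if count else float('NaN')

with beam.Pipeline() as p:
    (p | beam.Create([1, 2, 3, 4])
       | beam.CombineGlobally(LoggingMeanFn())
       | beam.Map(print))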
gxercavins / TextToPubsub.java
Created October 7, 2019 14:45
Hard-code the use of private IPs in the GCS Text to Pub/Sub Dataflow template
/*
* Copyright (C) 2018 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
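The preview stops inside the Apache license header, so none of the actual edit is visible; per the description, the template source is changed so workers always get private IPs. For reference, the equivalent knob in the Beam Python SDK is the use_public_ips worker option (the --no_use_public_ips flag):

from apache_beam.options.pipeline_options import PipelineOptions, WorkerOptions

options = PipelineOptions()
# Keep Dataflow workers on private IPs only; same effect as passing
# --no_use_public_ips on the command line (the subnetwork needs Private Google Access).
options.view_as(WorkerOptions).use_public_ips = False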
gxercavins / write2tables.py
Last active September 15, 2019 09:15
SO question 57940102
import argparse, json, logging
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
PROJECT="PROJECT_ID"
BQ_DATASET="DATASET_NAME"
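Given the name and the question (one pipeline writing to several BigQuery tables), the gist likely picks the destination table per element. In the Python SDK that is done by passing a callable as the table argument of WriteToBigQuery; a small sketch with made-up field names:

import apache_beam as beam

PROJECT = 'PROJECT_ID'        # placeholders, as in the gist preview
BQ_DATASET = 'DATASET_NAME'

def table_fn(element):
    # Destination derived from the element itself; the 'type' field is hypothetical.
    return '{}:{}.{}'.format(PROJECT, BQ_DATASET, element['type'])

with beam.Pipeline() as p:
    (p
     | 'Create' >> beam.Create([{'type': 'clicks', 'value': 1},
                                {'type': 'views', 'value': 2}])
     | 'Write' >> beam.io.WriteToBigQuery(
           table=table_fn,
           create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))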
gxercavins / nlp.py
Last active August 14, 2019 16:09
SO question 57496400
import apache_beam as beam
import logging
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
PROJECT = 'PROJECT_ID'
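The imports point at the v1 Natural Language client being called from inside the pipeline. A guess at the core of it: a DoFn that runs sentiment analysis on each text element (the field names and output shape below are assumptions):

import apache_beam as beam
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

class AnalyzeSentimentFn(beam.DoFn):
    def start_bundle(self):
        # One client per bundle rather than per element.
        self.client = language.LanguageServiceClient()

    def process(self, element):
        document = types.Document(content=element,
                                  type=enums.Document.Type.PLAIN_TEXT)
        sentiment = self.client.analyze_sentiment(document=document).document_sentiment
        yield {'text': element,
               'score': sentiment.score,
               'magnitude': sentiment.magnitude}

with beam.Pipeline() as p:
    (p | beam.Create(['I love Beam', 'This is terrible'])
       | beam.ParDo(AnalyzeSentimentFn())
       | beam.Map(print))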
gxercavins / notifications.py
Created August 14, 2019 15:30
SO question 57494999
import argparse, json, logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class ExtractFn(beam.DoFn):
    def process(self, element):
        file_name = 'gs://' + "/".join(element['id'].split("/")[:-1])
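The preview cuts off inside process(). The 'id' field of a GCS change notification looks like "bucket/path/to/object/generation", so the last line rebuilds the gs:// path by dropping the generation. One plausible continuation (purely a guess) reads that object back with Beam's GcsIO helper:

import apache_beam as beam
from apache_beam.io.gcp.gcsio import GcsIO

class ExtractFn(beam.DoFn):
    def process(self, element):
        # element is assumed to be the already-parsed notification payload (a dict).
        file_name = 'gs://' + "/".join(element['id'].split("/")[:-1])
        # Read the notified object and emit its contents downstream (assumption).
        yield GcsIO().open(file_name).read().decode('utf-8')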
gxercavins / script.py
Created August 12, 2019 20:24
SO question 55370068
import argparse, datetime, logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class GetTimestampFn(beam.DoFn):
    """Prints element timestamp"""
    def process(self, element, timestamp=beam.DoFn.TimestampParam):
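The question is about reading an element's timestamp; DoFn.TimestampParam asks the runner to inject it. A hedged guess at how the rest of the DoFn and its usage might look:

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

class GetTimestampFn(beam.DoFn):
    """Prints element timestamp"""
    def process(self, element, timestamp=beam.DoFn.TimestampParam):
        # timestamp is a beam.utils.timestamp.Timestamp injected by the runner.
        yield '{} - {}'.format(timestamp.to_utc_datetime(), element)

with beam.Pipeline() as p:
    (p | beam.Create([('event', 1565000000)])
       # Attach the second tuple field as the element's event-time timestamp.
       | beam.Map(lambda kv: TimestampedValue(kv[0], kv[1]))
       | beam.ParDo(GetTimestampFn())
       | beam.Map(print))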
gxercavins / CacheHit.java
Created July 22, 2019 16:41
SO question 57141083
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.FieldValueList;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobId;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.JobStatistics.QueryStatistics;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.QueryResponse;
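The Java imports point at JobStatistics.QueryStatistics, which reports whether a query was served from BigQuery's result cache. The same check in Python, for comparison, is the cache_hit attribute of a finished query job:

from google.cloud import bigquery

client = bigquery.Client(project='PROJECT_ID')   # placeholder project
query_job = client.query('SELECT 17')
query_job.result()                               # wait for the job to finish
# Mirrors QueryStatistics#getCacheHit() in the Java client library.
print('cache hit: {}'.format(query_job.cache_hit))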
gxercavins / MultipleTopics.java
Created July 16, 2019 19:44
SO question 57053573
package com.dataflow.samples;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
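From the question (writing one pipeline's output to several Pub/Sub topics) and the PubsubIO/ParDo imports, the gist presumably fans the PCollection out into one write per topic. A Python sketch of that pattern; the project, subscription, topic names and the 'topic' routing field are all placeholders:

import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

TOPICS = ['projects/PROJECT_ID/topics/topic-a',
          'projects/PROJECT_ID/topics/topic-b']

def by_topic(element, num_partitions):
    # Route each parsed message by its (hypothetical) 'topic' field.
    return TOPICS.index(element['topic'])

options = PipelineOptions(streaming=True)

with beam.Pipeline(options=options) as p:
    parts = (p
             | 'Read' >> beam.io.ReadFromPubSub(
                   subscription='projects/PROJECT_ID/subscriptions/input-sub')
             | 'Parse' >> beam.Map(json.loads)
             | 'Split' >> beam.Partition(by_topic, len(TOPICS)))
    # One WriteToPubSub per destination topic.
    for i, topic in enumerate(TOPICS):
        (parts[i]
         | 'Encode{}'.format(i) >> beam.Map(lambda d: json.dumps(d).encode('utf-8'))
         | 'Write{}'.format(i) >> beam.io.WriteToPubSub(topic))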