Skip to content

Instantly share code, notes, and snippets.

@gxercavins
gxercavins / run_on_gcp.sh
Created February 8, 2019 20:30
Test for SO question 53404579
#!/bin/bash
# Validate the command line and export the two positional arguments.
#   $1 - Project ID, exported as PROJECT
#   $2 - GCS bucket name without the gs:// prefix, exported as BUCKET
if [ "$#" -ne 2 ]; then
  # Usage errors go to stderr and end with a non-zero status so that callers
  # can detect the failure; a bare `exit` would return the status of the
  # preceding echo, i.e. 0 (success).
  echo "Please specify Project ID and GCS Bucket Name (without gs:// prefix)" >&2
  echo "Usage: ./run_on_gcp.sh project-id bucket-name" >&2
  exit 1
fi
export PROJECT="$1"
export BUCKET="$2"
@gxercavins
gxercavins / NonStandardDelimiters.java
Created February 17, 2019 21:35
Test for SO question 54638963
package com.dataflow.samples;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.Default;
package com.dataflow.samples;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.beam.sdk.Pipeline;
import argparse
import logging
import apache_beam as beam
import apache_beam.transforms.window as window
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
# Window length in seconds. NOTE(review): presumably consumed by a windowing
# transform further down; the usage is not visible in this excerpt - confirm.
WINDOW_SECONDS = 10
package com.dataflow.samples;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.joda.time.Duration;
import org.joda.time.Instant;
import java.util.concurrent.TimeUnit;
import java.util.Date;
import java.sql.Timestamp;
import org.apache.beam.sdk.Pipeline;
@gxercavins
gxercavins / gbk_sessions.py
Created April 1, 2019 12:03
SO question 55219481: use session windows with Python SDK
import argparse, json, logging, time
import apache_beam as beam
import apache_beam.transforms.window as window
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class DebugPrinter(beam.DoFn):
@gxercavins
gxercavins / more_sessions.py
Last active January 6, 2020 10:51
SO question 55261957 - Per-session statistics
import argparse, json, logging, time
import apache_beam as beam
import apache_beam.transforms.window as window
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class AnalyzeSession(beam.DoFn):
apache-beam[gcp]==2.9.0
inflect
@gxercavins
gxercavins / zip.py
Created April 10, 2019 14:54
SO question 55485228 approach 2
import argparse, logging, time
import inflect
import apache_beam as beam
import apache_beam.transforms.combiners as combine
from apache_beam.transforms.userstate import BagStateSpec
from apache_beam.coders import VarIntCoder
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
@gxercavins
gxercavins / DataflowSHA256.java
Last active April 21, 2019 20:16
SO question 55784467
package com.dataflow.samples;
import java.io.IOException;
import java.security.MessageDigest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.KV;