peter pezon

## transformer.py
"""
PySpark tokenizer transformer

PySpark transformer that supports format-preserving encryption using the FFX1 algorithm (from pyffx library)

requirements:
  pyffx
  pyspark
"""
from string import digits, ascii_uppercase, ascii_lowercase

## create_us_counties_network.py
import json

import networkx as nx
import requests
from shapely.geometry import shape


def fetch_geojson():
    """Fetches GeoJSON for U.S. Counties.
    Example feature: {'GEO_ID': '0400000US23', 'STATE': '23', 'NAME': 'Maine',

## __main__.py
"""
Fetch song lyrics
Fetches song lyrics from Genius API based on a CSV with a list of songs.
Outputs song lyrics in a separate CSV.
CSVs are set through command line parameters.
Uses asynchronous routines to fetch many song lyrics at a time.

Requires Python 3.6+

Install:

## LowerCaseAlphabet.scala
package com.example.tokenizers.alphabet

/**
 * A TokenizerAlphabet whose CHARACTERS are the 26 lowercase ASCII
 * letters, 'a' through 'z', in alphabetical order.
 */
class LowerCaseAlphabet extends TokenizerAlphabet {
  // Built from an inclusive Char range; yields the same 26 elements in the
  // same order as listing every letter by hand.
  val CHARACTERS: Array[Char] = ('a' to 'z').toArray
}

## null_transformer.scala
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object NullTransformer {
  val nullStringColumnUDF = udf(() => None: Option[String])
  val nullLongColumnUDF = udf(() => None: Option[Long])
  val nullIntegerColumnUDF = udf(() => None: Option[Integer])
  val nullFloatColumnUDF = udf(() => None: Option[Float])
  val nullDoubleColumnUDF = udf(() => None: Option[Double])

## 01_description.md

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                pezon
                / 01_description.md
            
            
              Last active
              May 3, 2019 15:58
            
              
                Drop managed tables from Spark cluster
              
          
    Maintenance - Drop managed tables

This notebook will clean up the table space by dropping all managed tables, including the metadata and underlying data. This process is necessary when updating table schemas (e.g., changing data types).
Note: It is advised to back up your data first.
Recommended procedure


Back up data tables in ADLS (not implemented here)
Run maintenance - drop managed tables notebook (this notebook)
Re-import all data.


## gist:b2624643b3554eba18bb7d9eca223db6
from collections import defaultdict


class Collection(object):
    def __init__(self, items=None, id='id'):
        """Set up an empty collection and load any initial items.

        Args:
            items: Initial content handed to ``self.update``. ``None``
                (the default) is treated as an empty dict.
            id: Stored unchanged on ``self.id``. Presumably the name of
                the field that identifies each item -- confirm against
                ``update``, which lies outside this excerpt.
        """
        # A literal ``{}`` default is created once and shared by every call;
        # use a None sentinel and build a fresh dict per instance instead.
        if items is None:
            items = {}
        self.id = id
        self.items = defaultdict(dict)
        self.update(items)

    def __getitem__(self, key):

## README.md

      
              3 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                pezon
                / README.md
            
            
              Last active
              May 3, 2019 16:01
            
              
                scrape namecheap domains to a spreadsheet
              
          
    LAZY AF

Step 1. Load console-save.js
Step 2. Load namecheap-cart-scraper.js
Step 3. Convert with https://json-csv.com/
Step 4. Load to Google Spreadsheet


## README.md

      
              3 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                pezon
                / README.md
            
            
              Last active
              May 3, 2019 16:00
            
              
                scrape namecheap domains to a spreadsheet
              
          
    LAZY AF

Step 1. Load console-save.js
Step 2. Load namecheap-cart-scraper.js
Step 3. Convert with https://json-csv.com/
Step 4. Load to Google Spreadsheet


## json-to-table.py
"""
Proof of concept: json-to-table

Convert JSON object to tabular data using ijson,
which provides a SAX-like iterable for JSON objects

"""

from urlparse import urlparse
from collections import Counter, OrderedDict
	"""
	PySpark tokenizer transformer

	PySpark transformer that supports format-preserving encryption using the FFX1 algorithm (from pyffx library)

	requirements:
	pyffx
	pyspark
	"""
	from string import digits, ascii_uppercase, ascii_lowercase
	import json

	import networkx as nx
	import requests
	from shapely.geometry import shape


	def fetch_geojson():
	"""Fetches GeoJSON for U.S. Counties.
	Example feature: {'GEO_ID': '0400000US23', 'STATE': '23', 'NAME': 'Maine',
	"""
	Fetch song lyrics
	Fetches song lyrics from Genius API based on a CSV with a list of songs.
	Outputs song lyrics in a separate CSV.
	CSVs are set through command line parameters.
	Uses asynchronous routines to fetch many song lyrics at a time.

	Requires Python 3.6+

	Install:
	package com.example.tokenizers.alphabet

	/**
	 * A TokenizerAlphabet whose CHARACTERS are the 26 lowercase ASCII
	 * letters, 'a' through 'z', in alphabetical order.
	 */
	class LowerCaseAlphabet extends TokenizerAlphabet {
	  // Built from an inclusive Char range; yields the same 26 elements in the
	  // same order as listing every letter by hand.
	  val CHARACTERS: Array[Char] = ('a' to 'z').toArray
	}
	import org.apache.spark.sql._
	import org.apache.spark.sql.functions._
	import org.apache.spark.sql.types._

	object NullTransformer {
	val nullStringColumnUDF = udf(() => None: Option[String])
	val nullLongColumnUDF = udf(() => None: Option[Long])
	val nullIntegerColumnUDF = udf(() => None: Option[Integer])
	val nullFloatColumnUDF = udf(() => None: Option[Float])
	val nullDoubleColumnUDF = udf(() => None: Option[Double])
	from collections import defaultdict


	class Collection(object):
	def __init__(self, items=None, id='id'):
	    """Set up an empty collection and load any initial items.

	    Args:
	        items: Initial content handed to ``self.update``. ``None``
	            (the default) is treated as an empty dict.
	        id: Stored unchanged on ``self.id``. Presumably the name of
	            the field that identifies each item -- confirm against
	            ``update``, which lies outside this excerpt.
	    """
	    # A literal ``{}`` default is created once and shared by every call;
	    # use a None sentinel and build a fresh dict per instance instead.
	    if items is None:
	        items = {}
	    self.id = id
	    self.items = defaultdict(dict)
	    self.update(items)

	def __getitem__(self, key):
	"""
	Proof of concept: json-to-table

	Convert JSON object to tabular data using ijson,
	which provides a SAX-like iterable for JSON objects

	"""

	from urlparse import urlparse
	from collections import Counter, OrderedDict