Aravind Yarram yaravind

## parse_s3_access_logs.sql

/*
  Usage: you'll want to search for the strings <bucket> and <prefix>, and insert the S3 bucket where your access
  logs are being delivered. Use (or delete) <prefix> to filter to a subset of your logs.
*/


/*
  These commented out configuration settings you can either run  yourself in the REPL and source this file using
  `.read parse_s3_access_logs.sql`, or you can uncomment them and supply values for yourself.

## download_adls_directory.py
# coding: utf-8
import os
from azure.storage.blob import BlobServiceClient

class DownloadADLS:
  def __init__(self, connection_string, container_name):
    service_client = BlobServiceClient.from_connection_string(connection_string)
    self.client = service_client.get_container_client(container_name)


## xsd-schema.scala
// need to add the Apache WS XMLSchema library to spark/jars (does not have dependencies)
// https://repo1.maven.org/maven2/org/apache/ws/xmlschema/xmlschema-core/2.2.5/xmlschema-core-2.2.5.jar

import org.apache.ws.commons.schema.XmlSchemaCollection
import java.io.StringReader
import scala.collection.JavaConverters._
import org.apache.ws.commons.schema._
import org.apache.ws.commons.schema.constants.Constants
import org.apache.spark.sql.types._

## generator.py
import pandas as pd
from tqdm import tqdm
import csv
import random
import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

random.seed(1999)

## vs-code-for-scala.md

      
              1 file
            
          
              5 forks
            
          
              0 comments
            
          
              11 stars
            
          
                olofwalker
                / vs-code-for-scala.md
            
            
              Last active
              October 9, 2022 21:55
            
              
                Some notes on VS Code for Scala programmers
              
          
    VS Code Plugins and configuration tips

https://code.visualstudio.com/docs/setup/setup-overview
Plugins


Project manager
https://marketplace.visualstudio.com/items?itemName=alefragnani.project-manager
Scala metals
https://marketplace.visualstudio.com/items?itemName=scalameta.metals


## fpmax.scala
package fpmax

import scala.util.Try
import scala.io.StdIn.readLine

object App0 {
  def main: Unit = {
    println("What is your name?")

    val name = readLine()

## Introduction to NLP with Python.ipynb

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              5 stars
            
          
                ParthaSSatpathy
                / Introduction to NLP with Python.ipynb
            
            
              Last active
              April 11, 2022 23:36
            
              
                Introduction to Natural Language Processing Using Python
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SparkSQLJira.scala
package com.databricks.spark.jira

import scala.io.Source

import org.apache.spark.rdd.RDD

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.sources.{TableScan, BaseRelation, RelationProvider}

## spark_tips_and_tricks.md

      
              1 file
            
          
              20 forks
            
          
              1 comment
            
          
              74 stars
            
          
                dusenberrymw
                / spark_tips_and_tricks.md
            
            
              Last active
              February 8, 2023 05:11
            
              
                Tips and tricks for Apache Spark.
              
          
    Spark Tips & Tricks

Misc. Tips & Tricks


If values are integers in [0, 255], Parquet will automatically compress to use 1 byte unsigned integers, thus decreasing the size of saved DataFrame by a factor of 8.
Partition DataFrames to have evenly-distributed, ~128MB partition sizes (empirical finding).  Always err on the higher side w.r.t. number of partitions.
Pay particular attention to the number of partitions when using flatMap, especially if the following operation will result in high memory usage. The flatMap op usually results in a DataFrame with a [much] larger number of rows, yet the number of partitions will remain the same. Thus, if a subsequent op causes a large expansion of memory usage (i.e. converting a DataFrame of indices to a DataFrame of large Vectors), the memory usage per partition may become too high. In this case, it is beneficial to repartition the output of flatMap to a number of partitions that will safely allow for appropriate partition memory sizes, based upon the


## win10-dev.md

      
              1 file
            
          
              13 forks
            
          
              5 comments
            
          
              121 stars
            
          
                wsargent
                / win10-dev.md
            
            
              Last active
              March 21, 2024 04:27
            
              
                Windows Development Environment for Scala
              
          
    Windows 10 Development Environment for Scala

This is a guide for Scala and Java development on Windows, using Windows Subsystem for Linux, although a bunch of it is applicable to a VirtualBox / Vagrant / Docker subsystem environment.  This is not complete, but is intended to be as step by step as possible.
Harden Windows 10

Read the entire Decent Security guide, and follow the instructions, especially:

Set UAC to full
Enable Drive Encryption

	/*
	Usage: you'll want to search for the strings <bucket> and <prefix>, and insert the S3 bucket where your access
	logs are being delivered. Use (or delete) <prefix> to filter to a subset of your logs.
	*/


	/*
	These commented out configuration settings you can either run yourself in the REPL and source this file using
	`.read parse_s3_access_logs.sql`, or you can uncomment them and supply values for yourself.
	# coding: utf-8
	import os
	from azure.storage.blob import BlobServiceClient

	class DownloadADLS:
	def __init__(self, connection_string, container_name):
	service_client = BlobServiceClient.from_connection_string(connection_string)
	self.client = service_client.get_container_client(container_name)
	// need to add the Apache WS XMLSchema library to spark/jars (does not have dependencies)
	// https://repo1.maven.org/maven2/org/apache/ws/xmlschema/xmlschema-core/2.2.5/xmlschema-core-2.2.5.jar

	import org.apache.ws.commons.schema.XmlSchemaCollection
	import java.io.StringReader
	import scala.collection.JavaConverters._
	import org.apache.ws.commons.schema._
	import org.apache.ws.commons.schema.constants.Constants
	import org.apache.spark.sql.types._
	import pandas as pd
	from tqdm import tqdm
	import csv
	import random
	import string
	from pyspark.sql import SparkSession
	from pyspark.sql.functions import *

	random.seed(1999)
	package fpmax

	import scala.util.Try
	import scala.io.StdIn.readLine

	object App0 {
	def main: Unit = {
	println("What is your name?")

	val name = readLine()
	package com.databricks.spark.jira

	import scala.io.Source

	import org.apache.spark.rdd.RDD

	import org.apache.spark.sql._
	import org.apache.spark.sql.functions._
	import org.apache.spark.sql.sources.{TableScan, BaseRelation, RelationProvider}