Manu Zhang manuzhang

🧒
Working from home
View GitHub Profile
@manuzhang
manuzhang / spark_limit_bucket.md
Last active March 15, 2022 12:08
Configs for taking limit rows from bucketed table
| Name | Default | Meaning | Since Version |
| --- | --- | --- | --- |
| spark.sql.files.maxPartitionBytes | 128 MB | The maximum number of bytes to pack into a single partition when reading files | 2.0.0 |
| spark.sql.sources.bucketing.enabled | true | When false, bucketed tables are treated as normal tables | 2.0.0 |
| spark.sql.sources.bucketing.autoBucketedScan.enabled | true | When true, Spark decides automatically whether to do a bucketed scan on input tables based on the query plan | 3.1.1 |
| spark.sql.limit.scaleUpFactor | 4 | Minimal increase rate in number of partitions between attempts when executing a take on a query | 2.1.1 |
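These settings can be flipped per session before running the take; a minimal sketch, assuming an active SparkSession named `spark` and a hypothetical bucketed table `events` (the values are illustrative, not recommendations):

// minimal sketch: `spark` is an active SparkSession, `events` a hypothetical bucketed table
spark.conf.set("spark.sql.sources.bucketing.enabled", "true")
spark.conf.set("spark.sql.sources.bucketing.autoBucketedScan.enabled", "false")
spark.conf.set("spark.sql.files.maxPartitionBytes", "134217728") // 128 MB
spark.conf.set("spark.sql.limit.scaleUpFactor", "4")
val rows = spark.table("events").take(100) // take() is what scaleUpFactor governs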
package org.apache.spark

import java.io.{BufferedInputStream, PrintWriter}
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{LocalFileSystem, Path}

import org.apache.spark.io.CompressionCodec
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

import scala.io.Source

class AdaptiveQueryExecSuite
  extends QueryTest
  with SharedSparkSession
  with AdaptiveSparkPlanHelper {

  test("Empty stage") {
    withSQLConf(
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
      SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "true") {
      withTempView("v1", "v2") {
const puppeteer = require('puppeteer');

(async () => {
  // launch Chromium routed through a local HTTP proxy
  const browser = await puppeteer.launch({
    args: [
      '--proxy-server=http://127.0.0.1:1087'
    ]
  });
  // visit each URL passed on the command line
  for (const url of process.argv.slice(2)) {
import $ivy.`org.seleniumhq.selenium:selenium-chrome-driver:3.0.1`
import org.openqa.selenium.JavascriptExecutor
import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions}
System.setProperty("webdriver.chrome.driver", "/Users/doriadong/bin/chromedriver")
val options = new ChromeOptions()
options.addArguments("--proxy-server=http://127.0.0.1:1087")
val driver = new ChromeDriver(options)
driver.get("https://overcast.fm/+E6UAAGfPU")
@manuzhang
manuzhang / opml.sc
Last active July 29, 2019 23:16
Analyzing my overcast data
import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import $ivy.`com.lihaoyi::requests:0.1.8`
import $ivy.`org.scala-lang.modules::scala-xml:1.2.0`
import $ivy.`org.seleniumhq.selenium:selenium-chrome-driver:3.0.1`
import org.openqa.selenium.JavascriptExecutor
import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions}
import requests.TimeoutException
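Only the imports survive in this preview; as a rough sketch of the kind of analysis the description points at, parsing an Overcast OPML export with scala-xml could look like this (the file name and the played/userUpdatedDate attribute names are assumptions about the export format, not taken from the gist):

import java.time.ZonedDateTime
import scala.xml.XML

// hypothetical analysis: count played episodes and find the most recent listen
val opml = XML.loadFile("overcast.opml") // assumed local export file
val episodes = (opml \\ "outline").filter(o => (o \@ "type") == "podcast-episode")
val played = episodes.filter(o => (o \@ "played") == "1") // attribute name assumed
val lastListen = played
  .map(o => o \@ "userUpdatedDate") // attribute name assumed
  .filter(_.nonEmpty)
  .map(s => ZonedDateTime.parse(s))
  .reduceOption((a, b) => if (a.isAfter(b)) a else b)
println(s"${played.size} of ${episodes.size} episodes played, last listen: $lastListen")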
@manuzhang
manuzhang / GetReadme.scala
Created June 28, 2019 00:30
Get Github README for top starred repositories of my favorite languages
package io.github.manuzhang
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future, blocking}
import scala.util.{Failure, Success, Try}
object GetReadme extends App with GitHub {
val languages = List(
import base64
from bs4 import BeautifulSoup
import json
import mistune
import re
def load_results(fileName):
    # load a previously saved JSON results file
    with open(fileName, "r") as f:
        return json.loads(f.read())
#!/usr/bin/python3.6
from jupyter_client import BlockingKernelClient
from jupyter_client import KernelManager
import codecs
import nbformat
nb = nbformat.read(codecs.open("./empty_cell.ipynb",'r',encoding='utf-8'),as_version=4)
km = KernelManager(kernel_name=nb.metadata.get('kernelspec', {}).get('name', 'python'))
@manuzhang
manuzhang / spark-monitor.py
Created March 5, 2019 07:03
Monitor CPU and memory usage from Spark master UI
# coding: utf-8
from bs4 import BeautifulSoup
import requests
# fetch the master UI landing page and parse it
page = requests.get("http://spark-master.com").content
soup = BeautifulSoup(page, 'html.parser')
# the text node right after the "Cores in use:" label holds the core counts
cores_text = soup.find('strong', string='Cores in use:').next_sibling
cores_parts = cores_text.strip().split('\n')