Manu Zhang manuzhang

🧒
Working from home
View GitHub Profile
@manuzhang
manuzhang / spark_limit_bucket.md
Last active March 15, 2022 12:08
Configs for taking limit rows from bucketed table
| Name | Default | Meaning | Since Version |
| --- | --- | --- | --- |
| spark.sql.files.maxPartitionBytes | 128 MB | The maximum number of bytes to pack into a single partition when reading files | 2.0.0 |
| spark.sql.sources.bucketing.enabled | true | When false, bucketed tables are treated as normal tables | 2.0.0 |
| spark.sql.sources.bucketing.autoBucketedScan.enabled | true | When true, Spark decides automatically whether to do a bucketed scan on input tables based on the query plan | 3.1.1 |
| spark.sql.limit.scaleUpFactor | 4 | Minimal increase rate in number of partitions between attempts when executing a take on a query | 2.1.1 |
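These settings can be flipped per session before running the take; a minimal sketch, assuming an active SparkSession named `spark` and a hypothetical bucketed table `events` (the values are illustrative, not recommendations):

// minimal sketch: `spark` is an active SparkSession, `events` a hypothetical bucketed table
spark.conf.set("spark.sql.sources.bucketing.enabled", "true")
spark.conf.set("spark.sql.sources.bucketing.autoBucketedScan.enabled", "false")
spark.conf.set("spark.sql.files.maxPartitionBytes", "134217728") // 128 MB
spark.conf.set("spark.sql.limit.scaleUpFactor", "4")
val rows = spark.table("events").take(100) // take() is what scaleUpFactor governs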
package org.apache.spark

import java.io.{BufferedInputStream, PrintWriter}
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{LocalFileSystem, Path}

import org.apache.spark.io.CompressionCodec
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

import scala.io.Source

class AdaptiveQueryExecSuite
  extends QueryTest
  with SharedSparkSession
  with AdaptiveSparkPlanHelper {

  test("Empty stage") {
    withSQLConf(
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
      SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "true") {
      withTempView("v1", "v2") {
const puppeteer = require('puppeteer');

(async () => {
  // launch Chromium routed through a local HTTP proxy
  const browser = await puppeteer.launch({
    args: [
      '--proxy-server=http://127.0.0.1:1087'
    ]
  });
  // visit each URL passed on the command line
  for (const url of process.argv.slice(2)) {
import $ivy.`org.seleniumhq.selenium:selenium-chrome-driver:3.0.1`
import org.openqa.selenium.JavascriptExecutor
import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions}
System.setProperty("webdriver.chrome.driver", "/Users/doriadong/bin/chromedriver")
val options = new ChromeOptions()
options.addArguments("--proxy-server=http://127.0.0.1:1087")
val driver = new ChromeDriver(options)
driver.get("https://overcast.fm/+E6UAAGfPU")
@manuzhang
manuzhang / opml.sc
Last active July 29, 2019 23:16
Analyzing my overcast data
import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import $ivy.`com.lihaoyi::requests:0.1.8`
import $ivy.`org.scala-lang.modules::scala-xml:1.2.0`
import $ivy.`org.seleniumhq.selenium:selenium-chrome-driver:3.0.1`
import org.openqa.selenium.JavascriptExecutor
import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions}
import requests.TimeoutException
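Only the imports survive in this preview; as a rough sketch of the kind of analysis the description points at, parsing an Overcast OPML export with scala-xml could look like this (the file name and the played/userUpdatedDate attribute names are assumptions about the export format, not taken from the gist):

import java.time.ZonedDateTime
import scala.xml.XML

// hypothetical analysis: count played episodes and find the most recent listen
val opml = XML.loadFile("overcast.opml") // assumed local export file
val episodes = (opml \\ "outline").filter(o => (o \@ "type") == "podcast-episode")
val played = episodes.filter(o => (o \@ "played") == "1") // attribute name assumed
val lastListen = played
  .map(o => o \@ "userUpdatedDate") // attribute name assumed
  .filter(_.nonEmpty)
  .map(s => ZonedDateTime.parse(s))
  .reduceOption((a, b) => if (a.isAfter(b)) a else b)
println(s"${played.size} of ${episodes.size} episodes played, last listen: $lastListen")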
@manuzhang
manuzhang / GetReadme.scala
Created June 28, 2019 00:30
Get Github README for top starred repositories of my favorite languages
package io.github.manuzhang
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future, blocking}
import scala.util.{Failure, Success, Try}
object GetReadme extends App with GitHub {
val languages = List(
import base64
from bs4 import BeautifulSoup
import json
import mistune
import re
def load_results(fileName):
    # load a previously saved JSON results file
    with open(fileName, "r") as f:
        return json.loads(f.read())
#!/usr/bin/python3.6
from jupyter_client import BlockingKernelClient
from jupyter_client import KernelManager
import codecs
import nbformat
nb = nbformat.read(codecs.open("./empty_cell.ipynb",'r',encoding='utf-8'),as_version=4)
km = KernelManager(kernel_name=nb.metadata.get('kernelspec', {}).get('name', 'python'))
@manuzhang
manuzhang / spark-monitor.py
Created March 5, 2019 07:03
Monitor CPU and memory usage from Spark master UI
# coding: utf-8
from bs4 import BeautifulSoup
import requests
# fetch the master UI landing page and parse it
page = requests.get("http://spark-master.com").content
soup = BeautifulSoup(page, 'html.parser')
# the text node right after the "Cores in use:" label holds the core counts
cores_text = soup.find('strong', string='Cores in use:').next_sibling
cores_parts = cores_text.strip().split('\n')