I hereby claim:
- I am pchng on github.
- I am pchng (https://keybase.io/pchng) on keybase.
- I have a public key whose fingerprint is F722 6230 DB84 09B3 FFB2 737B B1EB 75BC 98B5 23A8
To claim this, I am signing this object:
# This is the tiling approach in: https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf
def flashatt_spec(q: Float32[Tensor, "200"], k: Float32[Tensor, "200"], v: Float32[Tensor, "200"]) -> Float32[Tensor, "200"]:
    """Reference (non-tiled) attention over 1-D inputs.

    Builds the score matrix ``scores[i, j] = q[i] * k[j]``, applies a
    softmax along each row, and returns, for every row ``i``, the
    softmax-weighted sum of ``v``.

    Args:
        q: Query vector (annotated as length 200).
        k: Key vector (annotated as length 200).
        v: Value vector (annotated as length 200).

    Returns:
        A vector with one entry per element of ``q``.
    """
    # Outer product: one row per query element, one column per key element.
    scores = q[:, None] * k[None, :]
    # Subtract the per-row maximum before exponentiating. Softmax is
    # invariant to this shift, and it keeps every exponent <= 0 so exp()
    # cannot overflow.
    row_max = scores.max(1, keepdim=True)[0]
    weights = (scores - row_max).exp()
    soft = weights / weights.sum(1, keepdim=True)
    # Weight the values by each row's softmax distribution and reduce.
    return (v[None, :] * soft).sum(1)
@triton.jit |
// Same as above, but row/col set to x/y instead. | |
__global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) { | |
int row = blockIdx.x * blockDim.x + threadIdx.x; | |
int col = blockIdx.y * blockDim.y + threadIdx.y; | |
if (row < a && col < c) { | |
float sum = 0.0; | |
for (int i = 0; i < b; i++) { | |
// 1. If row/threadIdx.x is changing within the warp, then on each iteration the threads do a strided access: | |
// They will access elements separated by a stride of b. This results in non-coalesced accesses (multiple memory reads) |
// Computes out = left @ right, where `@` is matrix multiplication
// Dimensions: | |
// left: a x b | |
// right: b x c | |
// out: a x c | |
// Monolithic kernel: one thread per output element of `out`.
// (Each thread computes the dot product between a row in `left` and a col in `right`) | |
__global__ void matMul(float *left, float *right, float *out, int a, int b, int c) { | |
// Use y to index to rows, x to index to cols (just to match typical visualization) | |
// row indexes into left, col indexes into right. |
#include <stdio.h> | |
#define A 3000 | |
#define B 4000 | |
#define C 3000 | |
// Computes out = left @ right, where `@` is matrix multiplication
// Dimensions: | |
// left: a x b | |
// right: b x c |
# Fish prompt to show username, host, CWD, Python VirtualEnv, VCS (git, svn, hg) info. | |
function fish_prompt --description 'Write out the prompt' | |
set -l last_status $status | |
if not set -q __fish_git_prompt_show_informative_status | |
set -g __fish_git_prompt_show_informative_status 1 | |
end | |
if not set -q __fish_git_prompt_hide_untrackedfiles | |
set -g __fish_git_prompt_hide_untrackedfiles 1 | |
end |
I hereby claim:
To claim this, I am signing this object:
#!/usr/bin/env python | |
# Python 2.7.x | |
import argparse | |
import csv | |
import datetime | |
import sys | |
from collections import defaultdict | |
# Requires requests and beautifulsoup4: pip install beautifulsoup4 requests | |
import requests |
// Test whether Stream API operations are efficiently re-ordered. | |
// LOGGER is instance of org.slf4j.Logger | |
@Test | |
public void testStreamApiOrdering() { | |
final Random rnd = new Random(); | |
final int nItems = 10_000; | |
final List<Integer> values = new ArrayList<>(nItems); | |
for (int i = 0; i < nItems; ++i) { | |
values.add(rnd.nextInt(1000)); // Values between 0 and 999, inclusive. | |
} |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="utf-8"> | |
<title>Test of JavaScript single-threadedness</title> | |
</head> | |
<body> | |
<h1>Test of JavaScript single-threadedness</h1> | |
<button id="check">Check</button> | |
<script src="https://code.jquery.com/jquery-2.1.0.min.js"></script> |