I hereby claim:
- I am pchng on github.
- I am pchng (https://keybase.io/pchng) on keybase.
- I have a public key whose fingerprint is F722 6230 DB84 09B3 FFB2 737B B1EB 75BC 98B5 23A8
To claim this, I am signing this object:
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <title>Test of JavaScript single-threadedness</title> | |
| </head> | |
| <body> | |
| <h1>Test of JavaScript single-threadedness</h1> | |
| <button id="check">Check</button> | |
| <script src="https://code.jquery.com/jquery-2.1.0.min.js"></script> |
| // Test whether Stream API operations are efficiently re-ordered. | |
| // LOGGER is instance of org.slf4j.Logger | |
| @Test | |
| public void testStreamApiOrdering() { | |
| final Random rnd = new Random(); | |
| final int nItems = 10_000; | |
| final List<Integer> values = new ArrayList<>(nItems); | |
| for (int i = 0; i < nItems; ++i) { | |
| values.add(rnd.nextInt(1000)); // Values between 0 and 999, inclusive. | |
| } |
| #!/usr/bin/env python | |
| # Python 2.7.x | |
| import argparse | |
| import csv | |
| import datetime | |
| import sys | |
| from collections import defaultdict | |
| # Requires requests and beautifulsoup4: pip install beautifulsoup4 requests | |
| import requests |
I hereby claim:
To claim this, I am signing this object:
| # Fish prompt to show username, host, CWD, Python VirtualEnv, VCS (git, svn, hg) info. | |
| function fish_prompt --description 'Write out the prompt' | |
| set -l last_status $status | |
| if not set -q __fish_git_prompt_show_informative_status | |
| set -g __fish_git_prompt_show_informative_status 1 | |
| end | |
| if not set -q __fish_git_prompt_hide_untrackedfiles | |
| set -g __fish_git_prompt_hide_untrackedfiles 1 | |
| end |
| #include <stdio.h> | |
| #define A 3000 | |
| #define B 4000 | |
| #define C 3000 | |
| // Computes out = left @ right, where `@` is matrix multiplication | |
| // Dimensions: | |
| // left: a x b | |
| // right: b x c |
| // Computes out = left @ right, where `@` is matrix multiplication | |
| // Dimensions: | |
| // left: a x b | |
| // right: b x c | |
| // out: a x c | |
| // monolithic kernel: One thread per output element in matrix C. | |
| // (Each thread computes the dot product between a row in `left` and a col in `right`) | |
| __global__ void matMul(float *left, float *right, float *out, int a, int b, int c) { | |
| // Use y to index to rows, x to index to cols (just to match typical visualization) | |
| // row indexes into left, col indexes into right. |
| // Same as above, but row/col set to x/y instead. | |
| __global__ void matMulBad(float *left, float *right, float *out, int a, int b, int c) { | |
| int row = blockIdx.x * blockDim.x + threadIdx.x; | |
| int col = blockIdx.y * blockDim.y + threadIdx.y; | |
| if (row < a && col < c) { | |
| float sum = 0.0; | |
| for (int i = 0; i < b; i++) { | |
| // 1. If row/threadIdx.x is changing within the warp, then on each iteration the threads do a strided access: | |
| // They will access elements separated by a stride of b. This results in non-coalesced accesses (multiple memory reads) |
| # This is the tiling approach in: https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf | |
| def flashatt_spec(q: Float32[Tensor, "200"], k: Float32[Tensor, "200"], v: Float32[Tensor, "200"]) -> Float32[Tensor, "200"]: | |
| # Reference computation: take a row-wise softmax of the outer product of q and k, then use each row to form a weighted sum of v. | |
| # Pairwise products via broadcasting: x[i, j] = q[i] * k[j]. | |
| x = q[:, None] * k[None, :] | |
| # Subtract each row's maximum before exponentiating; this leaves the softmax unchanged (presumably done for numerical stability). | |
| # NOTE(review): the [0] assumes max(dim, keepdim) returns a (values, indices) pair, as torch.Tensor.max does -- confirm Tensor is torch's. | |
| x_max = x.max(1, keepdim=True)[0] | |
| x = x - x_max | |
| x_exp = x.exp() | |
| # Normalize along dim 1 so every row of soft sums to 1. | |
| soft = x_exp / x_exp.sum(1, keepdim=True) | |
| # out[i] = sum over j of v[j] * soft[i, j]. | |
| return (v[None, :] * soft).sum(1) | |
| @triton.jit |