Skip to content

Instantly share code, notes, and snippets.

View lemire's full-sized avatar
🚀
working hard and fast

Daniel Lemire lemire

🚀
working hard and fast
View GitHub Profile
@lemire
lemire / base64_runtime.cpp
Created April 2, 2024 18:37
base64 runtime functions (simdutf)
// on success: returns a non-negative integer indicating the size of the
// binary produced, it must be no larger than 2147483647 bytes.
// In case of error, a negative value is returned:
// * -2 indicates an invalid character,
// * -1 indicates a single character remained,
// * -3 indicates a possible overflow (i.e., more than 2 GB output).
@lemire
lemire / bench.sh
Created April 2, 2024 04:18
benchmark node base64
echo "base64 decode"
node benchmark/buffers/buffer-base64-decode.js
./out/Release/node benchmark/buffers/buffer-base64-decode.js
#echo "base64 encode"
#node benchmark/buffers/buffer-base64-encode.js
#./out/Release/node benchmark/buffers/buffer-base64-encode.js
echo "base64url decode"
node benchmark/buffers/buffer-base64url-decode.js
@lemire
lemire / sse.cs
Created April 1, 2024 13:31
utf8_validation.cs
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
{
int processedLength = 0;
if (pInputBuffer == null || inputLength <= 0)
{
return pInputBuffer;
}
if (inputLength > 128)
@lemire
lemire / validateutf8.cs
Created March 19, 2024 15:13
core utf-8 validation algorithm in C#
Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
TOO_SHORT | OVERLONG_2,
TOO_SHORT,
TOO_SHORT | OVERLONG_3 | SURROGATE,
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
// See https://twitter.com/pdimov2/status/1462802234761170949
#include <array>
#include <string_view>
#include <string>
#include <iostream>
// For now experimental/reflect is not available generally, but
// it should be standardized for C++ 26 ???? Still, we have
// access to it with the very latest llvm.
#include <experimental/reflect>
#################
# This starts a web server listening on port 8001, with debugging turned on.
# This should not be used to run the chatbot on a public website: it is meant
# for testing purposes only.
#################
from flask import Flask, request, jsonify
from flask import Flask, render_template, request, url_for
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
@lemire
lemire / embedding.py
Created December 6, 2023 03:12
generate embeddings
###############
# You should basically never use this program. It is only for generating the embeddings for your ChatBot.
# If you want to run the ChatBot, see web_app.py
###############
import os
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas
import openai
import glob
import time
@lemire
lemire / testcurl.c
Created November 21, 2023 19:20
C program to test curl URL normalization
// cc test.c -lcurl -o testcurl && ./testcurl
#include <curl/curl.h>
#include <stdio.h>
int main() {
CURLU *url = curl_url();
CURLUcode rc = curl_url_set(
url, CURLUPART_URL, "https://www.7‑Eleven.com/Home/Privacy/Montréal", 0);
// Returns a CURLUcode error value, which is (0) if everything went fine.
if (rc == 0) {
char *buffer;
@lemire
lemire / demosimdjson.cpp
Last active October 25, 2023 20:44
demosimdjson
#include "simdjson.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace simdjson;
std::string read_file(const std::string &filename) {
std::ifstream file(filename);
@lemire
lemire / test.js
Created September 29, 2023 22:23
"use strict";
import { bench, run } from "mitata";
import { existsSync, createWriteStream, readFileSync, mkdirSync } from "node:fs";
import path from "node:path";
import axios from "axios";
const fixturesFolderPath = new URL('fixtures', import.meta.url).pathname;
const urls = [
"https://github.com/ada-url/url-various-datasets/blob/main/files/isaacs_files.txt",