RodriMora

## gist:280c3edde75d0fd99a27f5e4c42622bf
// ==UserScript==
// @name         YouTube Auto-Theater by Window Width (robust)
// @namespace    rodrigo.mora.autotheater
// @version      0.2
// @description  Toggle Theater mode automatically based on browser window width; works on reloads + SPA navigations.
// @match        https://www.youtube.com/*
// @exclude      https://www.youtube.com/embed/*
// @run-at       document-idle
// @grant        none
// @noframes

## gist:db422a0cd63eb44f535c3fda410d58a5
{"data_index": 1, "request": {"model": "deepseek-reasoner", "messages": [{"role": "system", "content": "你是 Kimi，由月之暗面科技有限公司（MoonshotAI）开发的人工智能助手。\n你的回答应该遵守中华人民共和国的法律。遇到比大小、数数等逻辑问题请一步一步思考再回答。\n\n今天是：2025年08月15日 星期五\n"}, {"role": "user", "content": "The livestock sector urgently needs digital transformation—leveraging IoT, big data, and AI—to boost efficiency and sustainability.这个表述地道吗"}, {"role": "assistant", "content": "整体来说，这句英文已经很流畅，但“地道”程度还可以再微调，让语气更自然、符合英语母语者的表达方式。下面给出几种更口语/书面化的改写，供你根据场景取舍：\n\n1. 简洁口语版  \n“The livestock industry badly needs a digital overhaul—think IoT, big data and AI—to become both more efficient and sustainable.”\n\n2. 书面正式版  \n“The livestock sector urgently requires comprehensive digital transformation, leveraging IoT, big-data analytics and artificial intelligence, to drive efficiency and sustainability.”\n\n3. 略带宣传口吻的商务版  \n“Digital transformation—powered by IoT, big data and AI—is no longer optional for the livestock sector; it’s the key to unlocking higher efficiency and long-term

## gist:0af77e35f55709e0ef5b7c45361b906b
I'm testing the newly released Qwen3-Next model with vLLM:
https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct

CUDA0 and CUDA4= nvidia 5090 (32gb vram)
CUDA2 = RTX PRO 6000 Blackwell (96gb vram)
CUDA1 = nvidia 3090 (24gb vram)

But I got an error when running this command:
TORCHDYNAMO_VERBOSE=1 CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,4,1 VLLM_PP_LAYER_PARTITION="8,29,8,3" VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve /mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct --port 8000 --max-model-len 8192 -pp 4

## gist:e3b6967d915e5625eefce17447190be8
VLLM_COMPILE_LEVEL=0 TORCHDYNAMO_VERBOSE=1 CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,4,1 VLLM_PP_LAYER_PARTITION="8,29,8,3" VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve /mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct --port 8000 --max-model-len 8192 -pp 4 --enforce-eager
INFO 09-11 22:50:23 [__init__.py:216] Automatically detected platform cuda.
(APIServer pid=290069) INFO 09-11 22:50:25 [api_server.py:1896] vLLM API server version 0.10.2rc2.dev313+g79ac59f32
(APIServer pid=290069) INFO 09-11 22:50:25 [utils.py:328] non-default args: {'model_tag': '/mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct', 'model': '/mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct', 'max_model_len': 8192, 'enforce_eager': True, 'pipeline_parallel_size': 4}
(APIServer pid=290069) INFO 09-11 22:50:33 [__init__.py:742] Resolved architecture: Qwen3NextForCausalLM
(APIServer pid=290069) `torch_dtype` is deprecated! Use `dtype` instead!
(APIServer pid=290069) INFO 09-11 22:50:33 [__init__.py:1815] Using max model len 8192
(AP

## gist:0f5ae0bfcb485228c49e623e41e0edb8
# Model directories available as tensor sources; each entry's id is what the
# override rules below refer to in their `source` field.
sources:
  - id: 4
    model_dir: /mnt/llms/models/bullerwins/GLM-4.5-exl3-4.0bpw
  - id: 5
    model_dir: /mnt/llms/models/bullerwins/GLM-4.5-exl3-5.0bpw

# Each rule pairs a tensor-name pattern (`key`) with the id of a source above.
overrides:
  # Attention & router tensors – cheap, big gain on MoE models
  # NOTE(review): only the self_attn pattern is visible here; a router rule is
  # presumably defined further down – confirm.
  # NOTE(review): the visible sources are 4.0 and 5.0 bpw, so source 5 is
  # +1 bpw relative to source 4; "+2 bpw" implies a 3.0 bpw base that is not
  # listed in this file – confirm which base model this is applied to.
  - key: "*.self_attn.*"
    source: 5          # +2 bpw

## gist:099913a7cea971d1bd09c623fc12c7bf
import openai

openai.api_key = "local-key"
openai.base_url = "http://192.168.10.115:5000/v1/"     # adjust to your proxy

tools = [
  {
    "type": "function",
    "function": {
      "name": "add",
	// ==UserScript==
	// @name YouTube Auto-Theater by Window Width (robust)
	// @namespace rodrigo.mora.autotheater
	// @version 0.2
	// @description Toggle Theater mode automatically based on browser window width; works on reloads + SPA navigations.
	// @match https://www.youtube.com/*
	// @exclude https://www.youtube.com/embed/*
	// @run-at document-idle
	// @grant none
	// @noframes
	I'm testing the newly released Qwen3-Next model with vLLM:
	https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct

	CUDA0 and CUDA4= nvidia 5090 (32gb vram)
	CUDA2 = RTX PRO 6000 Blackwell (96gb vram)
	CUDA1 = nvidia 3090 (24gb vram)

	But I got an error when running this command:
	TORCHDYNAMO_VERBOSE=1 CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,4,1 VLLM_PP_LAYER_PARTITION="8,29,8,3" VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve /mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct --port 8000 --max-model-len 8192 -pp 4
	VLLM_COMPILE_LEVEL=0 TORCHDYNAMO_VERBOSE=1 CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,4,1 VLLM_PP_LAYER_PARTITION="8,29,8,3" VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve /mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct --port 8000 --max-model-len 8192 -pp 4 --enforce-eager
	INFO 09-11 22:50:23 [__init__.py:216] Automatically detected platform cuda.
	(APIServer pid=290069) INFO 09-11 22:50:25 [api_server.py:1896] vLLM API server version 0.10.2rc2.dev313+g79ac59f32
	(APIServer pid=290069) INFO 09-11 22:50:25 [utils.py:328] non-default args: {'model_tag': '/mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct', 'model': '/mnt/llms/models/Qwen/Qwen3-Next-80B-A3B-Instruct', 'max_model_len': 8192, 'enforce_eager': True, 'pipeline_parallel_size': 4}
	(APIServer pid=290069) INFO 09-11 22:50:33 [__init__.py:742] Resolved architecture: Qwen3NextForCausalLM
	(APIServer pid=290069) `torch_dtype` is deprecated! Use `dtype` instead!
	(APIServer pid=290069) INFO 09-11 22:50:33 [__init__.py:1815] Using max model len 8192
	(AP
	sources:
	- id: 4
	model_dir: /mnt/llms/models/bullerwins/GLM-4.5-exl3-4.0bpw
	- id: 5
	model_dir: /mnt/llms/models/bullerwins/GLM-4.5-exl3-5.0bpw

	overrides:
	# Attention & router tensors – cheap, big gain on MoE models
	- key: "*.self_attn.*"
	source: 5 # +2 bpw
	import openai

	openai.api_key = "local-key"
	openai.base_url = "http://192.168.10.115:5000/v1/" # adjust to your proxy

	tools = [
	{
	"type": "function",
	"function": {
	"name": "add",