Last active
March 7, 2025 19:07
Using Outlines to get structured output from R1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from typing import Literal | |
import outlines | |
import re | |
import torch | |
from transformers import AutoTokenizer | |
from outlines.fsm.json_schema import convert_json_schema_to_str | |
from outlines_core.fsm.json_schema import build_regex_from_schema | |
from pydantic import BaseModel | |
# Loading the model.
# model_string = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'
model_string = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B' # For smaller machines
# We'll use the transformers backend for this model, but you can use any other backend.
model = outlines.models.transformers(model_string, device='cuda')  # requires a CUDA-capable GPU
tokenizer = AutoTokenizer.from_pretrained(model_string)  # used only to apply the chat template below
# Set up response format you want the LLM to respond with.
class YesNo(BaseModel):
    """Structured response schema: a single ``answer`` field constrained to 'yes' or 'no'."""
    answer: Literal['yes', 'no']
# Build a regex from the JSON schema so generation is constrained to valid output.
yesno_regex = build_regex_from_schema(convert_json_schema_to_str(YesNo))
# Add the thinking prefix to the regex. The alternation matches any text that
# does not contain a closing '</think>' tag. The repeated alternation is a
# NON-capturing group wrapped in ONE outer capturing group, so that
# re.search(thinking_regex, ...).group(1) yields the entire chain of thought.
# (A capturing group placed directly under '*' would retain only the final
# one-or-two-character repetition — a subtle extraction bug.)
thinking_regex = (
    r'<think>((?:[^<]|<[^/]|</[^t]|</t[^h]|</th[^i]|'
    r'</thi[^n]|</thin[^k]|</think[^>])*)</think>\n'
)
result_regex = thinking_regex + yesno_regex
print(result_regex)
# Apply the chat template so the prompt matches the model's expected turn format.
prompt = tokenizer.apply_chat_template(
    [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Roses are red. Violets are blue. Are roses and violets the same color? Yes or no. Provide a chain of thought inside a <think> tag, closing with </think> when you are finished. After, please write JSON with the following schema: {"answer": "yes" | "no"}'},
    ],
    tokenize=False,              # return the templated prompt as a string, not token ids
    add_generation_prompt=True,  # append the assistant header so the model starts its reply
)
# Generator: compiling the regex into a token-level FSM can take a while,
# so we time it to make the one-off cost visible.
start_time = time.time()
generator = outlines.generate.regex(model, result_regex)
end_time = time.time()
print(f"Time taken to create generator: {end_time - start_time} seconds")
# Generate the response (constrained to match result_regex).
result = generator(prompt, max_tokens=1000)
print(result)
# Parse out the thinking + structured result.
# NOTE: the original used re.search(thinking_regex, result).group(1), but that
# capture group sits directly under a '*' repetition, so group(1) holds only
# the LAST repetition (one or two characters) rather than the whole chain of
# thought. A self-contained non-greedy DOTALL search captures the full
# <think> body instead, and we guard against a missing match rather than
# raising AttributeError on None.
thinking_match = re.search(r'<think>(.*?)</think>', result, re.DOTALL)
thinking_result = thinking_match.group(1).strip() if thinking_match else ''
structured_result = re.search(yesno_regex, result).group(0).strip()
# Print the result. First the chain of thought, then the structured JSON.
print("Chain of thought")
print("----------------")
print(thinking_result)
print("\nStructured output")
print("----------------")
print(structured_result)
# Parse the structured result into the Pydantic model (validates the JSON).
output = YesNo.model_validate_json(structured_result)
print("\nPydantic output")
print("----------------")
print(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you want to adapt this for your own use, you should only need to change the prompt and the Pydantic response structure.