vwxyzjn/repro.py

## repro.py
# Reproduction script: tokenize one two-turn chat with three different
# tokenizer / chat-template setups and print the last ten entries of each
# result so the endings can be compared side by side.
from transformers import AutoTokenizer


def _chat_template(turn_end):
    """Build the '### Problem:' / '### Solution:' Jinja chat template.

    ``turn_end`` is Jinja expression source that gets concatenated after the
    content of every user and assistant message.
    """
    return (
        "{% for message in messages %}"
        "{% if (message['role'] == 'system')%}{{ '' }}"
        "{% elif (message['role'] == 'user')%}"
        "{{ '### Problem: ' + message['content'] + " + turn_end + " }}"
        "{% elif (message['role'] == 'assistant')%}"
        "{{ '### Solution: ' + message['content'] + " + turn_end + " }}"
        "{% endif %}"
        "{% if loop.last and message['role'] == 'user' and add_generation_prompt %}"
        "{{ '### Solution: ' }}{% endif %}"
        "{% endfor %}"
    )


# Each turn ends with a newline only.
PLAIN_TEMPLATE = _chat_template("'\n'")
# Each turn ends with a newline followed by the template variable eos_token.
EOS_TEMPLATE = _chat_template("'\n' + eos_token")

# The conversation fed to every variant: one user question, one assistant answer.
data = [
    {
        'content': (
            'The wages earned by Robin is 30% more than that earned by Erica. '
            'The wages earned by Charles is 60% more than that earned by Erica. '
            'How much percent is the wages earned by Charles more than that earned by Robin?'
        ),
        'role': 'user',
    },
    {
        'content': (
            "Let's assume Erica earns a wage of $E.\n\n"
            "Robin earns 30% more than Erica, so Robin's wage is:\n"
            "Robin's wage = E + 30% of E\n"
            "Robin's wage = E + 0.30E\n"
            "Robin's wage = 1.30E\n\n"
            "Charles earns 60% more than Erica, so Charles's wage is:\n"
            "Charles's wage = E + 60% of E\n"
            "Charles's wage = E + 0.60E\n"
            "Charles's wage = 1.60E\n\n"
            "Now, we want to find out how much percent the wages earned by Charles is more than that earned by Robin. "
            "To do this, we calculate the difference between Charles's and Robin's wages and then find out what percentage this difference is of Robin's wages.\n\n"
            "Difference in wages = Charles's wage - Robin's wage\n"
            "Difference in wages = 1.60E - 1.30E\n"
            "Difference in wages = 0.30E\n\n"
            "Now, we find out what percentage this difference is of Robin's wages:\n\n"
            "Percentage difference = (Difference in wages / Robin's wage) * 100\n"
            "Percentage difference = (0.30E / 1.30E) * 100\n"
            "Percentage difference = (0.30 / 1.30) * 100\n"
            "Percentage difference ≈ 23.08%\n\n"
            "So, the wages earned by Charles is approximately $\\boxed{23.08\\%}$  more than that earned by Robin."
        ),
        'role': 'assistant',
    },
]

base_model = "deepseek-ai/deepseek-math-7b-base"
print(f"{base_model=}")

# Variant 1: template with no EOS marker anywhere.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = PLAIN_TEMPLATE
tokens = tokenizer.apply_chat_template(data, add_generation_prompt=False)
print(f"{tokens[-10:]=}")

# Variant 2 (labelled "fix" in the original script): same tokenizer object,
# but the template itself appends eos_token after each user/assistant turn.
tokenizer.chat_template = EOS_TEMPLATE
tokens_with_eos = tokenizer.apply_chat_template(data, add_generation_prompt=False)
print(f"{tokens_with_eos[-10:]=}")

# Variant 3 (labelled "ed's fix"): freshly loaded tokenizer, the plain
# template again, and the tokenizer's add_eos_token attribute switched on.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = PLAIN_TEMPLATE
tokenizer.add_eos_token = True

# 3a: let apply_chat_template produce the result directly.
add_eos_token_tokens = tokenizer.apply_chat_template(data, add_generation_prompt=False)
print(f"{add_eos_token_tokens[-10:]=}")

# 3b: render the chat with tokenize=False first, then pass that rendered
# output through the tokenizer call and read its "input_ids".
rendered_chat = tokenizer.apply_chat_template(
    data, add_generation_prompt=False, tokenize=False
)
add_eos_token_tokens2 = tokenizer(rendered_chat)["input_ids"]
print(f"{add_eos_token_tokens2[-10:]=}")
	# Reproduction script: tokenize one two-turn chat with three different
	# tokenizer / chat-template setups and print the last ten entries of each
	# result so the endings can be compared side by side.
	from transformers import AutoTokenizer


	def _chat_template(turn_end):
		"""Build the '### Problem:' / '### Solution:' Jinja chat template.

		``turn_end`` is Jinja expression source that gets concatenated after the
		content of every user and assistant message.
		"""
		return (
			"{% for message in messages %}"
			"{% if (message['role'] == 'system')%}{{ '' }}"
			"{% elif (message['role'] == 'user')%}"
			"{{ '### Problem: ' + message['content'] + " + turn_end + " }}"
			"{% elif (message['role'] == 'assistant')%}"
			"{{ '### Solution: ' + message['content'] + " + turn_end + " }}"
			"{% endif %}"
			"{% if loop.last and message['role'] == 'user' and add_generation_prompt %}"
			"{{ '### Solution: ' }}{% endif %}"
			"{% endfor %}"
		)


	# Each turn ends with a newline only.
	PLAIN_TEMPLATE = _chat_template("'\n'")
	# Each turn ends with a newline followed by the template variable eos_token.
	EOS_TEMPLATE = _chat_template("'\n' + eos_token")

	# The conversation fed to every variant: one user question, one assistant answer.
	data = [
		{
			'content': (
				'The wages earned by Robin is 30% more than that earned by Erica. '
				'The wages earned by Charles is 60% more than that earned by Erica. '
				'How much percent is the wages earned by Charles more than that earned by Robin?'
			),
			'role': 'user',
		},
		{
			'content': (
				"Let's assume Erica earns a wage of $E.\n\n"
				"Robin earns 30% more than Erica, so Robin's wage is:\n"
				"Robin's wage = E + 30% of E\n"
				"Robin's wage = E + 0.30E\n"
				"Robin's wage = 1.30E\n\n"
				"Charles earns 60% more than Erica, so Charles's wage is:\n"
				"Charles's wage = E + 60% of E\n"
				"Charles's wage = E + 0.60E\n"
				"Charles's wage = 1.60E\n\n"
				"Now, we want to find out how much percent the wages earned by Charles is more than that earned by Robin. "
				"To do this, we calculate the difference between Charles's and Robin's wages and then find out what percentage this difference is of Robin's wages.\n\n"
				"Difference in wages = Charles's wage - Robin's wage\n"
				"Difference in wages = 1.60E - 1.30E\n"
				"Difference in wages = 0.30E\n\n"
				"Now, we find out what percentage this difference is of Robin's wages:\n\n"
				"Percentage difference = (Difference in wages / Robin's wage) * 100\n"
				"Percentage difference = (0.30E / 1.30E) * 100\n"
				"Percentage difference = (0.30 / 1.30) * 100\n"
				"Percentage difference ≈ 23.08%\n\n"
				"So, the wages earned by Charles is approximately $\\boxed{23.08\\%}$ more than that earned by Robin."
			),
			'role': 'assistant',
		},
	]

	base_model = "deepseek-ai/deepseek-math-7b-base"
	print(f"{base_model=}")

	# Variant 1: template with no EOS marker anywhere.
	tokenizer = AutoTokenizer.from_pretrained(base_model)
	tokenizer.chat_template = PLAIN_TEMPLATE
	tokens = tokenizer.apply_chat_template(data, add_generation_prompt=False)
	print(f"{tokens[-10:]=}")

	# Variant 2 (labelled "fix" in the original script): same tokenizer object,
	# but the template itself appends eos_token after each user/assistant turn.
	tokenizer.chat_template = EOS_TEMPLATE
	tokens_with_eos = tokenizer.apply_chat_template(data, add_generation_prompt=False)
	print(f"{tokens_with_eos[-10:]=}")

	# Variant 3 (labelled "ed's fix"): freshly loaded tokenizer, the plain
	# template again, and the tokenizer's add_eos_token attribute switched on.
	tokenizer = AutoTokenizer.from_pretrained(base_model)
	tokenizer.chat_template = PLAIN_TEMPLATE
	tokenizer.add_eos_token = True

	# 3a: let apply_chat_template produce the result directly.
	add_eos_token_tokens = tokenizer.apply_chat_template(data, add_generation_prompt=False)
	print(f"{add_eos_token_tokens[-10:]=}")

	# 3b: render the chat with tokenize=False first, then pass that rendered
	# output through the tokenizer call and read its "input_ids".
	rendered_chat = tokenizer.apply_chat_template(
		data, add_generation_prompt=False, tokenize=False
	)
	add_eos_token_tokens2 = tokenizer(rendered_chat)["input_ids"]
	print(f"{add_eos_token_tokens2[-10:]=}")