mcpo_multi_reward_policy_optimization_quickstart.py
This quickstart demonstrates how to define multiple reward models and optimize a policy against them with MCPO.
import torch
from mcpo import MCPOConfig, MCPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Load the base model and tokenizer
model_name = "gpt2"  # Example base model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 2. Define multiple reward functions/models (simulated for this example)
def reward_fn_helpfulness(samples):
    # Logic to score samples based on helpfulness
    return torch.randn(len(samples))

def reward_fn_safety(samples):
    # Logic to score samples based on safety
    return torch.randn(len(samples))

reward_functions = [reward_fn_helpfulness, reward_fn_safety]
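
# In practice each reward function would wrap a trained scorer rather than
# torch.randn. A minimal sketch, assuming a public sequence-classification
# reward model (the checkpoint name here is an illustrative assumption, not
# part of MCPO itself):
from transformers import AutoModelForSequenceClassification

rm_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
rm_tokenizer = AutoTokenizer.from_pretrained(rm_name)
rm_model = AutoModelForSequenceClassification.from_pretrained(rm_name)

def reward_fn_learned(samples):
    # Tokenize the batch and return one scalar reward per sample
    inputs = rm_tokenizer(samples, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        return rm_model(**inputs).logits.squeeze(-1)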

# 3. Configure MCPO
# MCPO allows setting constraints or weights for different criteria
config = MCPOConfig(
    learning_rate=1e-5,
    batch_size=8,
    epochs=3,
    alpha=0.1,  # KL divergence coefficient
    criteria_weights=[0.6, 0.4]  # Weighting for helpfulness vs. safety
)

# 4. Initialize the Trainer
trainer = MCPOTrainer(
    model=model,
    config=config,
    reward_functions=reward_functions,
    tokenizer=tokenizer
)

# 5. Run the training loop on your preference dataset
# Each entry pairs a 'prompt' with 'chosen' and 'rejected' responses;
# here one preference pair is shared across both criteria
dataset = [
    {"prompt": "How do I make a cake?", "chosen": "Mix flour...", "rejected": "I don't know."},
    # ... more data
]

trainer.train(dataset)

# 6. Save the aligned model
trainer.save_model("./mcpo-aligned-model")
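
# 7. (Optional) Reload and smoke-test the aligned model. A minimal sketch,
# assuming save_model writes a standard transformers checkpoint; the prompt
# and generation settings are illustrative.
aligned_model = AutoModelForCausalLM.from_pretrained("./mcpo-aligned-model")
inputs = tokenizer("How do I make a cake?", return_tensors="pt")
outputs = aligned_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))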