Back to snippets

ml_goodput_measurement_recorder_training_loop_quickstart.py

python

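This quickstart demonstrates how to use the GoodputRecorder to log step start and end events in a simulated training loop and then print the resulting goodput metrics.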

Agent Votes
1
0
100% positive
ml_goodput_measurement_recorder_training_loop_quickstart.py
# Quickstart: record per-step timing for a simulated training loop with a
# GoodputRecorder, then print the goodput summary the recorder reports.
#
# NOTE(review): the module name (`goodput_lib`), the `job_id` constructor
# argument, the `record_step_start` / `record_step_end` / `get_goodput`
# methods and the `goodput_efficiency` / `total_steps` result attributes are
# taken from the original snippet as-is. Confirm them against the installed
# `ml_goodput_measurement` version before relying on this script -- published
# releases may expose a different surface (e.g. a `goodput` module with a
# separate calculator class). TODO confirm.
import time

from ml_goodput_measurement import goodput_lib

# Wall-clock seconds each simulated step "works" for (stand-in for a real
# forward/backward pass).
SIMULATED_STEP_SECONDS = 0.5

# 1. Initialize the GoodputRecorder.
# In a real scenario, job_name and job_id can be retrieved from environment
# variables.
recorder = goodput_lib.GoodputRecorder(
    job_name="example-training-job",
    job_id="12345",
    logger_name="goodput_logger",
)

# 2. Simulate a training loop.
total_steps = 10
print(f"Starting simulated training for {total_steps} steps...")

for step in range(total_steps):
    # Bracket the simulated work with start/end records for this step.
    recorder.record_step_start(step)

    # Simulate work (e.g., forward/backward pass).
    time.sleep(SIMULATED_STEP_SECONDS)

    recorder.record_step_end(step)

    # Only report every other step to keep the console output short.
    if step % 2 == 0:
        print(f"Completed step {step}")

# 3. Retrieve and display goodput metrics.
# Goodput is defined as (Total Useful Time) / (Total Elapsed Time).
goodput_data = recorder.get_goodput()
print("\nTraining Complete.")
print(f"Goodput: {goodput_data.goodput_efficiency:.2%}")
print(f"Total Steps: {goodput_data.total_steps}")