HParams dashboard는 서로 다른 하이퍼파라미터로 실행한 training run 수십 개를 한눈에 비교하게 해 준다. 먼저 search space를 정의하고, 각 run의 하이퍼파라미터와 최종 metric을 로깅하면서 실험을 돌린 다음, parallel coordinate plot, scatter plot, 정렬 가능한 테이블로 어떤 조합이 잘 작동하는지 탐색한다.
TensorBoard Profiler는 성능 병목을 식별한다. TF 성능 문제의 대부분은 느린 compute가 아니라 느린 데이터 로딩에서 비롯된다. Profiler의 Input Pipeline Analyzer는 데이터 대기 시간과 model 계산 시간의 비율을 알려 준다. "Input Bound: 80%"가 나오면 문제는 model이 아니라 tf.data pipeline에 있다.
Code
HParams sweep·python
import itertools

import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
# Search space: each HParam pairs a name with the domain of values to explore.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([32, 64, 128]))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.1, 0.5))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

# Summary tag used for the metric registered below.
METRIC_ACC = 'accuracy'

# Register the experiment-level configuration (which hyperparameters and
# metrics exist) once, through a writer on the 'logs/hparam_tuning' directory.
_search_space = [HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER]
_tracked_metrics = [hp.Metric(METRIC_ACC, display_name='Validation Accuracy')]
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(hparams=_search_space, metrics=_tracked_metrics)
def train_test_model(hparams):
    """Train a one-hidden-layer classifier for one trial and return its accuracy.

    Args:
        hparams: Mapping keyed by ``HP_NUM_UNITS``, ``HP_DROPOUT`` and
            ``HP_OPTIMIZER`` that supplies the hidden-layer width, the
            dropout rate and the optimizer name for this trial.

    Returns:
        The accuracy value reported by ``model.evaluate`` on
        ``x_test``/``y_test``.

    Note:
        Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from module
        scope; they must be defined before this function is called.
    """
    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object; Keras 3
        # warns when ``input_shape=`` is passed to a layer instead.
        tf.keras.Input(shape=(784,)),
        tf.keras.layers.Dense(hparams[HP_NUM_UNITS], activation='relu'),
        tf.keras.layers.Dropout(hparams[HP_DROPOUT]),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(
        optimizer=hparams[HP_OPTIMIZER],
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(x_train, y_train, epochs=5, verbose=0)
    # evaluate() returns [loss, accuracy] given the single compiled metric;
    # only the accuracy is reported back to the sweep.
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)
    return accuracy
# Dropout rates to try in the grid.
dropout_grid = [0.1, 0.3, 0.5]

# One trial per (num_units, dropout, optimizer) combination, iterated in the
# same order as three nested loops would. enumerate() supplies the run index,
# so every trial is guaranteed its own run directory without a hand-maintained
# counter.
trial_grid = itertools.product(
    HP_NUM_UNITS.domain.values,
    dropout_grid,
    HP_OPTIMIZER.domain.values,
)
for session_num, (num_units, dropout, optimizer) in enumerate(trial_grid):
    hparams = {
        HP_NUM_UNITS: num_units,
        HP_DROPOUT: dropout,
        HP_OPTIMIZER: optimizer,
    }
    run_dir = f"logs/hparam_tuning/run-{session_num:03d}"
    # Everything logged inside this context goes to the trial's own directory.
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)  # record the hyperparameter values of this trial
        accuracy = train_test_model(hparams)
        tf.summary.scalar(METRIC_ACC, accuracy, step=1)
Profiler — find the bottleneck·python
# pip install tensorboard_plugin_profile
import tensorflow as tf
# NOTE(review): the three methods below all write to 'logs/profile' and look
# like alternatives rather than steps to run together — confirm and keep one.

# Method 1: profile a batch range via callback
tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/profile',
    profile_batch='5, 15',  # profile batches 5 through 15
)
model.fit(train_ds, epochs=3, callbacks=[tb_callback])

# Method 2: programmatic
tf.profiler.experimental.start('logs/profile')
try:
    model.fit(train_ds, epochs=1)
finally:
    # Always stop, so an exception in fit() cannot leave the profiler running.
    tf.profiler.experimental.stop()

# Method 3: context manager
with tf.profiler.experimental.Profile('logs/profile'):
    model.fit(train_ds, epochs=1)
Progress
Progress is local-only — sign in to sync across devices.