# Benchmark harness
import httpx, json, time, statistics
# Base URL of the Ollama HTTP server; requests are sent to f"{OLLAMA}/api/chat".
OLLAMA = "http://localhost:11434"
def bench(model: str, prompt: str, runs: int = 3) -> dict:
    """Warm the model up, then time ``runs`` streaming chat completions.

    A one-token, non-streaming request is sent first so that the measured
    runs hit an already-warm model. Each measured run then streams a
    fixed-seed completion (capped at 200 predicted tokens) from the
    ``/api/chat`` endpoint and records two samples:

    * time to first token (TTFT): seconds between issuing the request and
      the first streamed chunk whose message carries non-empty content;
    * tokens per second: ``eval_count`` divided by ``eval_duration``
      (converted from nanoseconds) as reported in the final ``done`` chunk.

    Args:
        model: Model name passed to the server.
        prompt: User message used for every measured run.
        runs: Number of measured iterations; must be at least 1.

    Returns:
        A dict with the keys ``model``, ``runs``, ``tps_median`` and
        ``ttft_median_s``. A median is ``0.0`` when no run produced a
        usable sample for it (no content chunk, or no positive
        ``eval_duration``).

    Raises:
        ValueError: If ``runs`` is less than 1.
        httpx.HTTPStatusError: If the server answers with an error status.
        RuntimeError: If the server reports an error inside the stream.
    """
    if runs < 1:
        # Fail before any request is sent instead of failing later inside
        # statistics.median() on an empty list.
        raise ValueError(f"runs must be >= 1, got {runs}")

    url = f"{OLLAMA}/api/chat"

    # Warmup: a single predicted token is enough to exercise the model once.
    warmup = httpx.post(url, json={
        "model": model,
        "messages": [{"role": "user", "content": "ok"}],
        "stream": False, "options": {"num_predict": 1, "seed": 1},
    }, timeout=300.0)
    warmup.raise_for_status()

    tps_runs, ttft_runs = [], []
    for _ in range(runs):
        first_token_at = None
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        # timeout=None disables httpx's client-side timeouts for this call.
        with httpx.stream("POST", url, json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": True, "options": {"seed": 42, "num_predict": 200},
        }, timeout=None) as r:
            r.raise_for_status()
            for line in r.iter_lines():
                if not line:
                    continue
                chunk = json.loads(line)
                if "error" in chunk:
                    # Surface server-side failures instead of silently
                    # producing a run with no samples.
                    raise RuntimeError(
                        f"Ollama reported an error for model {model!r}: "
                        f"{chunk['error']}"
                    )
                if first_token_at is None and chunk.get("message", {}).get("content"):
                    first_token_at = time.perf_counter() - t0
                if chunk.get("done"):
                    eval_count = chunk.get("eval_count") or 0
                    eval_ns = chunk.get("eval_duration") or 0
                    # Only a positive duration yields a meaningful rate;
                    # substituting 1 ns would report billions of tokens/s.
                    if eval_ns > 0:
                        tps_runs.append(eval_count / (eval_ns / 1e9))
                    break
        # A run that never produced content has no TTFT; recording 0 for it
        # would pull the median towards zero.
        if first_token_at is not None:
            ttft_runs.append(first_token_at)

    return {
        "model": model,
        "runs": runs,
        "tps_median": statistics.median(tps_runs) if tps_runs else 0.0,
        "ttft_median_s": statistics.median(ttft_runs) if ttft_runs else 0.0,
    }
# Run one benchmark with the default number of runs and print the summary dict.
print(
    bench(
        model="qwen2.5:7b",
        prompt="Explain unified memory in 4 bullets.",
    )
)