# Grid-based synthetic generator that produces real diversity
import json, itertools
from openai import OpenAI
# Shared client for the "teacher" model that generate_pair() queries.
# No arguments are passed, so it is built with the library's defaults.
teacher = OpenAI()
# The three axes of the generation grid. The script takes their Cartesian
# product, so every (topic, difficulty, length) combination yields one example.
TOPICS = [
    "refund request",
    "shipping delay",
    "product defect",
    "account locked",
    "billing question",
]
# Inserted into the prompt as the "Customer mood".
DIFFICULTIES = [
    "clear request",
    "ambiguous",
    "angry",
    "polite-but-vague",
]
# Inserted into the prompt as the "Customer message length".
LENGTHS = [
    "one sentence",
    "a short paragraph",
    "multiple paragraphs",
]
# System prompt for the teacher model: fixes the output JSON shape and the
# voice the generated assistant replies should use.
SYSTEM = (
    "You generate realistic customer-support training pairs.\n"
    'Output JSON: {"user": "...", "assistant": "..."}.\n'
    "The assistant should be empathetic, concise, and follow the company's\n"
    "formal-but-warm voice."
)
def generate_pair(topic: str, difficulty: str, length: str) -> dict:
    """Ask the teacher model for one support exchange and wrap it as a chat example.

    Args:
        topic: Support topic the customer message is about.
        difficulty: Descriptor inserted into the prompt as the customer mood.
        length: Descriptor inserted into the prompt as the customer message length.

    Returns:
        A dict with a single "messages" key holding three chat turns: a fixed
        system message, the generated customer message, and the generated
        assistant reply.

    Raises:
        ValueError: If the teacher returns no text, text that is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or JSON
            that lacks non-empty string "user" and "assistant" fields.
    """
    prompt = (
        f"Topic: {topic}. Customer mood: {difficulty}. "
        f"Customer message length: {length}. Generate one realistic pair."
    )
    r = teacher.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": prompt},
        ],
        # JSON mode asks for a JSON object; the keys and value types inside it
        # are still up to the model, hence the validation below.
        response_format={"type": "json_object"},
        temperature=0.8,
    )
    cell = f"topic={topic!r}, difficulty={difficulty!r}, length={length!r}"

    # The message content may be absent; fail with context instead of letting
    # json.loads(None) raise an opaque TypeError.
    content = r.choices[0].message.content
    if content is None:
        raise ValueError(f"teacher returned no content for {cell}")

    pair = json.loads(content)
    if not isinstance(pair, dict):
        raise ValueError(f"teacher output is not a JSON object for {cell}")

    # Reject malformed pairs here so they never reach the training file.
    for key in ("user", "assistant"):
        value = pair.get(key)
        if not isinstance(value, str) or not value.strip():
            raise ValueError(
                f"teacher output has no usable {key!r} text for {cell}"
            )

    return {"messages": [
        {"role": "system", "content": "You are a customer support agent."},
        {"role": "user", "content": pair["user"]},
        {"role": "assistant", "content": pair["assistant"]},
    ]}
# Walk the full topic x difficulty x length grid, making one teacher call per
# combination, and write each resulting example as one JSON line.
with open("synthetic.jsonl", "w") as f:
    f.writelines(
        json.dumps(generate_pair(*combo)) + "\n"
        for combo in itertools.product(TOPICS, DIFFICULTIES, LENGTHS)
    )

# The reported total is the grid size, i.e. the number of combinations iterated.
grid_size = len(TOPICS) * len(DIFFICULTIES) * len(LENGTHS)
print("Generated", grid_size, "examples")