SC-Discord-Bot/llm_training/paraphrase_chatml.py
Pakobbix 06086f05b3 Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts
- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
2025-08-23 12:58:39 +02:00

124 lines
4.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
# python paraphrase_batch.py --input refusal_questions.jsonl \
# --out-dir ./paraphrases
#
# -------------------------------------------------------------
import json, random, time
from pathlib import Path
import requests
from tqdm.auto import tqdm
# ---------- CONFIG ----------
# Base URL of the OpenAI-compatible serving endpoint (completions API).
API_URL = "http://10.0.0.193:123/v1"
# German instruction prompt: "Paraphrase the following sentence in a
# natural, human way in German." The {text} slot receives the source sentence.
PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)
MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # sampling temperature drawn uniformly per request

# ---------- MODEL LIST ----------
# Model identifiers as known to the serving backend; one output JSONL is
# written per model. These are plain strings — the parentheses previously
# wrapped around each entry did not create tuples and were misleading.
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]
# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperatures, timeout=120):
    """Send a batch of prompts to the completions API and return paraphrases.

    Parameters
    ----------
    texts : list[str]
        Source sentences; one prompt is built per entry via PROMPT_TEMPLATE.
    model_name : str
        Model identifier understood by the serving backend.
    temperatures : list[float]
        Sampling temperatures, passed through as a list. NOTE(review): the
        standard completions API takes a scalar temperature — confirm the
        backend actually supports a per-prompt list here.
    timeout : float, optional
        Seconds before the HTTP request is aborted. Prevents the script
        from hanging indefinitely on a stalled server.

    Returns
    -------
    list[str]
        Stripped completion texts, one per choice returned by the API.

    Raises
    ------
    RuntimeError
        If the API responds with a non-200 status code.
    requests.RequestException
        On connection errors or timeout.
    """
    payload = {
        "model": model_name,
        "prompt": [PROMPT_TEMPLATE.format(text=t) for t in texts],
        "temperature": temperatures,
        "max_tokens": MAX_TOKENS,
        "n": 1,
    }
    resp = requests.post(f"{API_URL}/completions", json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
    return [c["text"].strip() for c in resp.json()["choices"]]
# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model.

    `messages` is a flat list of chat dicts ({"role": ..., "content": ...})
    expected to alternate user/assistant. For each valid pair this writes the
    original pair, then 3-10 paraphrased user variants each followed by the
    unchanged assistant answer, as JSONL lines to `output_path`.
    """
    # Count lines actually written: the previous summary reused the *last*
    # iteration's random n_variants (wrong total, NameError on empty input).
    lines_written = 0
    # Context manager guarantees the file is closed even if the API raises.
    with output_path.open("w", encoding="utf-8") as out_f:
        pbar = tqdm(total=len(messages) // 2, desc=model_name)
        i = 0
        while i < len(messages) - 1:
            user_msg = messages[i]
            assistant_msg = messages[i + 1]
            # skip malformed pairs (anything not user -> assistant)
            if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
                i += 1
                continue
            # write the original pair once
            out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2
            # generate a random number of paraphrases for this question
            n_variants = random.randint(3, 10)
            texts = [user_msg["content"]] * n_variants
            temps = [round(random.uniform(*TEMPERATURE_RANGE), 2) for _ in range(n_variants)]
            try:
                paraphrases = paraphrase_batch(texts, model_name, temps)
            except Exception as e:
                print(f"\n⚠️ Fehler bei {user_msg['content']!r}: {e}")
                # fallback: reuse the original text so the output stays aligned
                paraphrases = [user_msg["content"]] * n_variants
            for p in paraphrases:
                out_f.write(json.dumps({"role": "user", "content": p}, ensure_ascii=False) + "\n")
                out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
                lines_written += 2
            i += 2
            pbar.update(1)
            time.sleep(0.05)  # gentle rate limit between API batches
        pbar.close()
    print(f"{output_path} fertig {lines_written} Zeilen")
# ---------- MAIN ENTRY ----------
def main():
    """Parse CLI arguments, load the dataset once, and run every model."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Generate paraphrases for a JSONL of user/assistant pairs using multiple models."
    )
    # Use real Path defaults: argparse only coerces *string* defaults through
    # `type`, so explicit Path objects are clearer and more robust.
    parser.add_argument("--input", type=Path, default=Path("refusal_questions.jsonl"),
                        help="Input file (JSON array of messages)")
    parser.add_argument("--out-dir", type=Path, default=Path("./paraphrases"),
                        help="Directory to write the per-model JSONL files")
    args = parser.parse_args()
    # Ensure the output directory exists
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Load all messages once — works comfortably for < 1M lines.
    # NOTE(review): despite the .jsonl default name, the input is parsed as a
    # single JSON array, not line-delimited JSON.
    print(f"Loading {args.input}")
    with args.input.open("r", encoding="utf-8") as f:
        messages = json.load(f)  # expects a JSON array of {"role", "content"} dicts
    # One output file per configured model
    for model_name in MODELS:
        output_path = args.out_dir / f"{model_name}.jsonl"
        process_model(model_name, output_path, messages)

if __name__ == "__main__":
    main()