#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
#   python paraphrase_batch.py --input refusal_questions.jsonl \
#       --out-dir ./paraphrases
# -------------------------------------------------------------
import json
import random
import time
from pathlib import Path

import requests
from tqdm.auto import tqdm

# ---------- CONFIG ----------
API_URL = "http://10.0.0.193:123/v1"
PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)
MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # sampled once per generated variant
REQUEST_TIMEOUT = 120  # seconds; prevents a dead server from hanging the run forever

# ---------- MODEL LIST ----------
# Plain strings. (The original wrapped each entry in parentheses, which does
# NOT create tuples — parens around a lone string are a no-op and misleading.)
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]


# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperatures):
    """Request one paraphrase per input text and return them as a list.

    The OpenAI-compatible ``/v1/completions`` endpoint expects a *scalar*
    ``temperature``; the original code passed the whole ``temperatures``
    list in a single request, which the API rejects. We therefore issue
    one request per (text, temperature) pair, preserving the original
    return shape: a list of stripped completion strings, one per input.

    Parameters
    ----------
    texts : list[str]
        Sentences to paraphrase (typically the same sentence repeated).
    model_name : str
        Model identifier understood by the API server.
    temperatures : list[float]
        Sampling temperature for each corresponding text.

    Raises
    ------
    RuntimeError
        If the API returns a non-200 status for any request.
    """
    paraphrases = []
    for text, temp in zip(texts, temperatures):
        payload = {
            "model": model_name,
            "prompt": PROMPT_TEMPLATE.format(text=text),
            "temperature": temp,
            "max_tokens": MAX_TOKENS,
            "n": 1,
        }
        resp = requests.post(
            f"{API_URL}/completions", json=payload, timeout=REQUEST_TIMEOUT
        )
        if resp.status_code != 200:
            raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
        paraphrases.append(resp.json()["choices"][0]["text"].strip())
    return paraphrases


# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model.

    ``messages`` is a flat list of {"role", "content"} dicts expected to
    alternate user/assistant. For each well-formed pair the original pair
    is written once, followed by 3-10 paraphrased user variants, each
    re-paired with the same assistant answer. Malformed pairs are skipped
    one message at a time so the scan can re-synchronize.
    """
    lines_written = 0  # actual JSONL lines emitted (fixes the bogus final count)
    with output_path.open("w", encoding="utf-8") as out_f:
        pbar = tqdm(total=len(messages) // 2, desc=model_name)
        i = 0
        while i < len(messages) - 1:
            user_msg = messages[i]
            assistant_msg = messages[i + 1]

            # Skip malformed pairs (advance by one to re-align on the stream).
            if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
                i += 1
                continue

            # Write the original pair once.
            out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2

            # Generate paraphrases for this question.
            n_variants = random.randint(3, 10)
            texts = [user_msg["content"]] * n_variants
            temps = [
                round(random.uniform(*TEMPERATURE_RANGE), 2)
                for _ in range(n_variants)
            ]
            try:
                paraphrases = paraphrase_batch(texts, model_name, temps)
            except Exception as e:
                print(f"\n⚠️ Fehler bei {user_msg['content']!r}: {e}")
                # Fallback: reuse the original text so output stays well-formed.
                paraphrases = [user_msg["content"]] * n_variants

            for p in paraphrases:
                out_f.write(
                    json.dumps({"role": "user", "content": p}, ensure_ascii=False)
                    + "\n"
                )
                out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
                lines_written += 2

            i += 2
            pbar.update(1)
            time.sleep(0.05)  # gentle rate-limit
        pbar.close()
    # Report the real line count; the original referenced n_variants after the
    # loop (NameError on empty input, wrong total otherwise).
    print(f"✅ {output_path} fertig – {lines_written} Zeilen")


# ---------- MAIN ENTRY ----------
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description=(
            "Generate paraphrases for a JSONL of user/assistant pairs "
            "using multiple models."
        )
    )
    # NOTE: defaults must be Path objects — argparse applies `type=` only to
    # values coming from the command line, so string defaults would reach
    # `.open()` / `.mkdir()` as plain str and crash.
    parser.add_argument(
        "--input",
        type=Path,
        default=Path("refusal_questions.jsonl"),
        help="Input file (JSON array of messages)",
    )
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=Path("./paraphrases"),
        help="Directory to write the per-model JSONL files",
    )
    args = parser.parse_args()

    # Ensure output directory exists.
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # Load all messages once – works for < 1 M lines comfortably.
    print(f"Loading {args.input} …")
    with args.input.open("r", encoding="utf-8") as f:
        messages = json.load(f)  # expects a JSON array

    # Iterate over the models automatically.
    for model_name in MODELS:
        output_path = args.out_dir / f"{model_name}.jsonl"
        process_model(model_name, output_path, messages)


if __name__ == "__main__":
    main()