Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts

- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
This commit is contained in:
2025-08-23 12:58:39 +02:00
parent e9d6947880
commit 06086f05b3
5 changed files with 976 additions and 0 deletions

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
# python paraphrase_batch.py --input refusal_questions.jsonl \
# --out-dir ./paraphrases
#
# -------------------------------------------------------------
import json, random, time
from pathlib import Path
import requests
from tqdm.auto import tqdm
# ---------- CONFIG ----------
API_URL = "http://10.0.0.193:123/v1"
# German instruction prompt: "Paraphrase the following sentence in a natural,
# human way in German."  The `{text}` placeholder is filled per request.
PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)
MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # sampling temperature bounds, drawn per request
# ---------- MODEL LIST ----------
# Model identifiers as the API expects them; each also names its output file.
# Plain strings on purpose: the original wrapped each entry in parentheses,
# which does NOT create tuples and only invites a missing-comma bug.
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]
# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperatures, timeout=120):
    """Send a batch of prompts to the completions endpoint and collect results.

    Parameters
    ----------
    texts : list[str]
        Sentences to paraphrase; one prompt is built per entry.
    model_name : str
        Model identifier understood by the API server.
    temperatures : list[float]
        Per-prompt sampling temperatures (passed through as-is).
    timeout : float, optional
        Seconds before the HTTP request is aborted.  The original call had
        no timeout and could hang forever on a stalled server.

    Returns
    -------
    list[str]
        Whitespace-stripped completion texts, one per prompt.

    Raises
    ------
    RuntimeError
        On any non-200 HTTP response.
    """
    payload = {
        "model": model_name,
        "prompt": [PROMPT_TEMPLATE.format(text=t) for t in texts],
        "temperature": temperatures,
        "max_tokens": MAX_TOKENS,
        "n": 1,
    }
    resp = requests.post(f"{API_URL}/completions", json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
    return [c["text"].strip() for c in resp.json()["choices"]]
# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model.

    `messages` is a flat list of {"role": ..., "content": ...} dicts expected
    in user/assistant order.  For each well-formed pair the original pair is
    written once, then 3-10 paraphrased user variants are generated, each
    paired with the unchanged assistant answer.  Output is JSONL at
    `output_path` (one message object per line).
    """
    lines_written = 0
    # `with` guarantees the output file is closed even if the API errors out;
    # the original left the handle open on any uncaught exception.
    with output_path.open("w", encoding="utf-8") as out_f:
        pbar = tqdm(total=len(messages) // 2, desc=model_name)
        i = 0
        while i < len(messages) - 1:
            user_msg = messages[i]
            assistant_msg = messages[i + 1]
            # skip malformed pairs (advance one slot to re-synchronize)
            if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
                i += 1
                continue
            # write the original pair once
            out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2
            # generate a random number of paraphrases for this question
            n_variants = random.randint(3, 10)
            texts = [user_msg["content"]] * n_variants
            temps = [round(random.uniform(*TEMPERATURE_RANGE), 2) for _ in range(n_variants)]
            try:
                paraphrases = paraphrase_batch(texts, model_name, temps)
            except Exception as e:
                print(f"\n⚠️ Fehler bei {user_msg['content']!r}: {e}")
                # fallback: reuse the original text so the pair structure stays intact
                paraphrases = [user_msg["content"]] * n_variants
            for p in paraphrases:
                out_f.write(json.dumps({"role": "user", "content": p}, ensure_ascii=False) + "\n")
                out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
                lines_written += 2
            i += 2
            pbar.update(1)
            time.sleep(0.05)  # gentle rate limit between batches
        pbar.close()
    # BUG FIX: the original printed (len(messages)//2)*(1+n_variants), which
    # raises NameError when no valid pair was processed and miscounts anyway
    # because n_variants differs per pair.  Report the actual line count.
    print(f"{output_path} fertig {lines_written} Zeilen")
# ---------- MAIN ENTRY ----------
def main():
    """CLI entry point: load the message list once, then run every model.

    Note: argparse applies `type=Path` to string defaults as well, so the
    defaults below arrive as Path objects just like explicit arguments.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Generate paraphrases for a JSONL of user/assistant pairs using multiple models."
    )
    parser.add_argument("--input", type=Path, default="refusal_questions.jsonl",
                        help="Input file (JSON array of messages)")
    # typo fix in the help text: was "permodel"
    parser.add_argument("--out-dir", type=Path, default="./paraphrases",
                        help="Directory to write the per-model JSONL files")
    args = parser.parse_args()
    # Ensure the output directory exists before any model writes to it
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Load all messages once — comfortable for < 1M lines
    print(f"Loading {args.input}")
    with args.input.open("r", encoding="utf-8") as f:
        messages = json.load(f)  # expects a JSON array of message dicts
    # One output file per configured model
    for model_name in MODELS:
        output_path = args.out_dir / f"{model_name}.jsonl"
        process_model(model_name, output_path, messages)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()