Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts

- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
This commit is contained in:
2025-08-23 12:58:39 +02:00
parent e9d6947880
commit 06086f05b3
5 changed files with 976 additions and 0 deletions

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
# python paraphrase_batch.py --input refusal_questions.jsonl \
# --out-dir ./paraphrases
#
# -------------------------------------------------------------
import json, random, time
from pathlib import Path
import requests
from tqdm.auto import tqdm
# ---------- CONFIG ----------
API_URL = "http://10.0.0.193:123/v1"
# German instruction prompt: "Paraphrase the following sentence in a natural,
# human way in German."  The `{text}` placeholder is filled per request.
PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)
MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # sampling temperature bounds, drawn per request
# ---------- MODEL LIST ----------
# Model identifiers as the API expects them; each also names its output file.
# Plain strings on purpose: the original wrapped each entry in parentheses,
# which does NOT create tuples and only invites a missing-comma bug.
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]
# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperatures, timeout=120):
    """Send a batch of prompts to the completions endpoint and collect results.

    Parameters
    ----------
    texts : list[str]
        Sentences to paraphrase; one prompt is built per entry.
    model_name : str
        Model identifier understood by the API server.
    temperatures : list[float]
        Per-prompt sampling temperatures (passed through as-is).
    timeout : float, optional
        Seconds before the HTTP request is aborted.  The original call had
        no timeout and could hang forever on a stalled server.

    Returns
    -------
    list[str]
        Whitespace-stripped completion texts, one per prompt.

    Raises
    ------
    RuntimeError
        On any non-200 HTTP response.
    """
    payload = {
        "model": model_name,
        "prompt": [PROMPT_TEMPLATE.format(text=t) for t in texts],
        "temperature": temperatures,
        "max_tokens": MAX_TOKENS,
        "n": 1,
    }
    resp = requests.post(f"{API_URL}/completions", json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
    return [c["text"].strip() for c in resp.json()["choices"]]
# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model.

    `messages` is a flat list of {"role": ..., "content": ...} dicts expected
    in user/assistant order.  For each well-formed pair the original pair is
    written once, then 3-10 paraphrased user variants are generated, each
    paired with the unchanged assistant answer.  Output is JSONL at
    `output_path` (one message object per line).
    """
    lines_written = 0
    # `with` guarantees the output file is closed even if the API errors out;
    # the original left the handle open on any uncaught exception.
    with output_path.open("w", encoding="utf-8") as out_f:
        pbar = tqdm(total=len(messages) // 2, desc=model_name)
        i = 0
        while i < len(messages) - 1:
            user_msg = messages[i]
            assistant_msg = messages[i + 1]
            # skip malformed pairs (advance one slot to re-synchronize)
            if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
                i += 1
                continue
            # write the original pair once
            out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2
            # generate a random number of paraphrases for this question
            n_variants = random.randint(3, 10)
            texts = [user_msg["content"]] * n_variants
            temps = [round(random.uniform(*TEMPERATURE_RANGE), 2) for _ in range(n_variants)]
            try:
                paraphrases = paraphrase_batch(texts, model_name, temps)
            except Exception as e:
                print(f"\n⚠️ Fehler bei {user_msg['content']!r}: {e}")
                # fallback: reuse the original text so the pair structure stays intact
                paraphrases = [user_msg["content"]] * n_variants
            for p in paraphrases:
                out_f.write(json.dumps({"role": "user", "content": p}, ensure_ascii=False) + "\n")
                out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
                lines_written += 2
            i += 2
            pbar.update(1)
            time.sleep(0.05)  # gentle rate limit between batches
        pbar.close()
    # BUG FIX: the original printed (len(messages)//2)*(1+n_variants), which
    # raises NameError when no valid pair was processed and miscounts anyway
    # because n_variants differs per pair.  Report the actual line count.
    print(f"{output_path} fertig {lines_written} Zeilen")
# ---------- MAIN ENTRY ----------
def main():
    """CLI entry point: load the message list once, then run every model.

    Note: argparse applies `type=Path` to string defaults as well, so the
    defaults below arrive as Path objects just like explicit arguments.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Generate paraphrases for a JSONL of user/assistant pairs using multiple models."
    )
    parser.add_argument("--input", type=Path, default="refusal_questions.jsonl",
                        help="Input file (JSON array of messages)")
    # typo fix in the help text: was "permodel"
    parser.add_argument("--out-dir", type=Path, default="./paraphrases",
                        help="Directory to write the per-model JSONL files")
    args = parser.parse_args()
    # Ensure the output directory exists before any model writes to it
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Load all messages once — comfortable for < 1M lines
    print(f"Loading {args.input}")
    with args.input.open("r", encoding="utf-8") as f:
        messages = json.load(f)  # expects a JSON array of message dicts
    # One output file per configured model
    for model_name in MODELS:
        output_path = args.out_dir / f"{model_name}.jsonl"
        process_model(model_name, output_path, messages)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()