- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset (see the sketch below).
- Developed a paraphrasing script to generate variations of user messages using multiple models.
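The deduplication script mentioned above is not shown on this page. Below is a minimal sketch of the idea, assuming the same alternating user/assistant JSONL layout that paraphrase_batch.py consumes; the file names and the normalisation rule are illustrative assumptions, not the committed implementation.

#!/usr/bin/env python3
# dedupe_users.py – illustrative sketch (not the committed script):
# keep only the first occurrence of each user message, together with its answer.
import json
from pathlib import Path


def dedupe(in_path: Path, out_path: Path) -> None:
    seen = set()
    with in_path.open("r", encoding="utf-8") as fin:
        records = [json.loads(line) for line in fin if line.strip()]

    with out_path.open("w", encoding="utf-8") as fout:
        i = 0
        while i < len(records) - 1:
            user_msg, assistant_msg = records[i], records[i + 1]
            # skip anything that is not a clean user/assistant pair
            if user_msg.get("role") != "user" or assistant_msg.get("role") != "assistant":
                i += 1
                continue
            # naive normalisation: case- and whitespace-insensitive match
            key = " ".join(user_msg["content"].lower().split())
            if key not in seen:
                seen.add(key)
                fout.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
                fout.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            i += 2


if __name__ == "__main__":
    # hypothetical file names, chosen only for this example
    dedupe(Path("refusal_questions_raw.jsonl"), Path("refusal_questions.jsonl"))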
124 lines
4.2 KiB
Python
#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
#   python paraphrase_batch.py --input refusal_questions.jsonl \
#                              --out-dir ./paraphrases
#
# -------------------------------------------------------------
import json, random, time
from pathlib import Path
import requests
from tqdm.auto import tqdm

# ---------- CONFIG ----------
API_URL = "http://10.0.0.193:123/v1"

PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)

MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # per request

# ---------- MODEL LIST ----------
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]

# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperature):
    """Send a batch of prompts to the API and return the list of paraphrases.

    OpenAI-compatible /v1/completions endpoints accept a list of prompts but
    only a single temperature per request, so one value is used for the batch.
    """
    payload = {
        "model": model_name,
        "prompt": [PROMPT_TEMPLATE.format(text=t) for t in texts],
        "temperature": temperature,
        "max_tokens": MAX_TOKENS,
        "n": 1
    }
    resp = requests.post(f"{API_URL}/completions", json=payload, timeout=120)
    if resp.status_code != 200:
        raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
    return [c["text"].strip() for c in resp.json()["choices"]]

# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model."""
    out_f = output_path.open("w", encoding="utf-8")

    i = 0
    lines_written = 0
    pbar = tqdm(total=len(messages) // 2, desc=model_name)

    while i < len(messages) - 1:
        user_msg = messages[i]
        assistant_msg = messages[i + 1]

        # skip malformed pairs
        if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
            i += 1
            continue

        # write original pair once
        out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
        out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
        lines_written += 2

        # generate paraphrases for this question
        n_variants = random.randint(3, 10)
        texts = [user_msg["content"]] * n_variants
        temperature = round(random.uniform(*TEMPERATURE_RANGE), 2)

        try:
            paraphrases = paraphrase_batch(texts, model_name, temperature)
        except Exception as e:
            print(f"\n⚠️ Error for {user_msg['content']!r}: {e}")
            # fallback: reuse original text
            paraphrases = [user_msg["content"]] * n_variants

        # each paraphrase is paired with the unchanged assistant answer
        for p in paraphrases:
            out_f.write(json.dumps({"role": "user", "content": p}, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2

        i += 2
        pbar.update(1)
        time.sleep(0.05)  # gentle rate limit

    pbar.close()
    out_f.close()
    print(f"✅ {output_path} done – {lines_written} lines written")

# ---------- MAIN ENTRY ----------
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate paraphrases for a JSONL of user/assistant pairs using multiple models."
    )
    parser.add_argument("--input", type=Path, default="refusal_questions.jsonl",
                        help="Input file (JSON array or JSONL of messages)")
    parser.add_argument("--out-dir", type=Path, default="./paraphrases",
                        help="Directory to write the per-model JSONL files")
    args = parser.parse_args()

    # Ensure output directory exists
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # Load all messages once – comfortable for well under 1 M lines.
    # Accepts either a JSON array or JSONL (one message object per line).
    print(f"Loading {args.input} …")
    raw = args.input.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        messages = json.loads(raw)
    else:
        messages = [json.loads(line) for line in raw.splitlines() if line.strip()]

    # Iterate over the models automatically
    for model_name in MODELS:
        # spaces in model names would otherwise end up in the file names
        output_path = args.out_dir / f"{model_name.replace(' ', '_')}.jsonl"
        process_model(model_name, output_path, messages)


if __name__ == "__main__":
    main()
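For reference, paraphrase_batch.py expects its input to be a flat sequence of alternating message objects, either as a JSON array or as JSONL. The two lines below are a made-up illustration of the format, not actual dataset entries:

{"role": "user", "content": "Example question?"}
{"role": "assistant", "content": "Example answer."}

Each per-model output file writes such a pair once verbatim and then repeats the same assistant answer after every generated paraphrase of the user question.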