SC-Discord-Bot/llm_training/paraphrase_chatml.py
Pakobbix 06086f05b3 Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts
- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
2025-08-23 12:58:39 +02:00

124 lines
4.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -------------------------------------------------------------
# paraphrase_batch.py
#
# Run it once per dataset and it will automatically iterate over
# the models you listed, writing a separate JSONL for each.
#
# Usage:
# python paraphrase_batch.py --input refusal_questions.jsonl \
# --out-dir ./paraphrases
#
# -------------------------------------------------------------
import json, random, time
from pathlib import Path
import requests
from tqdm.auto import tqdm
# ---------- CONFIG ----------
# Base URL of the OpenAI-compatible serving endpoint (completions API).
API_URL = "http://10.0.0.193:123/v1"
# German instruction prompt: "Paraphrase the following sentence in a
# natural, human way in German." The {text} slot receives the source sentence.
PROMPT_TEMPLATE = (
    "Paraphrasiere den folgenden Satz auf natürliche, menschliche Weise auf Deutsch.\n\n"
    "Original:\n{text}\n\nParaphrase:"
)
MAX_TOKENS = 256
TEMPERATURE_RANGE = (0.45, 0.8)  # sampling temperature drawn uniformly per request

# ---------- MODEL LIST ----------
# Model identifiers as known to the serving backend; one output JSONL is
# written per model. These are plain strings — the parentheses previously
# wrapped around each entry did not create tuples and were misleading.
MODELS = [
    "Qwen3-30B-A3B",
    "Mistral Small 3.2",
    "Gemma3-27B",
    "gpt-oss-20b",
]
# ---------- HELPERS ----------
def paraphrase_batch(texts, model_name, temperatures, timeout=120):
    """Send a batch of prompts to the completions API and return paraphrases.

    Parameters
    ----------
    texts : list[str]
        Source sentences; one prompt is built per entry via PROMPT_TEMPLATE.
    model_name : str
        Model identifier understood by the serving backend.
    temperatures : list[float]
        Sampling temperatures, passed through as a list. NOTE(review): the
        standard completions API takes a scalar temperature — confirm the
        backend actually supports a per-prompt list here.
    timeout : float, optional
        Seconds before the HTTP request is aborted. Prevents the script
        from hanging indefinitely on a stalled server.

    Returns
    -------
    list[str]
        Stripped completion texts, one per choice returned by the API.

    Raises
    ------
    RuntimeError
        If the API responds with a non-200 status code.
    requests.RequestException
        On connection errors or timeout.
    """
    payload = {
        "model": model_name,
        "prompt": [PROMPT_TEMPLATE.format(text=t) for t in texts],
        "temperature": temperatures,
        "max_tokens": MAX_TOKENS,
        "n": 1,
    }
    resp = requests.post(f"{API_URL}/completions", json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"API error {resp.status_code}: {resp.text}")
    return [c["text"].strip() for c in resp.json()["choices"]]
# ---------- MAIN PARAPHRASE ----------
def process_model(model_name, output_path: Path, messages):
    """Paraphrase all *user* messages with a single model.

    `messages` is a flat list of chat dicts ({"role": ..., "content": ...})
    expected to alternate user/assistant. For each valid pair this writes the
    original pair, then 3-10 paraphrased user variants each followed by the
    unchanged assistant answer, as JSONL lines to `output_path`.
    """
    # Count lines actually written: the previous summary reused the *last*
    # iteration's random n_variants (wrong total, NameError on empty input).
    lines_written = 0
    # Context manager guarantees the file is closed even if the API raises.
    with output_path.open("w", encoding="utf-8") as out_f:
        pbar = tqdm(total=len(messages) // 2, desc=model_name)
        i = 0
        while i < len(messages) - 1:
            user_msg = messages[i]
            assistant_msg = messages[i + 1]
            # skip malformed pairs (anything not user -> assistant)
            if user_msg["role"] != "user" or assistant_msg["role"] != "assistant":
                i += 1
                continue
            # write the original pair once
            out_f.write(json.dumps(user_msg, ensure_ascii=False) + "\n")
            out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
            lines_written += 2
            # generate a random number of paraphrases for this question
            n_variants = random.randint(3, 10)
            texts = [user_msg["content"]] * n_variants
            temps = [round(random.uniform(*TEMPERATURE_RANGE), 2) for _ in range(n_variants)]
            try:
                paraphrases = paraphrase_batch(texts, model_name, temps)
            except Exception as e:
                print(f"\n⚠️ Fehler bei {user_msg['content']!r}: {e}")
                # fallback: reuse the original text so the output stays aligned
                paraphrases = [user_msg["content"]] * n_variants
            for p in paraphrases:
                out_f.write(json.dumps({"role": "user", "content": p}, ensure_ascii=False) + "\n")
                out_f.write(json.dumps(assistant_msg, ensure_ascii=False) + "\n")
                lines_written += 2
            i += 2
            pbar.update(1)
            time.sleep(0.05)  # gentle rate limit between API batches
        pbar.close()
    print(f"{output_path} fertig {lines_written} Zeilen")
# ---------- MAIN ENTRY ----------
def main():
    """Parse CLI arguments, load the dataset once, and run every model."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Generate paraphrases for a JSONL of user/assistant pairs using multiple models."
    )
    # Use real Path defaults: argparse only coerces *string* defaults through
    # `type`, so explicit Path objects are clearer and more robust.
    parser.add_argument("--input", type=Path, default=Path("refusal_questions.jsonl"),
                        help="Input file (JSON array of messages)")
    parser.add_argument("--out-dir", type=Path, default=Path("./paraphrases"),
                        help="Directory to write the per-model JSONL files")
    args = parser.parse_args()
    # Ensure the output directory exists
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Load all messages once — works comfortably for < 1M lines.
    # NOTE(review): despite the .jsonl default name, the input is parsed as a
    # single JSON array, not line-delimited JSON.
    print(f"Loading {args.input}")
    with args.input.open("r", encoding="utf-8") as f:
        messages = json.load(f)  # expects a JSON array of {"role", "content"} dicts
    # One output file per configured model
    for model_name in MODELS:
        output_path = args.out_dir / f"{model_name}.jsonl"
        process_model(model_name, output_path, messages)

if __name__ == "__main__":
    main()