Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts

- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
This commit is contained in:
2025-08-23 12:58:39 +02:00
parent e9d6947880
commit 06086f05b3
5 changed files with 976 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
import json, re
from pathlib import Path
from tqdm.auto import tqdm
def dedupe(input_file: Path, output_file: Path):
    """Copy each distinct user message, and the line after it, to a new JSONL file.

    The input is read line by line as JSONL. A record whose ``"role"`` is
    ``"user"`` is kept only the first time its normalized ``"content"``
    (whitespace runs collapsed to one space, stripped, lower-cased) is seen.
    The line immediately following a kept user record is copied through
    unparsed, on the assumption that it is the matching assistant message.
    Records with any other role that do not directly follow a kept user
    record are dropped, as are blank lines.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to create or overwrite (UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank line that is parsed is not
            valid JSON.
        KeyError: If a record lacks ``"role"``, or a user record lacks
            ``"content"``.
    """
    print(f"\n>>> Deduping {input_file}")
    seen = set()
    whitespace_run = re.compile(r"\s+")
    # Open the input first so a missing input does not truncate an existing
    # output file. The context managers close both files even when a line
    # fails to parse.
    with input_file.open("r", encoding="utf-8") as f:
        with output_file.open("w", encoding="utf-8") as out_f:
            # The file is consumed one line at a time (JSONL).
            for line in tqdm(f, desc="reading"):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                msg = json.loads(line)
                if msg["role"] != "user":
                    continue
                key = whitespace_run.sub(" ", msg["content"]).strip().lower()
                if key in seen:
                    # The reply to a duplicate is skipped by the role check
                    # on the next iteration.
                    continue
                seen.add(key)
                out_f.write(json.dumps(msg, ensure_ascii=False) + "\n")
                # The next line is taken to be the corresponding assistant
                # message; it is copied verbatim, with exactly one trailing
                # newline, and is not parsed or checked.
                next_line = f.readline()
                if next_line:
                    out_f.write(next_line.rstrip("\n") + "\n")
    print(f"{output_file} {len(seen)} einzigartige UserPhrasen.")