Add Align & Mine event dataset and loot table, implement deduplication and paraphrasing scripts
- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions.
- Added a loot table JSONL file detailing various items and their acquisition methods.
- Implemented a deduplication script to filter unique user messages from a dataset.
- Developed a paraphrasing script to generate variations of user messages using multiple models.
This commit is contained in:
28
llm_training/deduplicate.py
Normal file
28
llm_training/deduplicate.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import json, re
|
||||
from pathlib import Path
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
def dedupe(input_file: Path, output_file: Path):
    """Copy the first occurrence of each user message, and its reply, to a new JSONL file.

    ``input_file`` is read line by line; every non-blank line must be a JSON
    object with a ``"role"`` key, and user records must also carry a string
    ``"content"``. Two user messages count as duplicates when their content is
    equal after collapsing whitespace runs to one space, stripping the ends
    and lower-casing.

    For every user message seen for the first time, the message is written
    (re-serialized with ``ensure_ascii=False``) followed by the first
    non-user record that comes after it, copied verbatim. Repeated user
    messages and the records that follow them are dropped.

    Args:
        input_file: Path of the JSONL dataset to read (UTF-8).
        output_file: Path of the JSONL file to create or overwrite (UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"role"``, or a user record lacks
            ``"content"``.
        OSError: If either file cannot be opened.
    """
    print(f"\n>>> Deduping {input_file}")
    seen = set()
    whitespace = re.compile(r"\s+")
    # True directly after a new user message has been written: the next
    # non-user record is its reply and has to be kept as well.
    keep_reply = False

    # The input is opened first so that a missing input file does not leave a
    # freshly truncated output file behind; both handles are closed even if
    # parsing fails halfway through.
    with input_file.open("r", encoding="utf-8") as src:
        with output_file.open("w", encoding="utf-8") as dst:
            for line in tqdm(src, desc="reading"):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                msg = json.loads(line)
                if msg["role"] != "user":
                    if keep_reply:
                        # Copy the reply verbatim; only the newline is normalized.
                        dst.write(line.rstrip("\n") + "\n")
                        keep_reply = False
                    continue

                key = whitespace.sub(" ", msg["content"]).strip().lower()
                # A user message always goes through the duplicate check, even
                # when the previous user message never received a reply.
                keep_reply = key not in seen
                if keep_reply:
                    seen.add(key)
                    dst.write(json.dumps(msg, ensure_ascii=False) + "\n")

    print(f"✅ {output_file} – {len(seen)} einzigartige User‑Phrasen.")
|
||||
Reference in New Issue
Block a user