- Created a new JSONL file for the Align & Mine event dataset containing user-assistant interactions. - Added a loot table JSONL file detailing various items and their acquisition methods. - Implemented a deduplication script to filter unique user messages from a dataset. - Developed a paraphrasing script to generate variations of user messages using multiple models.
29 lines
999 B
Python
29 lines
999 B
Python
import json, re
|
||
from pathlib import Path
|
||
from tqdm.auto import tqdm
|
||
|
||
def dedupe(input_file: Path, output_file: Path):
    """Copy a JSONL chat log, keeping only the first occurrence of each user message.

    Every non-blank line of ``input_file`` is parsed as a JSON object that
    has a ``"role"`` key; user messages also need a ``"content"`` key. Two
    user messages count as duplicates when their content is equal after
    collapsing whitespace runs to one space, stripping, and lower-casing.

    For each user message seen for the first time, the message is written to
    ``output_file``, followed by the next non-user line (copied verbatim),
    which is taken to be its reply. Duplicated user messages, their replies
    and any other non-user lines are dropped.

    Args:
        input_file: UTF-8 JSONL file to read.
        output_file: UTF-8 JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a message has no ``"role"``, or a user message has no
            ``"content"``.
    """
    print(f"\n>>> Deduping {input_file}")
    seen = set()
    # Compiled once; used to normalise whitespace in every user message.
    whitespace_run = re.compile(r"\s+")
    # True right after a new user message has been written: the next
    # non-user line is its reply and must be copied as well.
    copy_reply = False

    # The input is opened first so a missing input does not leave behind a
    # freshly truncated output file; both handles are closed even on error.
    with input_file.open("r", encoding="utf-8") as src, \
            output_file.open("w", encoding="utf-8") as dst:
        # Read the file line by line (JSONL).
        for line in tqdm(src, desc="reading"):
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue

            msg = json.loads(line)
            if msg["role"] != "user":
                if copy_reply:
                    # Reply of the user message just kept: copy it unchanged.
                    dst.write(line.rstrip("\n") + "\n")
                    copy_reply = False
                continue

            key = whitespace_run.sub(" ", msg["content"]).strip().lower()
            # A user message always goes through the duplicate check, even
            # when it directly follows another user message.
            copy_reply = key not in seen
            if copy_reply:
                seen.add(key)
                dst.write(json.dumps(msg, ensure_ascii=False) + "\n")

    print(f"✅ {output_file} – {len(seen)} einzigartige User‑Phrasen.")
|