import json
import re
from pathlib import Path

try:
    from tqdm.auto import tqdm
except ImportError:
    # The progress bar is purely cosmetic; without tqdm, iterate silently.
    def tqdm(iterable, **_kwargs):
        """Pass-through stand-in for ``tqdm`` when the package is missing."""
        return iterable


# Compiled once: collapses every run of whitespace when building dedupe keys.
_WHITESPACE_RE = re.compile(r"\s+")


def dedupe(input_file: Path, output_file: Path) -> None:
    """Copy a JSONL chat log, dropping repeated user messages.

    Each non-blank line of ``input_file`` is parsed as a JSON object with a
    ``"role"`` key. A user message (``role == "user"``) is a duplicate when
    its ``"content"``, with whitespace runs collapsed to one space, stripped
    and lower-cased, has been seen before. The first occurrence is written
    to ``output_file``, followed by the line directly after it if that line
    is not itself a user message (the reply). Duplicates and their replies
    are dropped.

    Args:
        input_file: JSONL file to read (UTF-8).
        output_file: JSONL file to create or overwrite (UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"role"``, or a user record lacks
            ``"content"``.
    """
    print(f"\n>>> Deduping {input_file}")
    seen = set()

    # Context managers guarantee both handles are closed even if parsing fails.
    with input_file.open("r", encoding="utf-8") as src:
        with output_file.open("w", encoding="utf-8") as dst:
            # True while the previous record was a user message we kept, i.e.
            # the next non-user line is its reply and must be copied too.
            reply_pending = False

            # The file is streamed line by line (JSONL).
            for line in tqdm(src, desc="reading"):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue

                msg = json.loads(line)

                if msg["role"] != "user":
                    if reply_pending:
                        # Copy the reply verbatim instead of re-serialising it.
                        dst.write(line.rstrip("\n") + "\n")
                        reply_pending = False
                    continue

                # Deciding by role (rather than blindly consuming the next
                # line) means back-to-back user messages are each deduplicated.
                key = _WHITESPACE_RE.sub(" ", msg["content"]).strip().lower()
                reply_pending = key not in seen
                if reply_pending:
                    seen.add(key)
                    dst.write(json.dumps(msg, ensure_ascii=False) + "\n")

    print(f"✅ {output_file} – {len(seen)} einzigartige User‑Phrasen.")