import csv
import json

# Entity names that may appear after the "B-" / "I-" prefix of a label.
VALID_ENTITIES = {
    # global
    "ADRESSE",
    "VILLE",
    "PRIX_VENTE",
    "SUPERFICIE_HAB",
    "NB_NIVEAUX",
    "NB_PIECES",
    "SUPERFICIE_TERRAIN",
    "ANNEE_CONSTRUCTION",
    "STATUT_PROPRIETE",
    "TYPE_BIEN",
    "DPE",
    "GES",
    "CHARGES_IMMEUBLE_MOIS",
    "TAXE_FONCIERE",
    # lot
    "LOT_NOMBRE",
    "LOT_ETAGE",
    "LOT_SUPERFICIE_UNIT",
    "LOT_SUPERFICIE_TOTAL",
    "LOT_LOYER",
    "LOT_CHARGES",
    "LOT_TYPOLOGIE",
    "LOT_BAIL",
    "LOT_DPE",
    "LOT_OCCUPATION",
}

# Full set of accepted labels: the bare "O" label plus one "B-<entity>" and
# one "I-<entity>" label for every name in VALID_ENTITIES.
VALID_LABELS = {"O"}.union(
    (f"B-{name}" for name in VALID_ENTITIES),
    (f"I-{name}" for name in VALID_ENTITIES),
)


def normalize_label(label: object) -> str:
    """Return *label* if it is one of VALID_LABELS, otherwise ``"O"``.

    The labels passed in by this script come from decoded JSON, so they are
    not guaranteed to be strings. Any non-string value is mapped to ``"O"``
    without being looked up: an unhashable value (a JSON array or object
    decodes to a list or dict) would otherwise make the set membership test
    raise ``TypeError``.
    """
    if isinstance(label, str) and label in VALID_LABELS:
        return label
    return "O"


# Script body: copy the CSV at input_path to output_path, rewriting the JSON
# document held in the second column so that every entry of its "labels" list
# goes through normalize_label(). Rows that cannot be processed (too short,
# empty or invalid JSON, unexpected JSON shape) are copied through unchanged
# and counted as skipped.
input_path = "dataset.csv"
output_path = "dataset2.csv"

rows_processed = 0
rows_skipped = 0
labels_replaced = 0

with open(input_path, encoding="utf-8", newline="") as fin, \
     open(output_path, "w", encoding="utf-8", newline="") as fout:

    reader = csv.reader(fin)
    writer = csv.writer(fout)

    # An empty input file has no header row; next() without a default would
    # raise StopIteration here.
    header = next(reader, None)
    if header is not None:
        writer.writerow(header)

    # start=2: the header is record 1, so the first data record is number 2.
    for line_no, row in enumerate(reader, start=2):
        if len(row) < 2 or not row[1].strip():
            rows_skipped += 1
            writer.writerow(row)
            continue

        try:
            data = json.loads(row[1])
        except json.JSONDecodeError as e:
            print(f"Ligne {line_no} JSON invalide, ignorée : {e}")
            rows_skipped += 1
            writer.writerow(row)
            continue

        # Valid JSON is not necessarily an object, and "labels" is not
        # necessarily a list (a string would be iterated per character).
        # Leave such rows untouched rather than crash or corrupt them.
        original_labels = data.get("labels", []) if isinstance(data, dict) else None
        if not isinstance(original_labels, list):
            print(f"Ligne {line_no} structure JSON inattendue, ignorée")
            rows_skipped += 1
            writer.writerow(row)
            continue

        new_labels = [normalize_label(label) for label in original_labels]
        labels_replaced += sum(
            new != old for new, old in zip(new_labels, original_labels)
        )

        data["labels"] = new_labels
        # Keep any columns after the second one so processed rows have the
        # same width as the header and the skipped rows.
        writer.writerow([row[0], json.dumps(data, ensure_ascii=False), *row[2:]])
        rows_processed += 1

print("Terminé.")
print(f"  Lignes traitées : {rows_processed}")
print(f"  Lignes ignorées : {rows_skipped}")
print(f"  Labels remplacés par O : {labels_replaced}")
print(f"  Fichier de sortie : {output_path}")
