#!/usr/bin/env python3
"""
build_blocks_db.py
------------------
Reads sample_input.xlsx (State, District Name, Block Name, Latitude, Longitude, PIN Code)
and produces blocks_db.json in the schema the patched block_verify.php expects:

  {
    "blocks": {
      "<key>": {
        "st": "<state>",        # NEW field
        "d":  "<district>",
        "b":  "<block>",
        "lat": <float|null>,
        "lon": <float|null>,
        "c":  "low|medium|high",
        "s":  "seed|manual|osm|none",
        "m":  "<note>",
        "pin": "<pin>"          # carried for reference
      }, ...
    },
    "state_centres": {           # Mean of seed coords per state. JS uses this as
        "<state>": [lat, lon]    # fallback map centre when a state is selected.
    },
    "meta": { "generated": "<iso>", "source": "sample_input.xlsx" }
  }

Cleanup rules (applied in order):
  1. Drop rows where State, District, or Block is empty / None.
  2. Drop rows where State is one of the stray-header values: "State", "State Name".
  3. Normalize state spellings:
       Chattisgarh           -> Chhattisgarh
       Jammu & Kashmir       -> Jammu and Kashmir (UT)
       New Delhi             -> Delhi (NCT)
  4. Trim whitespace on all string fields.
  5. Validate lat/lon: numeric and inside India bbox (lat 6..37, lon 68..98).
     Out-of-range or non-numeric coords -> null + status='none'.
"""

import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import openpyxl

# ---- Config -----------------------------------------------------------------
SRC = Path("/home/claude/sample_input.xlsx")
OUT_JSON = Path("/home/claude/blocks_db.json")
OUT_REPORT = Path("/home/claude/cleanup_report.md")

STATE_ALIASES = {
    "chattisgarh": "Chhattisgarh",
    "jammu & kashmir": "Jammu and Kashmir (UT)",
    "new delhi": "Delhi (NCT)",
}

GARBAGE_STATES = {"state", "state name", ""}

INDIA_BBOX = (6.0, 37.0, 68.0, 98.0)  # min_lat, max_lat, min_lon, max_lon


def slugify(s: str) -> str:
    """Stable lowercase slug for keys: alnum + underscores."""
    s = unicodedata.normalize("NFKD", s)
    s = s.encode("ascii", "ignore").decode("ascii")
    s = re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_").lower()
    return s or "unknown"


def cell_str(v) -> str:
    if v is None:
        return ""
    return str(v).strip()


def parse_coord(v):
    """Return float or None if not a finite number."""
    if v is None or v == "":
        return None
    try:
        f = float(v)
    except (TypeError, ValueError):
        return None
    if f != f:  # NaN
        return None
    return f


def normalize_state(raw: str) -> str:
    return STATE_ALIASES.get(raw.strip().lower(), raw.strip())


# ---- Read & clean -----------------------------------------------------------
wb = openpyxl.load_workbook(SRC, data_only=True)
ws = wb["Blocks"]

raw_rows = list(ws.iter_rows(min_row=2, values_only=True))

stats = {
    "total_input_rows": len(raw_rows),
    "dropped_blank": 0,
    "dropped_header_state": 0,
    "dropped_missing_state_or_district": 0,
    "synthesized_block_from_district": 0,
    "state_renamed": defaultdict(int),
    "coord_invalid_or_missing": 0,
    "coord_out_of_india": 0,
    "duplicate_keys_appended": 0,
    "kept": 0,
}

blocks = {}
seen_keys = set()

# For per-state centroid (only count valid coords)
state_lat_sum = defaultdict(float)
state_lon_sum = defaultdict(float)
state_coord_count = defaultdict(int)

for raw in raw_rows:
    state, dist, block, lat, lon, pin = (raw + (None,) * 6)[:6]
    state = cell_str(state)
    dist = cell_str(dist)
    block = cell_str(block)
    pin = cell_str(pin)

    # Rule 1: fully blank
    if not state and not dist and not block:
        stats["dropped_blank"] += 1
        continue

    # Rule 2: stray header rows
    if state.strip().lower() in GARBAGE_STATES:
        stats["dropped_header_state"] += 1
        continue

    # Rule 1b: must have state and district (block is optional — see below)
    if not state or not dist:
        stats["dropped_missing_state_or_district"] += 1
        continue

    # If block is missing, this row represents the district HQ itself —
    # keep it but synthesize a block name so it shows up in the verifier.
    note = ""
    if not block:
        block = f"{dist} (District HQ)"
        note = "District HQ entry — no separate block listed in source"
        stats["synthesized_block_from_district"] += 1

    # Rule 3: state alias
    norm_state = normalize_state(state)
    if norm_state != state:
        stats["state_renamed"][f"{state} -> {norm_state}"] += 1

    # Coords
    flat = parse_coord(lat)
    flon = parse_coord(lon)
    confidence = "low"
    source = "seed"
    if flat is None or flon is None:
        stats["coord_invalid_or_missing"] += 1
        flat = None
        flon = None
        source = "none"
        confidence = "low"
    else:
        mn_lat, mx_lat, mn_lon, mx_lon = INDIA_BBOX
        if not (mn_lat <= flat <= mx_lat and mn_lon <= flon <= mx_lon):
            stats["coord_out_of_india"] += 1
            flat = None
            flon = None
            source = "none"
            confidence = "low"
        else:
            # Round seed coords to 6 decimals to keep JSON tidy
            flat = round(flat, 6)
            flon = round(flon, 6)
            confidence = "medium"

    # Stable key: state__district__block (slug)
    key = f"{slugify(norm_state)}__{slugify(dist)}__{slugify(block)}"
    if key in seen_keys:
        # If duplicate key, append a counter so we don't lose the row.
        i = 2
        while f"{key}__{i}" in seen_keys:
            i += 1
        key = f"{key}__{i}"
        stats["duplicate_keys_appended"] += 1
    seen_keys.add(key)

    blocks[key] = {
        "st": norm_state,
        "d": dist,
        "b": block,
        "lat": flat,
        "lon": flon,
        "c": confidence,
        "s": source,
        "m": note,
        "pin": pin,
    }
    if flat is not None and flon is not None:
        state_lat_sum[norm_state] += flat
        state_lon_sum[norm_state] += flon
        state_coord_count[norm_state] += 1
    stats["kept"] += 1

# ---- Per-state centroid -----------------------------------------------------
state_centres = {}
for st, n in state_coord_count.items():
    if n > 0:
        state_centres[st] = [
            round(state_lat_sum[st] / n, 5),
            round(state_lon_sum[st] / n, 5),
        ]

# ---- Write JSON -------------------------------------------------------------
db = {
    "meta": {
        "generated": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source": "sample_input.xlsx",
        "schema_version": 2,
    },
    "state_centres": state_centres,
    "blocks": blocks,
}

OUT_JSON.write_text(json.dumps(db, ensure_ascii=False, indent=2), encoding="utf-8")

# ---- Cleanup report ---------------------------------------------------------
lines = []
lines.append("# blocks_db.json — Cleanup Report\n")
lines.append(f"_Generated: {db['meta']['generated']}_\n")
lines.append(f"_Source: `{SRC.name}`_\n\n")

lines.append("## Summary\n")
lines.append(f"- Input rows scanned: **{stats['total_input_rows']}**")
lines.append(f"- Blocks kept: **{stats['kept']}**")
lines.append(
    f"- Rows dropped: **{stats['dropped_blank'] + stats['dropped_header_state'] + stats['dropped_missing_state_or_district']}**"
)
lines.append(f"  - fully blank rows: {stats['dropped_blank']}")
lines.append(f"  - stray header rows (State='State' / 'State Name'): {stats['dropped_header_state']}")
lines.append(f"  - missing state or district name: {stats['dropped_missing_state_or_district']}")
lines.append(f"- Rows where block name was synthesized from district (district-HQ entries): **{stats['synthesized_block_from_district']}**")
lines.append(f"- Coordinate issues (kept block, cleared coords): **{stats['coord_invalid_or_missing'] + stats['coord_out_of_india']}**")
lines.append(f"  - missing/non-numeric: {stats['coord_invalid_or_missing']}")
lines.append(f"  - outside India bbox: {stats['coord_out_of_india']}")
lines.append(f"- Duplicate keys (suffixed with __N): {stats['duplicate_keys_appended']}\n")

lines.append("## State name normalizations\n")
if stats["state_renamed"]:
    lines.append("| Original | Normalized | Rows affected |")
    lines.append("|---|---|---|")
    for k, v in sorted(stats["state_renamed"].items()):
        orig, _, new = k.partition(" -> ")
        lines.append(f"| {orig} | {new} | {v} |")
else:
    lines.append("_None_")
lines.append("")

# Final state counts
final_counts = defaultdict(int)
final_with_coords = defaultdict(int)
for b in blocks.values():
    final_counts[b["st"]] += 1
    if b["lat"] is not None:
        final_with_coords[b["st"]] += 1

lines.append("## Final state breakdown (after cleanup)\n")
lines.append(f"_{len(final_counts)} unique states/UTs_\n")
lines.append("| State | Blocks | With seed coords |")
lines.append("|---|---:|---:|")
for st in sorted(final_counts.keys()):
    lines.append(f"| {st} | {final_counts[st]} | {final_with_coords[st]} |")
lines.append("")

OUT_REPORT.write_text("\n".join(lines), encoding="utf-8")

# Console summary
print(f"Wrote {OUT_JSON}  ({len(blocks)} blocks across {len(final_counts)} states)")
print(f"Wrote {OUT_REPORT}")
print()
print("Cleanup summary:")
for k, v in stats.items():
    if k == "state_renamed":
        for kk, vv in v.items():
            print(f"  rename: {kk}  ({vv} rows)")
    else:
        print(f"  {k}: {v}")
