#!/usr/bin/env python3 """ build_blocks_db.py ------------------ Reads sample_input.xlsx (State, District Name, Block Name, Latitude, Longitude, PIN Code) and produces blocks_db.json in the schema the patched block_verify.php expects: { "blocks": { "": { "st": "", # NEW field "d": "", "b": "", "lat": , "lon": , "c": "low|medium|high", "s": "seed|manual|osm|none", "m": "", "pin": "" # carried for reference }, ... }, "state_centres": { # Mean of seed coords per state. JS uses this as "": [lat, lon] # fallback map centre when a state is selected. }, "meta": { "generated": "", "source": "sample_input.xlsx" } } Cleanup rules (applied in order): 1. Drop rows where State, District, or Block is empty / None. 2. Drop rows where State is one of the stray-header values: "State", "State Name". 3. Normalize state spellings: Chattisgarh -> Chhattisgarh Jammu & Kashmir -> Jammu and Kashmir (UT) New Delhi -> Delhi (NCT) 4. Trim whitespace on all string fields. 5. Validate lat/lon: numeric and inside India bbox (lat 6..37, lon 68..98). Out-of-range or non-numeric coords -> null + status='none'. """ import json import re import sys import unicodedata from collections import defaultdict from datetime import datetime, timezone from pathlib import Path import openpyxl # ---- Config ----------------------------------------------------------------- SRC = Path("/home/claude/sample_input.xlsx") OUT_JSON = Path("/home/claude/blocks_db.json") OUT_REPORT = Path("/home/claude/cleanup_report.md") STATE_ALIASES = { "chattisgarh": "Chhattisgarh", "jammu & kashmir": "Jammu and Kashmir (UT)", "new delhi": "Delhi (NCT)", } GARBAGE_STATES = {"state", "state name", ""} INDIA_BBOX = (6.0, 37.0, 68.0, 98.0) # min_lat, max_lat, min_lon, max_lon def slugify(s: str) -> str: """Stable lowercase slug for keys: alnum + underscores.""" s = unicodedata.normalize("NFKD", s) s = s.encode("ascii", "ignore").decode("ascii") s = re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_").lower() return s or "unknown" def cell_str(v) -> str: if v is None: return "" return str(v).strip() def parse_coord(v): """Return float or None if not a finite number.""" if v is None or v == "": return None try: f = float(v) except (TypeError, ValueError): return None if f != f: # NaN return None return f def normalize_state(raw: str) -> str: return STATE_ALIASES.get(raw.strip().lower(), raw.strip()) # ---- Read & clean ----------------------------------------------------------- wb = openpyxl.load_workbook(SRC, data_only=True) ws = wb["Blocks"] raw_rows = list(ws.iter_rows(min_row=2, values_only=True)) stats = { "total_input_rows": len(raw_rows), "dropped_blank": 0, "dropped_header_state": 0, "dropped_missing_state_or_district": 0, "synthesized_block_from_district": 0, "state_renamed": defaultdict(int), "coord_invalid_or_missing": 0, "coord_out_of_india": 0, "duplicate_keys_appended": 0, "kept": 0, } blocks = {} seen_keys = set() # For per-state centroid (only count valid coords) state_lat_sum = defaultdict(float) state_lon_sum = defaultdict(float) state_coord_count = defaultdict(int) for raw in raw_rows: state, dist, block, lat, lon, pin = (raw + (None,) * 6)[:6] state = cell_str(state) dist = cell_str(dist) block = cell_str(block) pin = cell_str(pin) # Rule 1: fully blank if not state and not dist and not block: stats["dropped_blank"] += 1 continue # Rule 2: stray header rows if state.strip().lower() in GARBAGE_STATES: stats["dropped_header_state"] += 1 continue # Rule 1b: must have state and district (block is optional — see below) if not state or not dist: stats["dropped_missing_state_or_district"] += 1 continue # If block is missing, this row represents the district HQ itself — # keep it but synthesize a block name so it shows up in the verifier. note = "" if not block: block = f"{dist} (District HQ)" note = "District HQ entry — no separate block listed in source" stats["synthesized_block_from_district"] += 1 # Rule 3: state alias norm_state = normalize_state(state) if norm_state != state: stats["state_renamed"][f"{state} -> {norm_state}"] += 1 # Coords flat = parse_coord(lat) flon = parse_coord(lon) confidence = "low" source = "seed" if flat is None or flon is None: stats["coord_invalid_or_missing"] += 1 flat = None flon = None source = "none" confidence = "low" else: mn_lat, mx_lat, mn_lon, mx_lon = INDIA_BBOX if not (mn_lat <= flat <= mx_lat and mn_lon <= flon <= mx_lon): stats["coord_out_of_india"] += 1 flat = None flon = None source = "none" confidence = "low" else: # Round seed coords to 6 decimals to keep JSON tidy flat = round(flat, 6) flon = round(flon, 6) confidence = "medium" # Stable key: state__district__block (slug) key = f"{slugify(norm_state)}__{slugify(dist)}__{slugify(block)}" if key in seen_keys: # If duplicate key, append a counter so we don't lose the row. i = 2 while f"{key}__{i}" in seen_keys: i += 1 key = f"{key}__{i}" stats["duplicate_keys_appended"] += 1 seen_keys.add(key) blocks[key] = { "st": norm_state, "d": dist, "b": block, "lat": flat, "lon": flon, "c": confidence, "s": source, "m": note, "pin": pin, } if flat is not None and flon is not None: state_lat_sum[norm_state] += flat state_lon_sum[norm_state] += flon state_coord_count[norm_state] += 1 stats["kept"] += 1 # ---- Per-state centroid ----------------------------------------------------- state_centres = {} for st, n in state_coord_count.items(): if n > 0: state_centres[st] = [ round(state_lat_sum[st] / n, 5), round(state_lon_sum[st] / n, 5), ] # ---- Write JSON ------------------------------------------------------------- db = { "meta": { "generated": datetime.now(timezone.utc).isoformat(timespec="seconds"), "source": "sample_input.xlsx", "schema_version": 2, }, "state_centres": state_centres, "blocks": blocks, } OUT_JSON.write_text(json.dumps(db, ensure_ascii=False, indent=2), encoding="utf-8") # ---- Cleanup report --------------------------------------------------------- lines = [] lines.append("# blocks_db.json — Cleanup Report\n") lines.append(f"_Generated: {db['meta']['generated']}_\n") lines.append(f"_Source: `{SRC.name}`_\n\n") lines.append("## Summary\n") lines.append(f"- Input rows scanned: **{stats['total_input_rows']}**") lines.append(f"- Blocks kept: **{stats['kept']}**") lines.append( f"- Rows dropped: **{stats['dropped_blank'] + stats['dropped_header_state'] + stats['dropped_missing_state_or_district']}**" ) lines.append(f" - fully blank rows: {stats['dropped_blank']}") lines.append(f" - stray header rows (State='State' / 'State Name'): {stats['dropped_header_state']}") lines.append(f" - missing state or district name: {stats['dropped_missing_state_or_district']}") lines.append(f"- Rows where block name was synthesized from district (district-HQ entries): **{stats['synthesized_block_from_district']}**") lines.append(f"- Coordinate issues (kept block, cleared coords): **{stats['coord_invalid_or_missing'] + stats['coord_out_of_india']}**") lines.append(f" - missing/non-numeric: {stats['coord_invalid_or_missing']}") lines.append(f" - outside India bbox: {stats['coord_out_of_india']}") lines.append(f"- Duplicate keys (suffixed with __N): {stats['duplicate_keys_appended']}\n") lines.append("## State name normalizations\n") if stats["state_renamed"]: lines.append("| Original | Normalized | Rows affected |") lines.append("|---|---|---|") for k, v in sorted(stats["state_renamed"].items()): orig, _, new = k.partition(" -> ") lines.append(f"| {orig} | {new} | {v} |") else: lines.append("_None_") lines.append("") # Final state counts final_counts = defaultdict(int) final_with_coords = defaultdict(int) for b in blocks.values(): final_counts[b["st"]] += 1 if b["lat"] is not None: final_with_coords[b["st"]] += 1 lines.append("## Final state breakdown (after cleanup)\n") lines.append(f"_{len(final_counts)} unique states/UTs_\n") lines.append("| State | Blocks | With seed coords |") lines.append("|---|---:|---:|") for st in sorted(final_counts.keys()): lines.append(f"| {st} | {final_counts[st]} | {final_with_coords[st]} |") lines.append("") OUT_REPORT.write_text("\n".join(lines), encoding="utf-8") # Console summary print(f"Wrote {OUT_JSON} ({len(blocks)} blocks across {len(final_counts)} states)") print(f"Wrote {OUT_REPORT}") print() print("Cleanup summary:") for k, v in stats.items(): if k == "state_renamed": for kk, vv in v.items(): print(f" rename: {kk} ({vv} rows)") else: print(f" {k}: {v}")