"""Build fig2b_invariance.svg — standalone view-invariance figure.

Shows a single LIBERO observer render with two camera frustums from
different poses converging on the same 3D target. Each ray is labeled
with its depth, and the vertical drop to the ground is labeled with
the constant height z*.

Uses the rendered PNG + metadata produced by render_fig2b_two_frustums.py.
"""

import base64
import json
import time

# Wall-clock start of the script; the final status print reports the time
# elapsed since this point.
_t = time.time()

def b64(p) -> str:
    """Return the contents of the file at *p* as a base64-encoded ASCII string.

    The file is read in binary mode and closed deterministically via a
    context manager instead of relying on garbage collection.

    Args:
        p: Path of the file to encode (anything accepted by ``open``).

    Returns:
        The base64 text of the file's bytes, suitable for a ``data:`` URI.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(p, "rb") as fh:
        return base64.b64encode(fh.read()).decode()


# Directory holding the pre-rendered observer image and its metadata.
ASSETS = "/data/cameron/penpot/figures/extracted"
# The render is inlined into the SVG as a base64 data URI.
two_frustums_b64 = b64(f"{ASSETS}/fig2v3/two_frustums.png")
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# with whatever the locale default happens to be.
with open(f"{ASSETS}/fig2v3/two_frustums_meta.json", encoding="utf-8") as f:
    meta = json.load(f)

# Canvas + image placement. Image at a prominent, readable size.
CANVAS_W = 820
CANVAS_H = 880
IMG_W = 720
IMG_H = 720
IMG_X = (CANVAS_W - IMG_W) // 2   # 50 — centres the image horizontally
IMG_Y = 60
# Factor from render coordinates to on-canvas coordinates.
# NOTE(review): assumes meta["image_size"] is a single number (side length of
# a square render, in pixels) — confirm against the render script.
scale = IMG_W / meta["image_size"]


def to_svg(pt):
    """Map a 2D point from render coordinates to SVG canvas coordinates.

    The first two components of *pt* are multiplied by the module-level
    ``scale`` and offset by the image's top-left corner (``IMG_X``, ``IMG_Y``).

    Args:
        pt: Indexable with at least two numeric entries (x at 0, y at 1).

    Returns:
        An ``(x, y)`` tuple in canvas units.
    """
    canvas_x = IMG_X + pt[0] * scale
    canvas_y = IMG_Y + pt[1] * scale
    return (canvas_x, canvas_y)


# Anchor points read from the metadata, converted to canvas coordinates.
# Cameras:
camA_x, camA_y = to_svg(meta["cam_A_2d"])
camB_x, camB_y = to_svg(meta["cam_B_2d"])
# Target and the "target_on_table" point:
tgt_x, tgt_y = to_svg(meta["target_2d"])
tot_x, tot_y = to_svg(meta["target_on_table_2d"])
# Midpoints used to anchor the depth and height callouts:
rayA_mx, rayA_my = to_svg(meta["ray_A_mid_2d"])
rayB_mx, rayB_my = to_svg(meta["ray_B_mid_2d"])
hgt_mx, hgt_my = to_svg(meta["height_mid_2d"])

# Label text: each quantity to two decimals followed by " m".
depth_A = format(meta["depth_A"], ".2f") + " m"
depth_B = format(meta["depth_B"], ".2f") + " m"
height_z = format(meta["height_z"], ".2f") + " m"

# Colour palette (hex RGB) used by the SVG template below.
GREEN = "#16653a"   # image frame, height callout, 3D-target label, tagline
GRAY = "#6b7280"    # figure title
SLATE = "#334155"   # summary caption text
LIGHT = "#e5e7eb"   # divider line under the title
BLUE = "#2882eb"    # camera A label and depth callout
ORANGE = "#eb641c"  # camera B label and depth callout
RED = "#d72d32"     # not referenced by the template in this script

# Complete SVG document, built as a single f-string. In order it contains:
# white background, title + divider, the framed and clipped base64 render,
# camera labels, the two depth callouts, the height callout, the 3D-target
# label, and the two caption lines under the image.
# Every callout is positioned as a fixed pixel offset from one of the
# projected anchor points computed above (offsets are presumably hand-tuned
# for this particular render — re-check them if the render changes).
# Because this is an f-string, any literal brace added to the markup must be
# doubled.
svg = f'''<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
     viewBox="0 0 {CANVAS_W} {CANVAS_H}" width="{CANVAS_W}" height="{CANVAS_H}"
     font-family="Inter, Arial, sans-serif">
  <defs>
    <filter id="card-shadow" x="-10%" y="-10%" width="120%" height="130%">
      <feDropShadow dx="0" dy="1.5" stdDeviation="3" flood-color="#000" flood-opacity="0.08"/>
    </filter>
  </defs>

  <rect width="{CANVAS_W}" height="{CANVAS_H}" fill="#ffffff"/>

  <text x="30" y="34" font-size="15" font-weight="700" fill="{GRAY}" letter-spacing="0.02em">
    Height prediction is view-invariant
  </text>
  <line x1="30" y1="43" x2="{CANVAS_W - 30}" y2="43" stroke="{LIGHT}" stroke-width="1"/>

  <!-- Framed image -->
  <rect x="{IMG_X - 6}" y="{IMG_Y - 6}" width="{IMG_W + 12}" height="{IMG_H + 12}" rx="14"
        fill="#ffffff" stroke="{GREEN}" stroke-width="2.5" filter="url(#card-shadow)"/>
  <clipPath id="clip-pb"><rect x="{IMG_X}" y="{IMG_Y}" width="{IMG_W}" height="{IMG_H}" rx="8"/></clipPath>
  <image xlink:href="data:image/png;base64,{two_frustums_b64}"
         x="{IMG_X}" y="{IMG_Y}" width="{IMG_W}" height="{IMG_H}"
         preserveAspectRatio="xMidYMid slice" clip-path="url(#clip-pb)"/>

  <!-- Camera labels -->
  <text x="{camA_x - 28}" y="{camA_y + 44}" text-anchor="end" font-size="16" font-weight="800" fill="{BLUE}">camera A</text>
  <text x="{camB_x + 30}" y="{camB_y - 10}" font-size="16" font-weight="800" fill="{ORANGE}">camera B</text>

  <!-- Depth A callout -->
  <rect x="{rayA_mx - 86}" y="{rayA_my - 70}" width="150" height="30" rx="6"
        fill="#ffffff" stroke="{BLUE}" stroke-width="2.2"/>
  <text x="{rayA_mx - 11}" y="{rayA_my - 49}" text-anchor="middle" font-size="16" font-weight="800" fill="{BLUE}">
    depth = {depth_A}
  </text>
  <line x1="{rayA_mx - 11}" y1="{rayA_my - 40}" x2="{rayA_mx - 1}" y2="{rayA_my - 5}"
        stroke="{BLUE}" stroke-width="1.6"/>

  <!-- Depth B callout -->
  <rect x="{rayB_mx + 34}" y="{rayB_my - 16}" width="150" height="30" rx="6"
        fill="#ffffff" stroke="{ORANGE}" stroke-width="2.2"/>
  <text x="{rayB_mx + 109}" y="{rayB_my + 5}" text-anchor="middle" font-size="16" font-weight="800" fill="{ORANGE}">
    depth = {depth_B}
  </text>
  <line x1="{rayB_mx + 32}" y1="{rayB_my - 2}" x2="{rayB_mx + 6}" y2="{rayB_my + 2}"
        stroke="{ORANGE}" stroke-width="1.6"/>

  <!-- Height bracket callout -->
  <rect x="{tot_x + 38}" y="{hgt_my - 11}" width="178" height="30" rx="6"
        fill="#ffffff" stroke="{GREEN}" stroke-width="2.2"/>
  <text x="{tot_x + 127}" y="{hgt_my + 10}" text-anchor="middle" font-size="16" font-weight="800" fill="{GREEN}">
    height z* = {height_z}
  </text>
  <line x1="{tot_x + 36}" y1="{hgt_my + 4}" x2="{tot_x + 6}" y2="{hgt_my + 4}"
        stroke="{GREEN}" stroke-width="1.6"/>

  <!-- 3D target label -->
  <text x="{tgt_x - 26}" y="{tgt_y - 20}" text-anchor="end" font-size="15" font-weight="800" fill="{GREEN}">3D target</text>
  <line x1="{tgt_x - 24}" y1="{tgt_y - 16}" x2="{tgt_x - 6}" y2="{tgt_y - 4}"
        stroke="{GREEN}" stroke-width="1.6"/>

  <!-- Summary caption below the image -->
  <text x="{CANVAS_W / 2}" y="{IMG_Y + IMG_H + 40}" text-anchor="middle" font-size="15" font-weight="600" fill="{SLATE}">
    Same 3D target, very different depths
    (<tspan fill="{BLUE}" font-weight="800">{depth_A}</tspan>
    vs <tspan fill="{ORANGE}" font-weight="800">{depth_B}</tspan>)
    — but the <tspan fill="{GREEN}" font-weight="800">height z*</tspan> to ground is identical.
  </text>
  <text x="{CANVAS_W / 2}" y="{IMG_Y + IMG_H + 62}" text-anchor="middle" font-size="14" font-weight="700" fill="{GREEN}" font-style="italic">
    PARA predicts height directly → view-invariant.
  </text>
</svg>
'''

# Write the figure and report elapsed time and size.
out = "/data/cameron/para/paper/figs/svg/fig2b_invariance.svg"
# The XML prolog declares UTF-8 and the caption contains non-ASCII characters
# ("—", "→"), so the file must be written as UTF-8 regardless of the locale's
# default encoding.
with open(out, "w", encoding="utf-8") as f:
    f.write(svg)
# len(svg) counts characters; encode to report the actual UTF-8 byte size.
svg_size = len(svg.encode("utf-8"))
print(f"[{time.time()-_t:.2f}s] wrote {out} ({svg_size} bytes)")
