\documentclass[11pt]{article}

\usepackage[margin=1in]{geometry}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{amsmath, amssymb}
\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{authblk}
\usepackage{microtype}

\hypersetup{colorlinks=true, linkcolor=blue!50!black, citecolor=blue!50!black, urlcolor=blue!50!black}

\title{Image Augmentations for Viewpoint and Spatial Generalization \\
       in ACT-Based Robot Manipulation}
\author{Cameron Smith}
\affil{University of Southern California \\ CSCI 567 Final Project}
\date{May 2026}

\begin{document}
\maketitle

\begin{abstract}
ACT-style behavior cloning policies overfit to the training camera viewpoint,
with success collapsing under even modest viewpoint shifts. We investigate
whether classical 2D image augmentations -- random crop, perspective warp,
rotation, and shear -- can bridge the gap between a small number of real
training viewpoints and a continuous test-time viewpoint distribution on a
LIBERO bowl-on-plate task. We train three ACT models on identical
multi-viewpoint translation data (50 demos across 5 camera positions) under
three augmentation regimes -- none, crop only, and all-aug -- and evaluate
each on two structured OOD grids: a 5$\times$5 camera-translation grid
scored with a \texttt{miss}/\texttt{grasp}/\texttt{place} metric, and an
8$\times$8 spherical camera-rotation grid (a rotation-OOD axis the models
never saw at training). Crop augmentation gives a 5$\times$ lift on
translation OOD (10 vs.\ 2 grasp+ out of 25) and is also the single best
configuration on the rotation grid (19\% overall vs.\ 6\% no-aug), winning
on small rotations that locally resemble translation. All-aug trades
in-distribution accuracy for broader mid-range rotation coverage (37\% at
$\theta\!=\!7.1^\circ$ vs.\ crop's 33\%; 21\% at $\theta\!=\!0^\circ$ vs.\
crop's 63\%). The dominant lesson is that augmentation is essential to
bridge the gaps between discrete training viewpoints, and the right
augmentation depends on the OOD axis being evaluated.
\end{abstract}

\section{Introduction}

End-to-end visuomotor policies trained from a single, fixed camera position
tend to bake the camera extrinsics into their internal representation. Action
Chunking with Transformers (ACT)~\cite{zhao2023learning} is a popular
architecture for low-data behavior cloning, but its CLS-token-based image
representation is not viewpoint-invariant: small camera rotations or
translations change every pixel, and the policy must learn to associate the
new image with the same world-frame action. Without explicit invariance
training, the failure mode is severe -- our baseline drops from 67\% success
at the training viewpoint to a 20\% mean over an 8$\times$8 rotation grid.

\paragraph{Research questions.} We investigate three concrete questions:
\textbf{(Q1)} Given a small number of real training viewpoints, does
augmentation meaningfully improve generalisation to nearby unseen
viewpoints, or is real viewpoint diversity sufficient on its own?
\textbf{(Q2)} Does the specific augmentation matter -- e.g., does
\emph{crop} (which emulates camera translation) generalise differently from
the rotation/shear/perspective components of an \emph{all-aug} pipeline?
\textbf{(Q3)} Do augmentations trained for one OOD axis (translation)
transfer to a different OOD axis (rotation) the model never saw during
training?

\paragraph{Why this is interesting.} Collecting viewpoint-diverse robot
demonstrations is expensive: every additional camera pose multiplies
teleoperation effort. If 2D augmentations can substantially close the gap to
real multi-view data, that is a meaningful practical win. Prior work on
view-invariant representations
\cite{sermanet2018time, james2019sim, jangir2022look} has trended toward
learned canonical features; we instead test the simplest possible
intervention -- standard data augmentation applied to a vanilla ACT policy.

\paragraph{Relation to prior work.} View-invariant policy learning has been
attacked from several angles: contrastive multi-view objectives
\cite{sermanet2018time}, cross-view consistency on transformer features
\cite{jangir2022look}, and viewpoint randomization in simulation
\cite{tobin2017domain, james2019sim}. Augmentation-as-domain-randomization is
well known in image classification \cite{cubuk2019autoaugment} but is
underexplored for action-chunking BC policies that regress absolute 3D
target positions, where 2D image augmentations leave the targets unchanged
and are therefore ``free.'' Concurrently, real multi-view training is the de
facto solution in industry-scale work \cite{brohan2022rt1}; our contribution
is a careful, controlled comparison on a fixed task and architecture.

\section{Methods}

\paragraph{Task and model.} The task is a LIBERO \cite{liu2023libero}
pick-and-place: pick up a black bowl and place it on a plate. We use the ACT
architecture \cite{zhao2023learning} with two backbone variants: (a) DINOv2
ViT-S/16+ \cite{darcet2023vitneed} as a frozen feature extractor, and (b) a
trainable ResNet-18 \cite{he2016deep}. The 384-d CLS token is concatenated
with the current EEF position (3-d), gripper state (1-d), an optional 2-d
``start keypoint'' giving the EEF pixel projection, and a CLIP
\cite{radford2021clip} task embedding (384-d). A small MLP regresses 4-step
chunks of (a) world-frame XYZ targets, normalised to $[0,1]$, and (b)
gripper logits. Critically, the regression \emph{targets are in the world
frame}, not in pixel or camera space, so any 2D image augmentation leaves
them numerically unchanged.

\paragraph{Augmentations.} We implement augmentations directly in the
data-loader (\texttt{data.py}), warping each image with a random
\texttt{warpPerspective} matrix.
\begin{itemize}
\itemsep0pt
  \item \textbf{None} -- raw 448$\times$448 RGB.
  \item \textbf{Perspective} -- random horizontal/vertical perspective warp,
        $\pm 0.15$ strength. Emulates camera rotation.
  \item \textbf{Crop} -- pure pixel-space translation, $\pm 80$\,px shift,
        no resize, via \texttt{warpPerspective}. Emulates camera translation.
  \item \textbf{All} -- composite of rotation ($\pm 10^\circ$), shear
        ($\pm 0.10$), perspective ($\pm 0.10$), and crop ($\pm 35$\,px) at
        85\% scale.
\end{itemize}
Each augmentation fires with probability 0.5 per sample (a key
implementation lesson, see \S\ref{sec:process}).
Figure~\ref{fig:aug-grid} shows the parameter sweep we used to pick the strengths.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.95\linewidth]{figures/augmentation_grid.png}
  \caption{Augmentation parameter sweep. Each row is one augmentation type;
  columns vary the strength. We picked midpoints that visibly distorted the
  scene without warping the bowl off-table or out of frame.}
  \label{fig:aug-grid}
\end{figure}

\paragraph{Training data.} All models share the \textbf{Translation-VP}
dataset on LIBERO \texttt{spatial} task 0: 50 demos collected at 5 camera
positions (centre + four corners, $\pm 10$\,cm horizontal, $\pm 7.5$\,cm
vertical) at the default $\theta$, with the object at a fixed pose, 10
demos per camera. Demonstrations are servo-replay teleoperation
trajectories and contain $\sim$32 frames each. This dataset deliberately
exposes the policy to a small, discrete set of viewpoints; the empirical
question is whether image augmentation can fill in the continuous space
between (and beyond) these 5 cameras. The accompanying 8$\times$8
spherical rotation grid used for evaluation is a separate dataset of 640
demos used only to define camera poses, never for training.

\paragraph{Evaluation protocol.} We evaluate on two structured grids.
(a) \textbf{Rotation grid:} 8 elevations ($\theta$) $\times$ 8 azimuths ($\phi$) = 64 viewpoints, 3
episodes per view, 192 total. (b) \textbf{Translation grid:} 5$\times$5
camera offsets, $dx \in \{-0.15, -0.075, 0, 0.075, 0.15\}$\,m, $dy \in
\{-0.10, -0.05, 0, 0.05, 0.10\}$\,m, 1 episode per cell. For translation we
score with a three-stage metric: \texttt{place} (success), \texttt{grasp}
(bowl lifted but not placed), \texttt{miss} (no grasp). The grasp+ count
(grasp $\cup$ place) is our headline metric for spatial OOD. All eval uses
closed-loop servoing (\texttt{-{}-teleport}) to the next predicted target.

\section{Results}

We train three ACT models on identical multi-viewpoint translation data
(50 demos across 5 camera positions: centre + 4 corners, $\pm 10$\,cm
horizontal, $\pm 7.5$\,cm vertical) and vary only the augmentation pipeline:
\emph{no aug}, \emph{crop 50\%}, and \emph{all aug 50\%}. All three runs
share the ResNet-18 backbone, training time, and hyperparameters; the only
variable is what arrives at the image encoder. We then evaluate every model
on two structured OOD grids: a 5$\times$5 \emph{translation} grid (camera
moved off the seen 5-position skeleton) and an 8$\times$8 spherical
\emph{rotation} grid (a different OOD axis entirely, not present in
training).

\begin{figure}[t]
  \centering
  \includegraphics[width=0.95\linewidth]{figures/distribution_overview.png}
  \caption{Evaluation viewpoint distribution. The polar plot (left) shows
  the 8$\times$8 spherical rotation grid used for the rotation OOD eval;
  the centre cluster marks the default elevation ($\theta\!=\!0$) and the
  other 56 dots are held-out cameras. The image strips show sample
  agent-view frames at training viewpoints (green border, default
  elevation, varied object positions) and held-out test viewpoints (red
  border, varying $\theta,\phi$), illustrating the visual shift the policy
  must handle.}
  \label{fig:dist}
\end{figure}

\subsection{Translation OOD: crop augmentation is essential}

\begin{table}[t]
  \centering
  \caption{5$\times$5 translation grid (1 episode per cell, 25 total) with
  three-stage scoring. ``Grasp+'' = grasp $\cup$ place. All three models are
  trained on the same 50 multi-VP translation demos; only the augmentation
  pipeline differs.}
  \label{tab:trans}
  \begin{tabular}{lrrrr}
    \toprule
    Model & Place & Grasp & Miss & \textbf{Grasp+} \\
    \midrule
    Multi-VP + No Aug         & 0 & 2 & 23 & 2 \\
    Multi-VP + All Aug (50\%) & 0 & 5 & 20 & 5 \\
    Multi-VP + Crop (50\%)    & 2 & 8 & 15 & \textbf{10} \\
    \bottomrule
  \end{tabular}
\end{table}

Table~\ref{tab:trans} shows a 5$\times$ lift from adding crop augmentation
(10 vs.\ 2 grasp+) on the same training data. With only 5 discrete training
viewpoints, the no-aug model fails to interpolate between them: 23 of 25
grid cells are outright misses. Crop augmentation simulates continuous
camera translation in pixel space, filling in the gaps between the real
training viewpoints and giving the policy enough invariance to grasp under
cameras it has never seen. All-aug sits in the middle (5 grasp+): the
rotation/shear/perspective components distort the image along axes that are
\emph{not} the OOD axis we evaluate on here, and they appear to dilute the
crop signal. The conclusion for translation OOD is unambiguous: real
multi-view data alone is not enough -- augmentation is required to bridge
the gaps between the discrete training viewpoints.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.85\linewidth]{figures/crop_vs_translation.png}
  \caption{Why crop augmentation acts as a stand-in for camera
  translation. Each panel sweeps from negative (blue) to positive (red)
  shift along one axis. Left: pixel-space crop applied to the same source
  frame. Right: the simulator rendered from a physically translated
  camera. Top row varies horizontally; bottom row vertically. The two
  produce visually near-identical frames, which is why a crop-augmented
  policy generalises across translated test cameras.}
  \label{fig:crop-trans}
\end{figure}

\subsection{Rotation OOD: cross-axis transfer is partial and aug-specific}

\begin{table}[t]
  \centering
  \caption{8$\times$8 rotation grid (3 episodes per viewpoint, 192 total)
  evaluated on the \emph{same translation-trained models} as
  Table~\ref{tab:trans}. Columns are $\theta$ (elevation, degrees), each
  averaged over all 8 azimuths $\phi$. The models were never trained on
  rotated cameras.}
  \label{tab:rot}
  \footnotesize
  \setlength{\tabcolsep}{4pt}
  \begin{tabular}{lrrrrrrrrr}
    \toprule
    Model & 0.0 & 3.6 & 7.1 & 10.7 & 14.3 & 17.9 & 21.4 & 25.0 & \textbf{Overall} \\
    \midrule
    Multi-VP + No Aug         & 12\% & 21\% & 12\% &  4\% &  0\% &  0\% &  0\% & 0\% &  6\% \\
    Multi-VP + All Aug (50\%) & 21\% &  4\% & \textbf{37\%} & \textbf{17\%} &  4\% &  0\% & \textbf{8\%} & 0\% & 11\% \\
    Multi-VP + Crop (50\%)    & \textbf{63\%} & \textbf{33\%} & 33\% &  8\% & \textbf{8\%} & \textbf{4\%} &  0\% & 0\% & \textbf{19\%} \\
    \bottomrule
  \end{tabular}
\end{table}

Table~\ref{tab:rot} is the more surprising result. The same translation-trained
models, evaluated on a rotation grid they were never exposed to, still
benefit substantially from augmentation: 19\% (crop) and 11\% (all-aug)
vs.\ 6\% (no-aug). Two patterns stand out.

\textbf{Crop dominates near the training elevation} (63\% at $\theta\!=\!0$,
33\% at $\theta\!=\!3.6$). Small camera rotations near the training pose
are locally well-approximated by pixel translation, so the crop-trained
model's lateral invariance carries over directly. At larger rotations,
where the local linearisation breaks down, crop offers little help (at
most 8\% from $\theta\!=\!10.7$ onward).

\textbf{All-aug leads at mid-range rotation} ($\theta\!=\!7.1$ to
$21.4$: 37\%, 17\%, 4\%, 0\%, 8\%; ahead of crop at $7.1$, $10.7$, and
$21.4$). The perspective and shear components of
all-aug are geometrically closer to true camera rotation, so the policy
retains some function at elevations crop cannot cover. The cost is visible
at $\theta\!=\!0$ (21\% vs.\ crop's 63\%) -- mixing in rotation-like
distortions during training trades in-distribution accuracy for OOD
coverage.

The no-aug model peaks at 21\% at $\theta\!=\!3.6$, then degrades steadily
and sits at 0\% from $\theta\!=\!14.3$ onward. With no exposure to any 2D
distortion, the policy cannot
extrapolate beyond the exact viewpoints it saw.

\subsection{Takeaways and error analysis}

The two tables together support a clean conclusion: \emph{augmentation is
essential for bridging the gaps between discrete training viewpoints, and
the right augmentation depends on the OOD axis you care about}. Crop matches
the translation grid exactly (its inductive bias is the OOD axis) and gives
the 5$\times$ lift. Crop also wins overall on the rotation grid because the
near-training-viewpoint slices are well-approximated by translation. All-aug
covers a wider OOD radius but at the cost of in-distribution sharpness. The
no-aug baseline collapses to single-digit success on either axis, confirming
that ACT's CLS-token representation does not inherently generalise across
viewpoint.

We hand-inspected $\sim$30 failure rollouts. Three patterns dominate:
(1) \emph{Wrong target localisation}: at $\theta > 15^\circ$ the predicted
3D target is offset by 5--10\,cm, suggesting the CLS token cannot factor
out elevation. (2) \emph{Premature gripper close}: the gripper-logit head
appears to fire on the bowl's apparent pixel size, which under viewpoint
shift no longer correlates with EEF--bowl distance. (3) \emph{Approach
drift in crop-augmented models at large rotations}: the translational
prior becomes a liability once the camera has rotated enough that the
projection is no longer a near-translation.

A second, surprising observation: \textbf{validation loss does not predict
sim success.} Longer training reliably lowers val loss while degrading
on-grid rollouts. We now treat val loss as a sanity check only and gate
decisions on grid eval.

\section{Process}
\label{sec:process}

The path to the final 3-way comparison in Results was not a straight line.
There were three conceptual pivots, each driven by what the eval grids
told us about the previous approach.

\textbf{Pivot 1: From ``all augmentations'' to isolating individual
augmentations.} Our first instinct was to stack everything -- random
rotation, shear, perspective, and crop -- into a single composite
``all-aug'' pipeline on the assumption that more invariance pressure could
only help. The results were poor and contradictory: the model degraded
sharply at the training viewpoint without much OOD payoff (10\% overall
when run on default-VP single-camera data). We backed off and ran each
augmentation in isolation, which is what surfaced the central asymmetry
the final results hinge on -- \emph{crop} is specifically suited to
translation OOD, while rotation/shear/perspective are specifically suited
to rotation OOD. The composite all-aug configuration still earns a place
in the final ablation, but it stopped being our default.

\textbf{Pivot 2: Curriculum cropping at single-viewpoint scale.} With a
single training viewpoint and crop augmentation, we observed a strong but
narrow effect: crop boosted in-distribution sharpness dramatically (e.g.\
88\% at the training pose) while \emph{degrading} performance away from
it. The natural fix was scheduling -- train with crop for the first half
of the run, then fine-tune without it. This curriculum gave the best
single-viewpoint translation result we obtained (11/25 grasp+), better
than either crop-only or no-aug. The takeaway was that augmentation can
be made to behave by managing its schedule, not just its strength -- a
useful idea that we ultimately set aside once Pivot 3 made it less
necessary.

\textbf{Pivot 3: Augmentations are not a substitute for real
multi-viewpoint data.} Despite the curriculum win, the absolute numbers
on the rotation grid remained discouraging from any single-viewpoint
training run. We collected the Translation-VP dataset (50 demos across
5 real camera positions) and re-ran the augmentation comparison on top
of it. The contrast was sharp: no-aug + multi-VP did not improve on
no-aug + single-VP (2 vs.\ 2 grasp+), but \emph{crop + multi-VP} produced
a 5$\times$ lift to 10 grasp+ -- on par with the single-VP curriculum
trick (11 grasp+) without needing a training schedule. The right mental
model became: real multi-view
data places the policy at a sparse set of viewpoints, and image
augmentation interpolates between them. Neither alone is sufficient, and
this realisation is what set up the final experimental design in Results.

\paragraph{A useful tooling change.} A binary success rate on the
5$\times$5 translation grid was nearly useless because most cells were
outright misses; the metric flattened every gradation of partial success
into a 0. Switching to a \texttt{miss}/\texttt{grasp}/\texttt{place}
three-stage metric was the single biggest qualitative improvement to the
experimental loop: it made the differences between augmentation regimes
visible, and the per-model comparison in Table~\ref{tab:trans} only becomes
legible under three-stage scoring.

\paragraph{A failed direction.} We spent $\approx$2 days on a long
perspective-augmentation run on the full 64-viewpoint dataset, hoping
that more iterations would bridge the rotation-OOD gap. The result was
the opposite -- a 3-hour run reached 6\% overall on the rotation grid,
well below a 10-minute run on the same setup at 26\%. We now believe
long aug runs overfit to the augmentation-induced distribution modes
rather than the underlying task, and we cap aug runs at $\sim$20--30
minutes.

\section{Contributions}

I implemented: the augmentation library in \texttt{data.py} (perspective,
crop, all-aug composite, plus the visualisation overlay
\texttt{viz\_augmentations.py}); the \texttt{-{}-augment} and
\texttt{-{}-backbone} training flags in
\texttt{train.py}/\texttt{model\_act.py}; the camera-translation eval
infrastructure (\texttt{-{}-cam\_dx/dy/dz} in \texttt{eval.py},
\texttt{eval\_full\_grid.py}, \texttt{eval\_translation\_grid.py},
\texttt{eval\_translation\_multistage\_5x5.py}) including the three-stage
\texttt{miss}/\texttt{grasp}/\texttt{place} scoring described in
\S\ref{sec:process}; and the dataset generator
\texttt{generate\_ood\_translation.py} for the Translation-VP dataset.

I used the LIBERO benchmark \cite{liu2023libero}, the ACT architecture from
\cite{zhao2023learning} (re-implemented from scratch for this codebase
rather than imported), DINOv2 weights from
\cite{darcet2023vitneed}, and the OpenAI CLIP encoder
\cite{radford2021clip} for the task embedding. Starter code (the
ACT model skeleton, the LIBERO env wrapper, dataset caching) was provided by
the lab; I extended it with augmentations, the keypoint fix, the new eval
modes, and all experiments and analysis.

\section{AI usage}

I used Claude (Anthropic) extensively as a coding and brainstorming partner.
Specifically: (i) \emph{coding}: generating boilerplate for the
\texttt{warpPerspective}-based augmentation kernel, the multi-stage scoring
loop, and the WandB visualisation overlay; (ii) \emph{debugging}: pair-debugging
the keypoint mismatch (the model's hint that a contradictory input might
explain the in-dist regression accelerated diagnosis by hours);
(iii) \emph{ideation}: it suggested the crop$\rightarrow$noaug curriculum after
I described that crop was helping translation but overfitting; (iv) \emph{writing}:
I drafted this report with Claude and edited every section
manually for accuracy. I cross-checked every numerical claim against the
live JSON results in \texttt{results/*/grid\_results.json}, and I regenerated
the figures in this report from the raw eval logs. I did not let the model
fabricate citations: every reference here was verified to exist with a real
arXiv or venue link before inclusion. All experimental decisions, dataset
construction, training runs, and qualitative interpretations are mine; the
model accelerated execution but did not make scientific calls.

\bibliographystyle{plain}
\begin{thebibliography}{99}

\bibitem{zhao2023learning}
T. Zhao, V. Kumar, S. Levine, C. Finn.
\newblock Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware.
\newblock \emph{RSS}, 2023. \url{https://arxiv.org/abs/2304.13705}

\bibitem{liu2023libero}
B. Liu, Y. Zhu, C. Gao, Y. Feng, Q. Liu, Y. Zhu, P. Stone.
\newblock LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning.
\newblock \emph{NeurIPS Datasets and Benchmarks}, 2023.
\newblock \url{https://arxiv.org/abs/2306.03310}

\bibitem{darcet2023vitneed}
T. Darcet, M. Oquab, J. Mairal, P. Bojanowski.
\newblock Vision Transformers Need Registers.
\newblock \emph{ICLR}, 2024. \url{https://arxiv.org/abs/2309.16588}

\bibitem{he2016deep}
K. He, X. Zhang, S. Ren, J. Sun.
\newblock Deep Residual Learning for Image Recognition.
\newblock \emph{CVPR}, 2016. \url{https://arxiv.org/abs/1512.03385}

\bibitem{radford2021clip}
A. Radford et al.
\newblock Learning Transferable Visual Models From Natural Language Supervision.
\newblock \emph{ICML}, 2021. \url{https://arxiv.org/abs/2103.00020}

\bibitem{sermanet2018time}
P. Sermanet, C. Lynch, Y. Chebotar, J. Hsu, E. Jang, S. Schaal, S. Levine.
\newblock Time-Contrastive Networks: Self-Supervised Learning from Video.
\newblock \emph{ICRA}, 2018. \url{https://arxiv.org/abs/1704.06888}

\bibitem{james2019sim}
S. James, P. Wohlhart, M. Kalakrishnan, D. Kalashnikov, A. Irpan, J. Ibarz,
S. Levine, R. Hadsell, K. Bousmalis.
\newblock Sim-to-Real via Sim-to-Sim: Data-Efficient Robotic Grasping via
Randomized-to-Canonical Adaptation Networks.
\newblock \emph{CVPR}, 2019. \url{https://arxiv.org/abs/1812.07252}

\bibitem{jangir2022look}
R. Jangir, N. Hansen, S. Ghosal, M. Jain, X. Wang.
\newblock Look Closer: Bridging Egocentric and Third-Person Views with
Transformers for Robotic Manipulation.
\newblock \emph{RA-L}, 2022. \url{https://arxiv.org/abs/2201.07779}

\bibitem{tobin2017domain}
J. Tobin, R. Fong, A. Ray, J. Schneider, W. Zaremba, P. Abbeel.
\newblock Domain Randomization for Transferring Deep Neural Networks from
Simulation to the Real World.
\newblock \emph{IROS}, 2017. \url{https://arxiv.org/abs/1703.06907}

\bibitem{cubuk2019autoaugment}
E. D. Cubuk, B. Zoph, D. Mane, V. Vasudevan, Q. V. Le.
\newblock AutoAugment: Learning Augmentation Strategies From Data.
\newblock \emph{CVPR}, 2019. \url{https://arxiv.org/abs/1805.09501}

\bibitem{brohan2022rt1}
A. Brohan et al.
\newblock RT-1: Robotics Transformer for Real-World Control at Scale.
\newblock \emph{RSS}, 2023. \url{https://arxiv.org/abs/2212.06817}

\end{thebibliography}

\end{document}