From b9bbcb674371edf912cb296b400bb295d3de2740 Mon Sep 17 00:00:00 2001 From: lusxvr Date: Tue, 11 Nov 2025 17:17:53 +0100 Subject: [PATCH] Minor text fixes --- main.bib | 15 ++++++--- sections/01_introduction.tex | 24 +++++++------- sections/02_classic_robotics.tex | 14 ++++---- sections/03_reinforcement_learning.tex | 46 +++++++++++++------------- sections/04_imitation_learning.tex | 40 +++++++++++----------- sections/05_foundation_models.tex | 18 +++++----- sections/07_conclusions.tex | 2 +- sections/A_foreword.tex | 2 +- 8 files changed, 83 insertions(+), 78 deletions(-) diff --git a/main.bib b/main.bib index c1ae160..026834d 100644 --- a/main.bib +++ b/main.bib @@ -62,7 +62,8 @@ @article{aldacoALOHA2Enhanced title = {{{ALOHA}} 2: {{An Enhanced Low-Cost Hardware}} for {{Bimanual Teleoperation}}}, author = {Aldaco, Jorge and Armstrong, Travis and Baruch, Robert and Bingham, Jeff and Chan, Sanky and Dwibedi, Debidatta and Finn, Chelsea and Florence, Pete and Goodrich, Spencer and Gramlich, Wayne and Herzog, Alexander and Hoech, Jonathan and Nguyen, Thinh and Storz, Ian and Tabanpour, Baruch and Tompson, Jonathan and Wahid, Ayzaan and Wahrburg, Ted and Xu, Sichun and Yaroshenko, Sergey and Zhao, Tony Z}, langid = {english}, - file = {/Users/fracapuano/Zotero/storage/LDEJG62Q/Aldaco et al. - ALOHA 2 An Enhanced Low-Cost Hardware for Bimanual Teleoperation.pdf} + file = {/Users/fracapuano/Zotero/storage/LDEJG62Q/Aldaco et al. - ALOHA 2 An Enhanced Low-Cost Hardware for Bimanual Teleoperation.pdf}, + year = {2024} } @article{alizadehComprehensiveSurveySpace2024, @@ -906,14 +907,16 @@ @article{kingma2013auto @misc{knightStandardOpenSO100, title = {Standard {{Open SO-100}} \& {{SO-101 Arms}}}, - author = {Knight, Rob and Kooijmans, Pepijn and Wolf, Thomas and Alibert, Simon and Aractingi, Michel and Aubakirova, Dana and Zouitine, Adil and Martino, Russi and Palma, Steven and Pascal, Caroline and Cadene, Remi} + author = {Knight, Rob and Kooijmans, Pepijn and Wolf, Thomas and Alibert, Simon and Aractingi, Michel and Aubakirova, Dana and Zouitine, Adil and Martino, Russi and Palma, Steven and Pascal, Caroline and Cadene, Remi}, + year = {2024} } @article{koberReinforcementLearningRobotics, title = {Reinforcement {{Learning}} in {{Robotics}}: {{A Survey}}}, author = {Kober, Jens and Bagnell, J Andrew and Peters, Jan}, langid = {english}, - file = {/Users/fracapuano/Zotero/storage/72PRHGKL/Kober et al. - Reinforcement Learning in Robotics A Survey.pdf} + file = {/Users/fracapuano/Zotero/storage/72PRHGKL/Kober et al. - Reinforcement Learning in Robotics A Survey.pdf}, + year = {2013} } @inproceedings{kong2024audioflam, @@ -1869,12 +1872,14 @@ @misc{teamGemma2Improving2024 @misc{tedrakeRoboticManipulationPerception, title = {Robotic {{Manipulation}}. {{Perception}}, {{Planning}} and {{Control}}.}, - author = {Tedrake, Russ} + author = {Tedrake, Russ}, + year = {2025} } @misc{tedrakeUnderactuatedRoboticsAlgorithms, title = {Underactuated {{Robotics}}. {{Algorithms}} for {{Walking}}, {{Running}}, {{Swimming}}, {{Flying}}, and {{Manipulation}}}, - author = {Tedrake, Russ} + author = {Tedrake, Russ}, + year = {2024} } @article{thrunPROBABILISTICROBOTICS, diff --git a/sections/01_introduction.tex b/sections/01_introduction.tex index f440de3..9d04fe8 100644 --- a/sections/01_introduction.tex +++ b/sections/01_introduction.tex @@ -11,8 +11,8 @@ \section{Introduction} Consequently, the field of robotics has been widely studied since its first inception in the 1950s. 
Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems. -The frontier of robotics research is indeed increasingly moving away from classical model-based control paradigm, embracing the advancements made in ML, aiming to unlock (1) monolithic perception-to-action control pipelines and (2) multi-modal data-driven feature extraction strategies, together with (3) reduced reliance on precise models of the world and (4) a better positioning to benefit from the growing availability of open robotics data. -While central problems in manipulation, locomotion and whole-body control demand knowledge of rigid-body dynamics, contact modeling, planning under uncertainty, recent results seem to indicate learning can prove just as effective as explicit modeling, sparking interest in the field of \emph{robot learning}. +The frontier of robotics research is indeed increasingly moving away from the classical model-based control paradigm, embracing the advancements made in ML, aiming to unlock (1) monolithic perception-to-action control pipelines and (2) multi-modal data-driven feature extraction strategies, together with (3) reduced reliance on precise models of the world and (4) a better positioning to benefit from the growing availability of open robotics data. +While central problems in manipulation, locomotion and whole-body control demand knowledge of rigid-body dynamics, contact modeling, and planning under uncertainty, recent results seem to indicate learning can prove just as effective as explicit modeling, sparking interest in the field of \emph{robot learning}. This interest can be largely justified considering the significant challenges related to deriving accurate models of robot-environment interactions. Moreover, since end-to-end learning on ever-growing collections of text and image data has historically been at the core of the development of \emph{foundation models} capable of semantic reasoning across multiple modalities (images, text, audio, etc.), deriving robotics methods grounded in learning appears particularly consequential, especially as the number of openly available datasets continues to grow. @@ -26,18 +26,18 @@ \section{Introduction} \lerobot~also supports many state-of-the-art (SOTA) algorithms in robot learning---mainly based on Reinforcement Learning (RL) and Behavioral Cloning (BC) techniques---with efficient implementations in PyTorch, and extended support for experimentation and experiment tracking. Lastly, \lerobot~defines a custom, optimized inference stack for robotic policies decoupling action planning from action execution, proving effective in guaranteeing more adaptability at runtime. -This tutorial serves the double purpose of providing useful references for the Science behind---and practical use of---common robot learning techniques. +This tutorial serves the double purpose of providing useful references for the science behind---and practical use of---common robot learning techniques. To this aim, we strive to provide a rigorous yet concise overview of the core concepts behind the techniques presented, paired with practical examples of how to use such techniques concretely, with code examples in \lerobot, for researchers and practitioners interested in the field of robot learning.
This tutorial is structured as follows: \begin{itemize} \item Section~\ref{sec:classical} reviews classical robotics foundations, introducing the limitations of dynamics-based approaches to robotics. -\item Section~\ref{sec:learning-rl} elaborates on the limitations of dynamics-based methods, and introduce RL as a practical approach to solve robotics problems, considering its upsides and potential limitations. +\item Section~\ref{sec:learning-rl} elaborates on the limitations of dynamics-based methods, and introduces RL as a practical approach to solve robotics problems, considering its upsides and potential limitations. \item Section~\ref{sec:learning-imitation} further describes robot learning techniques that aim at single-task learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations. \item Section~\ref{sec:learning-foundation} presents recent contributions on developing generalist models for robotics applications, by learning from large corpora of multi-task \& multi-robot data (\emph{robotics foundation models}). % \item Lastly, Section~\ref{sec:extensions} covers emerging directions in robot learning research, introducing recent works in post-training techniques for robotics foundation models, as well as recent works in world models for robotics. \end{itemize} -Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. +Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using \lerobot, and start here by presenting the dataset format introduced with \lerobot. \subsection{\lerobotdataset} @@ -47,8 +47,8 @@ \subsection{\lerobotdataset} \lerobotdataset~also accommodates storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which both image and robot-state streams are recorded. In this, \lerobotdataset~provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. -\lerobotdataset~can be easily extended by users and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in \lerobot, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. -This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users.
+\lerobotdataset~can be easily extended and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in \lerobot, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arms and hands, as well as entirely simulation-based datasets, and self-driving cars. +This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use. \subsubsection{The dataset class design} Datasets are always organized into three main components: \begin{itemize} -\item \textbf{Tabular Data}: Low-dimensional, high-frequency data such as joint states, and actions are stored in efficient memory-mapped files, and typically offloaded to the more mature \texttt{datasets} library by Hugging Face, providing fast with limited memory consumption. -\item \textbf{Visual Data}: To handle large volumes of camera data, frames are concatenated and encoded into MP4 files. Frames from the same episode are always grouped together into the same video, and multiple videos are grouped together by camera. To reduce stress on the file system, groups of videos for the same camera view are also broke into multiple sub-directories, after a given threshold number. -\item \textbf{Metadata} A collection of JSON files which describes the dataset's structure in terms of its metadata, serving as the relational counterpart to both the tabular and visual dimensions of data. Metadata include the different feature schema, frame rates, normalization statistics, and episode boundaries. +\item \textbf{Tabular Data}: Low-dimensional, high-frequency data such as joint states and actions are stored in efficient memory-mapped files, and typically offloaded to the more mature \texttt{datasets} library by Hugging Face, providing fast access with limited memory consumption. +\item \textbf{Visual Data}: To handle large volumes of camera data, frames are concatenated and encoded into MP4 files. Frames from the same episode are always grouped together into the same video, and multiple videos are grouped together by camera. To reduce stress on the file system, groups of videos for the same camera view are also broken up into multiple sub-directories, after a given threshold number. +\item \textbf{Metadata}: A collection of JSON files which describe the dataset's structure in terms of its metadata, serving as the relational counterpart to both the tabular and visual dimensions of data. Metadata includes the different feature schema, frame rates, normalization statistics, and episode boundaries. \end{itemize} For scalability, and to support datasets with potentially millions of trajectories (resulting in hundreds of millions or billions of individual camera frames), we merge data from different episodes into the same high-level structure. @@ -70,7 +70,7 @@ \subsubsection{The dataset class design} \item \texttt{meta/info.json}: This is the central metadata file. It contains the complete dataset schema, defining all features (e.g., \texttt{observation.state}, \texttt{action}), their shapes, and data types. It also stores crucial information like the dataset's frames-per-second (\texttt{fps}), \lerobot's version at the time of capture, and the path templates used to locate data and video files.
\item \texttt{meta/stats.json}: This file stores aggregated statistics (mean, std, min, max) for each feature across the entire dataset, used for data normalization for most policy models and accessible externally via \texttt{dataset.meta.stats}. \item \texttt{meta/tasks.jsonl}: This file contains the mapping from natural language task descriptions to integer task indices, which are useful for task-conditioned policy training. -\item \texttt{meta/episodes/*} This directory contains metadata about each individual episode, such as its length, the corresponding task, and pointers to where its data is stored in the dataset's files. For scalability, this information is stored in files rather than a single large JSON file. +\item \texttt{meta/episodes/*}: This directory contains metadata about each individual episode, such as its length, the corresponding task, and pointers to where its data is stored in the dataset's files. For scalability, this information is stored in multiple files rather than a single large JSON file. \item \texttt{data/*}: Contains the core frame-by-frame tabular data, using parquet files to allow for fast, memory-mapped access. To improve performance and handle large datasets, data from multiple episodes are concatenated into larger files. These files are organized into chunked subdirectories to keep the size of directories manageable. A single file typically contains data for more than one single episode. \item \texttt{videos/*}: Contains the MP4 video files for all visual observation streams. Similar to the \texttt{data/} directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems. \end{itemize} @@ -80,7 +80,7 @@ \subsection{Code Example: Batching a (Streaming) Dataset} This section provides an overview of how to access datasets hosted on Hugging Face using the \lerobotdataset~class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction. -In practice, most reinforcement learning (RL) and behavioral cloning (BC) algorithms tend to operate on stack of observation and actions. +In practice, most reinforcement learning (RL) and behavioral cloning (BC) algorithms tend to operate on stacks of observations and actions. For the sake of brevity, we will refer to joint-space readings and camera frames with the single term \emph{frame}. For instance, RL algorithms may use a history of previous frames \(o_{t-H_o:t} \) to mitigate partial observability, and BC algorithms are in practice trained to regress chunks of multiple actions (\(a_{t:t+H_a} \)) rather than single controls. To accommodate these specifics of robot learning training, \lerobotdataset~provides a native windowing operation, whereby users can define the \emph{seconds} of a given window (before and after) around any given frame, by using the \texttt{delta\_timestamps} functionality.
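As a minimal illustration of this windowing mechanism, the sketch below loads \url{lerobot/svla_so101_pickplace} with a \texttt{delta\_timestamps} specification and wraps it in a standard PyTorch \texttt{DataLoader}. The exact import path, feature keys, frame rate and returned shapes depend on the installed \lerobot~version and on the dataset at hand, so this should be read as a sketch rather than a verbatim recipe.
\begin{verbatim}
# Sketch: load a LeRobotDataset with a temporal window around each frame.
# Import path and feature keys may differ across lerobot versions/datasets.
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

fps = 30  # assumed recording rate; in practice read it from dataset.meta
delta_timestamps = {
    # 0.5 s of past proprioceptive states, plus the current frame
    "observation.state": [-0.5 + i / fps for i in range(int(0.5 * fps) + 1)],
    # a chunk of future actions covering roughly 1 s after the current frame
    "action": [i / fps for i in range(int(1.0 * fps))],
}

dataset = LeRobotDataset("lerobot/svla_so101_pickplace",
                         delta_timestamps=delta_timestamps)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

batch = next(iter(loader))
# Expected shapes for the window above (dimensions depend on the dataset):
# batch["observation.state"] -> (32, 16, state_dim)
# batch["action"]            -> (32, 30, action_dim)
\end{verbatim}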
diff --git a/sections/02_classic_robotics.tex b/sections/02_classic_robotics.tex index ea21f44..2e46f2e 100644 --- a/sections/02_classic_robotics.tex +++ b/sections/02_classic_robotics.tex @@ -4,7 +4,7 @@ \section{Classical Robotics} \epigraph{\textit{Know your enemy} [...]}{Sun Tzu} \begin{tldr} -Learning-based approaches to robotics are motivated by the need to (1) generalize across tasks and embodiments (2) reduce dependency on human expertise (3) leverage historical trends on the production of data---all traditionally overlooked by dynamics-based techniques. +Learning-based approaches to robotics are motivated by the need to (1) generalize across tasks and embodiments, (2) reduce dependency on human expertise, and (3) leverage historical trends on the production of data---all traditionally overlooked by dynamics-based techniques. \end{tldr} \subsection{Explicit and Implicit Models} @@ -16,7 +16,7 @@ \subsection{Explicit and Implicit Models} \label{fig:generating-motion-atlas} \end{figure} -Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. +Robotics is concerned with producing artificial motion in the physical world in a useful, reliable, and safe fashion. Thus, robotics is an inherently multi-disciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory, has therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines~\citep{connellRobotLearning1993}. @@ -24,7 +24,7 @@ \subsection{Explicit and Implicit Models} Methods to produce robotics motion range from traditional \emph{explicit} models---\highlight{dynamics-based}\footnote{In here, we refer to both \emph{kinematics} and \emph{dynamics}-based control.} methods, leveraging precise descriptions of the mechanics of robots' rigid bodies and their interactions with eventual obstacles in the environment---to \emph{implicit} models---\highlight{learning-based} methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings~\citep{agrawalComputationalSensorimotorLearning,bekrisStateRobotMotion2024}. A variety of methods have been developed between these two extrema. -For instance, ~\citet{hansenTemporalDifferenceLearning2022} show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning~\citet{suttonReinforcementLearningIntroduction2018} with Model-Predictive Control (MPC). +For instance,~\citet{hansenTemporalDifferenceLearning2022} show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning~\citep{suttonReinforcementLearningIntroduction2018} with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic---or even unrealistic---in practice, learning can prove effective to improve modeling of complex phenomena or complement perception~\citep{mccormacSemanticFusionDense3D2016}.
Such examples aim at demonstrating the richness of approaches to robotics, and Figure~\ref{fig:generating-motion-atlas} graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to~\citet{bekrisStateRobotMotion2024} for a more comprehensive overview of both general and application-specific methods for motion generation. @@ -47,7 +47,7 @@ \subsection{Different Types of Motion} Motions like (2) may result in changes in the robot's physical location within its environment. Generally, modifications to a robot's location within its environment may be considered instances of the general \emph{locomotion} problem, further specified as \emph{wheeled} or \emph{legged} locomotion based on whether a robot makes use of wheels or leg(s) to move in the environment. Lastly, an increased level of dynamism in the robot-environment interactions can be obtained combining (1) and (2), thus designing systems capable of interacting with \emph{and} moving within their environment. -This category is problems is typically termed \emph{mobile manipulation}, and is characterized by a typically much larger set of control variables compared to either locomotion or manipulation alone. +This category of problems is typically termed \emph{mobile manipulation}, and is characterized by a typically much larger set of control variables compared to either locomotion or manipulation alone. % Focus on learning-based approaches and manipulation The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. @@ -158,7 +158,7 @@ \subsection{Example: Planar Manipulation} Let \( J(q) \) denote the Jacobian matrix of (partial) derivatives of the FK-function \( f_\FK: \mathcal Q \mapsto \mathcal P \), such that \( J(q) = \frac{\partial f_{FK}(q)}{\partial q } \). Then, one can apply the chain rule to any \( p(q) = f_{\FK}(q) \), deriving \( \dot p = J(q) \dot q \), and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control. -Given a desired end-effector trajectory \( \targetvel(t) \) (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds \( \dot q(t) \) solving for joints' \emph{velocities} instead of \emph{configurations}, +Given a desired end-effector trajectory \( \targetvel(t) \), (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds \( \dot q(t) \) solving for joints' \emph{velocities} instead of \emph{configurations}, \begin{align} \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \targetvel (t) \rVert_2^2 \label{eq:reg_ik_velocity} \end{align} @@ -187,7 +187,7 @@ \subsubsection{Adding Feedback Loops} Less predictable disturbances however (e.g., \( \dot x_B \leftarrow \dot x_B + \eps, \eps \sim N(0,1) \)) may prove challenging to model analytically, and one could attain the same result of preventing link-object collision by adding a condition on the distance between the midpoint of \( l \) and \( x_B \), enforced through a feedback loop on the position of the robot and object at each control cycle. -To mitigate the effect of modeling errors, sensing noise and other disturbances, classical pipelines indeed do augment diff-IK with feedback control looping back quantities of interest.
+To mitigate the effect of modeling errors, sensing noise and other disturbances, classical pipelines indeed augment diff-IK with feedback control, looping back quantities of interest. In practice, following a trajectory with a closed feedback loop might consist in feeding back the error between the target and measured pose, \( \Delta p = \targetpos - p(q) \), thereby modifying the control applied to \( \dot q = J(q)^+ (\targetvel + k_p \Delta p ) \), with \( k_p \) defined as the (proportional) gain. More advanced control techniques such as feedback linearization, PID control, the Linear Quadratic Regulator (LQR) or Model-Predictive Control (MPC) can be employed to stabilize tracking and reject moderate perturbations, and we refer to \citet[Chapter~8]{sicilianoSpringerHandbookRobotics2016} for an in-depth explanation of these concepts, or \citep[Chapter~8]{tedrakeRoboticManipulationPerception} for a simple, intuitive example in the case of a point-mass system. @@ -224,4 +224,4 @@ \subsection{Limitations of Dynamics-based Robotics} The curation of academic datasets by large centralized groups of human experts in robotics~\citep{oneillOpenXEmbodimentRobotic2025, khazatskyDROIDLargeScaleInTheWild2025} is now increasingly complemented by a \highlight{growing number of robotics datasets contributed in a decentralized fashion} by individuals with varied expertise. Except tangentially, dynamics-based approaches are not poised to maximally benefit from this trend, which holds the promise of allowing generalization in the space of tasks and embodiments, just as data was the cornerstone of advancements in vision~\citep{alayracFlamingoVisualLanguage2022} and natural-language understanding~\citep{brownLanguageModelsAre2020}. -Taken together, these limitations (Figure~\ref{fig:classical-limitations}) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available. \ No newline at end of file +Taken together, these limitations (Figure~\ref{fig:classical-limitations}) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions, and (3) scale gracefully in performance as more robotics data becomes available. \ No newline at end of file diff --git a/sections/03_reinforcement_learning.tex b/sections/03_reinforcement_learning.tex index 92791fe..b90e9ed 100644 --- a/sections/03_reinforcement_learning.tex +++ b/sections/03_reinforcement_learning.tex @@ -17,7 +17,7 @@ \section{Robot (Reinforcement) Learning} \end{figure} Learning-based techniques for robotics naturally address the limitations presented in Section~\ref{sec:classical} (Figure~\ref{fig:robot-learning-upsides}). -In particular, learning-based techniques typically rely on monolithich prediction-to-action pipelines (\emph{visuomotor policies}) which do directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. +In particular, learning-based techniques typically rely on monolithic perception-to-action pipelines (\emph{visuomotor policies}) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components.
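To make the notion of a monolithic visuomotor policy concrete, the illustrative sketch below maps a camera frame and a proprioceptive reading to an action with a single network; the architecture and dimensions are placeholder choices rather than the design of any specific policy discussed in this tutorial.
\begin{verbatim}
# Illustrative sketch of a monolithic visuomotor policy: one network maps
# raw sensory inputs (image + proprioception) to an action, with no
# hand-designed perception, planning, or control interfaces in between.
import torch
import torch.nn as nn

class VisuomotorPolicy(nn.Module):
    def __init__(self, state_dim: int = 6, action_dim: int = 6):
        super().__init__()
        self.encoder = nn.Sequential(          # learned feature extraction
            nn.Conv2d(3, 16, 5, stride=2), nn.ReLU(),
            nn.Conv2d(16, 32, 5, stride=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        self.head = nn.Sequential(             # fuse modalities, predict action
            nn.Linear(32 + state_dim, 128), nn.ReLU(),
            nn.Linear(128, action_dim),
        )

    def forward(self, image: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
        features = self.encoder(image)
        return self.head(torch.cat([features, state], dim=-1))

policy = VisuomotorPolicy()
action = policy(torch.rand(1, 3, 96, 96), torch.rand(1, 6))  # shape (1, 6)
\end{verbatim}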
Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data---an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (\emph{robot learning}) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches. @@ -25,7 +25,7 @@ \section{Robot (Reinforcement) Learning} Being a field at a relatively nascent stage, no prevalent technique(s) proves distinctly better than any other in the domain of robot learning. Still, two major classes of methods gained prominence: \highlight{Reinforcement Learning (RL)} and \highlight{Behavioral Cloning (BC)} (Figure~\ref{fig:robot-learning-atlas}). In this section, we provide a conceptual overview of applications of RL to robotics, as well as introduce practical examples of how to use RL within \lerobot. -We then introduce the major limitations RL suffers from, to introduce BC techniques in Section~\ref{sec:learning-imitation} and Section~{sec:learning-foundation}. +We then introduce the major limitations RL suffers from, to introduce BC techniques in Section~\ref{sec:learning-imitation} and Section~\ref{sec:learning-foundation}. \begin{wrapfigure}[23]{r}{0.3\textwidth} \vspace{-\intextsep} @@ -69,7 +69,7 @@ \subsection{A (Concise) Introduction to RL} Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP)~\citep{bellmanMarkovianDecisionProcess1957}. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through the MDP's inherently stochastic formulation and (2) providing a theoretically-sound framework for learning \emph{without} an explicit model of the environment dynamics. -While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete \emph{timestep} \( t=0,1,2,3, \dots, T \). +While also accommodating a continuous time formulation, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete \emph{timesteps} \( t=0,1,2,3, \dots, T \). MDPs allowing for an unbounded number of interactions (\( T \to + \infty \)) are termed \emph{infinite-horizon}, as opposed to \emph{finite-horizon} MDPs in which \( T \) is finite. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (\emph{episodic}) MDPs. @@ -102,12 +102,12 @@ \subsection{A (Concise) Introduction to RL} \mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P \transitiongiven \ \mathbb P(\action \vert \state). \end{equation} -Policies \( \mathbb P(\action \vert \state) \) are typically indicated as \( \pi(\action \vert \state) \), often parametrized via \( \theta \), yielding \( \pi_\theta (\action \vert \state )\), and are traine by optimizing the (discounted) \emph{return} associated to a given \( \tau \), i.e.
the (random) sum of measured rewards over an arbitrary trajectory, +Policies \( \mathbb P(\action \vert \state) \) are typically indicated as \( \pi(\action \vert \state) \), often parametrized via \( \theta \), yielding \( \pi_\theta (\action \vert \state )\), and are trained by optimizing the (discounted) \emph{return} associated to a given \( \tau \), i.e. the (random) sum of measured rewards over an arbitrary trajectory, \[ G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t. \] In that, agents seek to learn control strategies (\emph{policies}, \( \pi_\theta \)) maximizing the expected return \( \mathbb E_{\tau \sim \pi_\theta} G(\tau) \). -For a given dynamics \( \mathcal D \)---i.e., for a given problem---taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies \( \Pi \), yielding the (maximization) target \( J : \Pi \mapsto \mathbb R \) +For given dynamics \( \mathcal D \)---i.e., for a given problem---taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies \( \Pi \), yielding the (maximization) target \( J : \Pi \mapsto \mathbb R \) \begin{align} J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} \left[ G(\tau) \right], \label{eq:RL-j-function} \\ \mathbb P_{\theta; \mathcal D} (\tau) &= \rho \prod_{t=0}^{T-1} \mathcal D \transition \ \pi_\theta (\action \vert \state).\label{eq:traj-probabilities-for-policies} \end{align} @@ -141,8 +141,8 @@ \subsection{A (Concise) Introduction to RL} \label{fig:rl-algos-atlas} \end{figure} -Popular approaches to continuous state and action space---such as those studied within robotics---include~\citet[TRPO]{schulmanTrustRegionPolicy2017},~\citet[PPO]{ schulmanProximalPolicyOptimization2017} and~\citet[SAC]{ haarnojaSoftActorCriticOffPolicy2018}. -Across manipulation~\citep{akkayaSolvingRubiksCube2019} and locomotion problems~\citep{leeLearningQuadrupedalLocomotion2020}, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensory streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. +Popular approaches to continuous state and action spaces---such as those studied within robotics---include TRPO~\citep{schulmanTrustRegionPolicy2017}, PPO~\citep{schulmanProximalPolicyOptimization2017} and SAC~\citep{haarnojaSoftActorCriticOffPolicy2018}. +Across manipulation~\citep{akkayaSolvingRubiksCube2019} and locomotion problems~\citep{leeLearningQuadrupedalLocomotion2020}, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to~\citet{koberReinforcementLearningRobotics,tangDeepReinforcementLearning2025}.
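As a minimal numerical illustration of these quantities, the sketch below computes the Monte-Carlo return \( G(\tau) \) of a single trajectory and the basic score-function (REINFORCE) surrogate loss whose gradient approximates \( \nabla_\theta J(\pi_\theta) \); the algorithms cited above (TRPO, PPO, SAC) build considerably more machinery on top of this simple estimator, so this is a didactic sketch rather than an implementation of any of them.
\begin{verbatim}
# Sketch: Monte-Carlo return G(tau) and the REINFORCE surrogate loss,
# whose gradient approximates grad J(pi_theta) for one sampled trajectory.
import torch

def discounted_return(rewards: list, gamma: float = 0.99) -> float:
    """G(tau) = sum_t gamma^t * r_t for a single trajectory."""
    return sum(gamma**t * r for t, r in enumerate(rewards))

def reinforce_loss(log_probs: torch.Tensor, rewards: list,
                   gamma: float = 0.99) -> torch.Tensor:
    """Surrogate loss -(sum_t log pi(a_t|s_t)) * G(tau); minimizing it
    performs (stochastic) gradient ascent on J(pi_theta)."""
    return -(log_probs.sum() * discounted_return(rewards, gamma))

# toy usage with dummy log-probabilities gathered while rolling out pi_theta
log_probs = torch.log(torch.tensor([0.8, 0.6, 0.9], requires_grad=True))
loss = reinforce_loss(log_probs, rewards=[0.0, 0.0, 1.0])
loss.backward()  # gradients flow back into the policy parameters
\end{verbatim}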
\subsection{Real-world RL for Robotics} @@ -167,8 +167,8 @@ \subsection{Real-world RL for Robotics} Training RL policies in simulation~\citep{tobinDomainRandomizationTransferring2017} addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (\emph{reality gap}, Figure~\ref{fig:synthetic-vs-real-duck}). \emph{Domain randomization}~\citep{tobinDomainRandomizationTransferring2017} (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. -In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performace sim-to-real transferred policies~\citep{akkayaSolvingRubiksCube2019,antonovaReinforcementLearningPivoting2017,jiDribbleBotDynamicLegged2023}. -In practice, DR is performed training in simulation on simulated dynamics \( \mathcal D \), further parametrized as \( \mathcal D \equiv \mathcal D_\xi \), with a \emph{dynamics} (random) vector \( \xi \) drawn an arbitrary distribution, \( \xi \sim \Xi \). +In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies~\citep{akkayaSolvingRubiksCube2019,antonovaReinforcementLearningPivoting2017,jiDribbleBotDynamicLegged2023}. +In practice, DR is performed by training in simulation on simulated dynamics \( \mathcal D \), further parametrized as \( \mathcal D \equiv \mathcal D_\xi \), with a \emph{dynamics} (random) vector \( \xi \) drawn from an arbitrary distribution, \( \xi \sim \Xi \). For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure~\ref{fig:ducks-on-terrains}), or the center of mass of an object for a manipulation task. Over the course of training---typically at each episode's reset---a new \( \xi \) is drawn, and used to specify the environment's dynamics for that episode. @@ -188,9 +188,9 @@ \subsection{Real-world RL for Robotics} On the other hand, excessive randomization may cause over-regularization and hinder performance~\citep{margolisRapidLocomotionReinforcement2022}. Consequently, the research community investigated approaches to automatically select the randomization distribution \( \Xi \), using signals from the training process or tuning it to reproduce observed real-world trajectories. \citet{akkayaSolvingRubiksCube2019} use a parametric uniform distribution \( \mathcal U(a, b) \) as \( \Xi \), widening the bounds \( a, b \) as training progresses and the agent's performance improves (AutoDR). -While effective, AutoDR requires significant tuning---the bounds are widened by a fixed, pre-specified amount \( \Delta \) along---and may disregard data when performance \emph{does not} improve after a distribution update~\citep{tiboniDomainRandomizationEntropy2024}.
\citet{tiboniDomainRandomizationEntropy2024} propose a similar method to AutoDR (DORAEMON) to evolve \( \Xi \) based on the training signal, but with the key difference of explicitly maximizing the entropy of a parametric Beta distribution---inherently more flexible than uniform distributions---with learned updates instead of fixed \( \Delta \). +While effective, AutoDR requires significant tuning---the bounds are widened by a fixed, pre-specified amount \( \Delta \)---and may disregard data when performance \emph{does not} improve after a distribution update~\citep{tiboniDomainRandomizationEntropy2024}. \citet{tiboniDomainRandomizationEntropy2024} propose a similar method to AutoDR (DORAEMON) to evolve \( \Xi \) based on the training signal, but with the key difference of explicitly maximizing the entropy of a parametric Beta distribution---inherently more flexible than uniform distributions---with learned updates instead of a fixed \( \Delta \). In this, DORAEMON proves particularly effective at dynamically increasing the entropy levels of the training distribution by employing an outer-loop max-entropy objective, tackled under performance constraints in the inner-loop RL problem. -Other approaches to automatically perform DR consist in specifically tuning \( \Xi \) to align as much as possible the simulation and real-world domains. +Other approaches to automatically perform DR consist in specifically tuning \( \Xi \) to align the simulation and real-world domains as much as possible. For instance,~\citet{chebotarClosingSimtorealLoop2019} interleave in-simulation policy training with repeated real-world policy rollouts used to adjust \( \Xi \) based on real-world data, while~\citet{tiboniDROPOSimtoRealTransfer2023} leverage a single, pre-collected set of real-world trajectories and tune \( \Xi \) under a simple likelihood objective. While DR has shown promise, it does not address the main limitation that, even under the assumption that an ideal distribution \( \Xi \) was available, many robotics problems \highlight{cannot be simulated with high-enough fidelity under practical computational constraints}. @@ -203,7 +203,7 @@ \subsection{Real-world RL for Robotics} To make the most of (1) the growing number of openly available datasets and (2) relatively inexpensive robots like the SO-100, RL could (1) be anchored in already-collected trajectories---limiting erratic and dangerous exploration---and (2) train in the real-world directly---bypassing the aforementioned issues with low-fidelity simulations. In such a context, sample-efficient learning is also paramount, as training on the real-world is inherently time-bottlenecked. -Off-policy algorithms like Soft Actor-Critic (SAC)~\citep{haarnojaSoftActorCriticOffPolicy2018} tend to be more sample efficient then their on-policy counterpart~\citep{schulmanProximalPolicyOptimization2017}, due to the presence a \emph{replay buffer} used over the course of training. +Off-policy algorithms like Soft Actor-Critic (SAC)~\citep{haarnojaSoftActorCriticOffPolicy2018} tend to be more sample efficient than their on-policy counterpart~\citep{schulmanProximalPolicyOptimization2017}, due to the presence of a \emph{replay buffer} used over the course of training. Other than allowing the re-use of past transitions \( \sars \), the replay buffer can also accommodate the injection of previously-collected data in the training process~\citep{ballEfficientOnlineReinforcement2023}.
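The sketch below illustrates this mechanism under simplifying assumptions: two buffers of \( \sars \) transitions are kept, the offline one pre-filled with previously collected data, and each training batch is drawn in equal proportion from both, in the spirit of the symmetric sampling of \citet{ballEfficientOnlineReinforcement2023}. Buffer capacities, batch sizes and the \texttt{demo\_transitions} placeholder are illustrative.
\begin{verbatim}
# Sketch of "injecting prior data" into off-policy training: keep an online
# and an offline buffer of (s, a, r, s', done) transitions, and draw each
# training batch half-and-half from the two.
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity: int = 100_000):
        self.storage = deque(maxlen=capacity)

    def add(self, transition):            # transition = (s, a, r, s_next, done)
        self.storage.append(transition)

    def sample(self, n: int):
        return random.sample(self.storage, n)

online_buffer = ReplayBuffer()
offline_buffer = ReplayBuffer()
# offline_buffer would be pre-filled from previously collected data, e.g.
# for transition in demo_transitions: offline_buffer.add(transition)

def sample_mixed_batch(batch_size: int = 256):
    half = batch_size // 2                # equal proportion online / offline
    return offline_buffer.sample(half) + online_buffer.sample(half)
\end{verbatim}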
Using expert demonstrations to guide learning together with learned rewards, RL can be effectively carried out in the real-world~\citep{luoSERLSoftwareSuite2025}. Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours~\citep{luoPreciseDexterousRobotic2024}. @@ -229,8 +229,8 @@ \subsection{Real-world RL for Robotics} \[ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \left[ r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q_i (s_{t+1}, a_{t+1}) \big\vert s_t, a_t \right], \quad i=0,1,2,\dots,K \] -Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate \( Q_K \approx Q^* \) at each timestep. -Indeed, one can show that under certain assumptions on the MDP considered, \( Q_K \to Q^* \, \text{as } K \to \infty \). +Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing the final (ideally, near-optimal) estimate \( Q_K \approx Q^* \) over the action space at each timestep. +Indeed, one can show that, under certain assumptions on the MDP, \( Q_K \to Q^* \, \text{as } K \to \infty \). Effective in its early applications to small-scale discrete problems, vanilla Q-learning proved difficult to scale to large \( \statespace \times \actionspace \) problems, in which storing \( Q : \statespace \times \actionspace \mapsto \mathbb R \) alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for \emph{continuous}, unstructured state-action space MDPs, such as those considered in robotics. @@ -244,7 +244,7 @@ \subsection{Real-world RL for Robotics} y_i &= \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{\action \in \mathcal A} Q_{\theta_{i-1}} (\stateplusone, a_{t+1}) \big], \label{eq:TD-target} \end{align} where \( \chi \) represents a behavior distribution over state-action pairs. -Crucially, \( \chi \) can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a \emph{replay buffer} \( D \) in the form of \( \sars \) transitions, used to form the TD-target \( y_i \), TD-error \( \delta_i \) and loss function eq.~\ref{eq:dqn-loss} via Monte-Carlo (MC) estimates. +Crucially, \( \chi \) can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a \emph{replay buffer} \( D \) in the form of \( \sars \) transitions, used to form the TD-target \( y_i \), TD-error \( \delta_i \), and loss function (eq.~\ref{eq:dqn-loss}) via Monte-Carlo (MC) estimates. While effective in handling large, unstructured state spaces for discrete action-space problems, DQN's application to continuous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving \( \max_{a_t \in \mathcal A} Q_\theta(s_t, a_t) \) at each timestep is simply unfeasible due to the (1) continuous nature of the action space (\( \actionspace \subset \mathbb R^n \) for some \( n \)) and (2) impossibility to express the policy with a cheap (ideally, even closed-form) formulation, so that \( \max Q_\theta \) could be solved analytically.
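To make the DQN update above concrete in the discrete-action setting it was designed for, the sketch below forms the TD-target \( y_i \) from a replay batch using a frozen target network and regresses the online network onto it with a squared TD-error loss; network sizes and batch contents are illustrative placeholders, not the DQN architecture of the original paper.
\begin{verbatim}
# Sketch of a DQN-style update for a *discrete* action space: form the
# TD-target y = r + gamma * max_a' Q_target(s', a') from a replay batch and
# regress the online network onto it with a squared TD-error loss.
import torch
import torch.nn as nn

def dqn_loss(q_net, target_net, batch, gamma: float = 0.99) -> torch.Tensor:
    s, a, r, s_next, done = (batch[k] for k in ("s", "a", "r", "s_next", "done"))
    q_sa = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)     # Q_theta(s_t, a_t)
    with torch.no_grad():                                    # frozen TD-target
        y = r + gamma * (1 - done) * target_net(s_next).max(dim=1).values
    return nn.functional.mse_loss(q_sa, y)

# toy usage with a 4-dimensional state and 3 discrete actions
q_net, target_net = nn.Linear(4, 3), nn.Linear(4, 3)
batch = {
    "s": torch.rand(8, 4), "a": torch.randint(0, 3, (8,)),
    "r": torch.rand(8), "s_next": torch.rand(8, 4), "done": torch.zeros(8),
}
loss = dqn_loss(q_net, target_net, batch)
\end{verbatim}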
@@ -253,12 +253,12 @@ \subsection{Real-world RL for Robotics} d_\phi = \mathbb E_{s_t \sim \mathbb P (\bullet)} \left[ \nabla_\phi Q(s_t, a_t)\vert_{a_t = \mu_\phi(s_t)} \right] = \mathbb E_{s_t \sim \mathbb P(\bullet)} \left[ \nabla_{a_t} Q(s_t, a_t) \vert_{a_t = \mu_\phi(s_t)} \cdot \nabla_\phi \mu(s_t) \right] \end{equation} Provably, eq.~\ref{eq:deterministic-pg} is the \emph{deterministic policy gradient} (DPG) of the policy \(\mu_\phi \)~\citep{pmlr-v32-silver14}, so that updates \( \phi_{k+1}\leftarrow \phi_k + \alpha d_\phi \) are guaranteed to increase the (deterministic) cumulative discounted reward, \( J(\mu_\phi) \). -~\citet{lillicrapContinuousControlDeep2019a} extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics. +\citet{lillicrapContinuousControlDeep2019a} extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics. DDPG adopts a modified TD-target compared to eq.~\ref{eq:TD-target}, by maintaining a policy network used to select actions, yielding \begin{equation}\label{eq:TD-target-ddpg} y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma Q_{\theta_{i-1}} (\stateplusone, \mu_\phi(\stateplusone)) \big] . \end{equation} -Similarily to DQN, DDPG also employs the same replay buffer mechanism, reusing past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates. +Similarly to DQN, DDPG also employs the same replay buffer mechanism, reusing past transitions over training for increased sample efficiency and estimating the loss function via MC-estimates. Soft Actor-Critic (SAC)~\citep{haarnojaSoftActorCriticOffPolicy2018} is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with \highlight{maximizing the discounted cumulative reward, while acting as randomly as possible}. MaxEnt RL~\citep{haarnojaReinforcementLearningDeep2017b} has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. @@ -291,20 +291,20 @@ \subsection{Real-world RL for Robotics} % RLPD + reward classifier: SERL \paragraph{Sample-efficient, data-driven, real-world RL} Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. -Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely on augmenting propioperceptive inputs with camera streams, and thus even well-defined rewards would need to be defined starting from unstructured observation---a challenging assumption in practice. +Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely on augmenting proprioceptive inputs with camera streams, and thus even well-defined rewards would need to be defined starting from unstructured observations---a challenging assumption in practice. In their technical report,~\citet{luoSERLSoftwareSuite2025} empirically address the needs (1) to define a reward function and (2) to use it starting from unstructured, image observations.
-In particular,~\citet[SERL]{luoSERLSoftwareSuite2025} introduces a suite of tools streamlining training of \emph{reward classifiers} \( c \), as well as jointly learn forward-backward controllers to speed up real-world RL. +In particular, SERL~\citep{luoSERLSoftwareSuite2025} introduces a suite of tools streamlining the training of \emph{reward classifiers} \( c \), as well as the joint learning of forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex, dynamic tasks---e.g., folding a t-shirt---for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success (\(e^+\)) or failure (\(e^-\)) states, rather than from a precise formulation of \( r_t \), with a natural target for the reward classifier being \( r(s) = \log c(e^+ \vert s) \). Furthermore,~\citet{luoSERLSoftwareSuite2025} demonstrate the benefits of learning separate (1) \emph{forward} and (2) \emph{backward} controllers---parametrized by separate policies---where (1) the former learns to execute a task to completion and (2) the latter learns to reset the environment to its initial state from terminal states, thereby aiding training in real-world episodic settings. -Lastly, in order to improve on the robustness of their approach to different goals while maintaing practical scalability,~\citet{luoSERLSoftwareSuite2025} introduced a modified state and action space, expressing proprioperceptive configurations \( q \) and actions \( \dot q \) in the frame of the end-effector pose at \( t=0 \). +Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability,~\citet{luoSERLSoftwareSuite2025} introduced a modified state and action space, expressing proprioceptive configurations \( q \) and actions \( \dot q \) in the frame of the end-effector pose at \( t=0 \). Randomizing the initial pose of the end-effector (\( s_0 \)),~\citet{luoSERLSoftwareSuite2025} achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach. \begin{figure} \centering \includegraphics[width=0.8\linewidth]{figures/ch3/ch3-hil-serl-examples.pdf} - \caption{(A) HIL-SERL allows for real-world training of high performance RL agents by building on top advancements presented by of SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.} + \caption{(A) HIL-SERL allows for real-world training of high-performance RL agents by building on advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.} \label{fig:hil-serl-blocks} \end{figure} @@ -336,7 +336,7 @@ \subsubsection{Code Example: Real-world RL} \item A \texttt{Learner}, used to optimize the policy's parameters \( \theta \) for maximum expected return. The learner samples batches of offline data from online and offline buffers in equal proportion~\citep{ballEfficientOnlineReinforcement2023}, and shares updated parameters with the \texttt{Actor}.
\end{itemize} -The HIL-SERL architecture presented in this example can be exclusively run locally, but the implementation in \lerobot~also allows the \texttt{Actor} and \texttt{Learner} to run on two separate machines connected by the network. +The HIL-SERL architecture presented in this example can be run exclusively locally, but the implementation in \lerobot~also allows the \texttt{Actor} and \texttt{Learner} to run on two separate machines connected by a network. % \paragraph{Learning a Reward Classifier} \begin{pbox}[label={ex:train_reward_classifier}]{Training a Reward Classifier \\ \url{https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/01_reward_classifier.py}} @@ -364,7 +364,7 @@ \subsubsection{Limitations of RL in Real-World Robotics: Simulators and Reward Design} Despite the advancements in real-world RL training, training RL agents for real-world tasks still suffers from the following limitations: \begin{itemize} -\item In those instances where real-world training experience is prohibitively expensive to gather (e.g., Tokamak control~\citep{degraveMagneticControlTokamak2022}, Autonomous Stratospehere Navigation~\citep{bellemareAutonomousNavigationStratospheric2020})in-simulation training is often the only viable option. +\item In those instances where real-world training experience is prohibitively expensive to gather (e.g., Tokamak control~\citep{degraveMagneticControlTokamak2022}, Autonomous Stratospheric Balloon Navigation~\citep{bellemareAutonomousNavigationStratospheric2020}), in-simulation training is often the only viable option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials. \item Reward design is a fundamental source of brittleness in real-world RL pipelines. While shaping dense rewards is often necessary to guide exploration in long-horizon tasks, the process is error-prone and heavily reliant on human expertise and intuition. Poorly tuned terms can lead to specification gaming or convergence to local optima, making reward shaping a critical challenge for applying RL in practice. Sparse rewards that only signal successful trajectories can avoid these pitfalls but typically result in much slower learning due to reduced supervision. diff --git a/sections/04_imitation_learning.tex b/sections/04_imitation_learning.tex index 86c6b55..8dfa9b8 100644 --- a/sections/04_imitation_learning.tex +++ b/sections/04_imitation_learning.tex @@ -15,22 +15,22 @@ \section{Robot (Imitation) Learning} \begin{figure} \centering \includegraphics[width=0.8\textwidth]{figures/ch4/ch4-bc-trajectories.pdf} - \caption{(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in \url{lerobot/svla_so101_pickplace}. Proprioperceptive states provide invaluable to determine the robot's state during an episode. (B) Camera frames are also recorded alongside measurements on the robot's state, capturing information about the robot's interaction with its environment.} + \caption{(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in \url{lerobot/svla_so101_pickplace}. Proprioceptive states prove invaluable in determining the robot's state during an episode.
(B) Camera frames are also recorded alongside measurements on the robot's state, capturing information about the robot's interaction with its environment.} \label{fig:ch4-bc-trajectories} \end{figure} Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section~\ref{sec:learning-rl}. Indeed, especially in real-world robotics, online exploration is typically \highlight{costly and potentially unsafe}, and designing (dense) reward signals is a \highlight{brittle and task-specific} process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets---all factors complicating training RL algorithms on hardware at scale. -Behavioral Cloning (BC) sidesteps these constraints by \highlight{casting control an imitation learning problem}, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. +Behavioral Cloning (BC) sidesteps these constraints by \highlight{casting control as an imitation learning problem}, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by \emph{learning-to-imitate}, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether. Formally, let \( \mathcal D = \{ \tau^{(i)} \}_{i=1}^N \) be a set of expert trajectories, with \( \tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i} \) representing the \(i\)-th length-\(T_i\) trajectory in \( \mathcal D \), \(o_t \in \obsspace \) denoting observations (e.g., images and proprioception altogether), and \(a_t \in \actionspace \) the expert actions. -Typically, observations \( o \in \obsspace \) consist of both image and proprioperceptive information, while actions \( a \in \actionspace \) represent control specifications for the robot to execute, e.g. a joint configuration. +Typically, observations \( o \in \obsspace \) consist of both image and proprioceptive information, while actions \( a \in \actionspace \) represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section~\ref{sec:learning-rl}, in the imitation learning context \( \mathcal D \) denotes an offline dataset collecting \( N \) length-\( T_i \) reward-free (expert) human trajectories \( \tau^{(i)} \), and \emph{not} the environment dynamics. -Similarily, in this section \( \tau^{(i)} \) represent a length-\(T_i\) trajectory of observation-action pairs, which crucially \emph{omits entirely any reward} information. +Similarly, in this section \( \tau^{(i)} \) represents a length-\(T_i\) trajectory of observation-action pairs, which crucially \emph{entirely omits any reward} information. Figure~\ref{fig:ch4-bc-trajectories} graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. -Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. +Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data.
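Before introducing specific architectures, the sketch below shows the simplest possible instantiation of this setup: a small network regressing expert actions from observations with a mean-squared-error loss over \( (o_t, a_t) \) pairs. Observation and action dimensions, the network, and the toy batch are illustrative placeholders rather than the models discussed later in this section.
\begin{verbatim}
# Minimal sketch of behavioral cloning as supervised regression: a policy
# network is fit to expert (observation, action) pairs with an MSE loss.
import torch
import torch.nn as nn

obs_dim, action_dim = 6, 6            # e.g. the 6 joints of an SO-100
policy = nn.Sequential(nn.Linear(obs_dim, 128), nn.ReLU(),
                       nn.Linear(128, action_dim))
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)

def bc_update(obs: torch.Tensor, expert_actions: torch.Tensor) -> float:
    pred = policy(obs)                              # pi_theta(o_t)
    loss = nn.functional.mse_loss(pred, expert_actions)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# toy batch standing in for (o_t, a_t) pairs drawn from the demonstrations
loss_value = bc_update(torch.rand(64, obs_dim), torch.rand(64, action_dim))
\end{verbatim}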
Figure~\ref{fig:ch4-observation-action-mapping} shows \( (o_t, a_t) \)-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation.

 In principle, (expert) trajectories \( \tau^{(i)} \) can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
@@ -38,7 +38,7 @@ \section{Robot (Imitation) Learning}
 \begin{figure}
     \centering
     \includegraphics[width=0.9\textwidth]{figures/ch4/ch4-observation-action-mapping.pdf}
-    \caption{Sample observations and action pairs over the course of a given trajectory recorded in \url{lerobot/svla_so101_pickplace}. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.}
+    \caption{Sample observations and action pairs over the course of a given trajectory recorded in \url{lerobot/svla_so101_pickplace}. Observations, comprising both proprioceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.}
    \label{fig:ch4-observation-action-mapping}
\end{figure}

@@ -55,7 +55,7 @@ \section{Robot (Imitation) Learning}
 However, because we only consider the case where a single offline dataset \( \mathcal D \) of trajectories is available and no more data can be collected, DAgger falls out of our scope.

 Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics.
-First, training happens offline and naturally accomodates for expert, demonstration data, hereby severily limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation.
+First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation.
 Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent.
 The absence of rewards also prevents the risk of misalignment and specification gaming (\emph{reward hacking}), otherwise inherent in purely reward-based RL~\citep{heessEmergenceLocomotionBehaviours2017}.
 Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset.
@@ -92,11 +92,11 @@ \subsubsection{Variational Auto-Encoders}
 \begin{figure}
     \centering
     \includegraphics[width=0.8\textwidth]{figures/ch4/ch4-task-effect-on-pairs.pdf}
-    \caption{Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper's opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.}
+    \caption{Intuitively, the latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks.
When (A) picking a block, the likelihood of a wide gripper's opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.} \label{fig:ch4-task-effect-on-pairs} \end{figure} -A common inductive bias used in GM posits samples \( (o,a) \) are influenced from an unobservable latent variable \( z \in Z \), resulting in: +A common inductive bias used in GM posits that samples \( (o,a) \) are influenced from an unobservable latent variable \( z \in Z \), resulting in: \begin{equation}\label{eq:BC-latent-variable} p (o,a) = \int_{\supp{Z}} p(o,a \vert z) p(z) \end{equation} @@ -182,7 +182,7 @@ \subsubsection{Diffusion Models} p(z_0, z_1, \dots z_T) &= p(z_T) \prod_{t=1}^{T} p(z_{t-1} \vert z_t), \label{eq:BC-multi-latent-model-2} \end{align} where we explicitly showed the marginalization over the multiple latents in eq.~\ref{eq:BC-multi-latent-model-1}, and used the law of conditional probability and Markov property in eq.~\ref{eq:BC-multi-latent-model-2}. -Also, for ease of notation, we will refer to observation-action pairs \( o,a \) as \( z_0 \). +Also, for ease of notation, we will refer to observation-action pairs \( (o,a) \) as \( z_0 \). \begin{figure} \centering @@ -205,7 +205,7 @@ \subsubsection{Diffusion Models} The \emph{true} likelihood \( p(z_{t-1} \vert z_t) \) is instead typically approximated using the parametrization \( p_\theta (z_{t-1} \vert z_t) \). In that, the information contained in the unknwon data distribution is \emph{reconstructed} via a process in which samples from a fixed distribution are iteratively turned into (ideally) high-likelihood samples under \( p(o,a) \)---a process referred to as \emph{denoising}. -Under such model, we can express the log-likelihood of an arbitrary sample \( z_0 \) as: +Under such a model, we can express the log-likelihood of an arbitrary sample \( z_0 \) as: \begin{align} \log p_\theta (z_0) &= \log \int_{\supp{Z_1} \times \supp{Z_2} \times \dots \times \supp{Z_T}} p_\theta(\underbrace{z_0, z_1, z_2, \dots z_T}_{z_{0:T}}) \\ &= \log \int_{\supp{Z_{1:T}}} \frac{p_\theta(z_{0:T}) \cdot q(z_{1:T} \vert z_0)}{q(z_{1:T} \vert z_0)} \label{eq:diffusion-1} \\ @@ -385,19 +385,19 @@ \subsection{Action Chunking with Transformers} \begin{figure} \centering \includegraphics[width=0.75\textwidth]{figures/ch4/ch4-act-encoder.pdf} - \caption{The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned \texttt{[CLS]} token used to aggregate input level information, and predict the style variable \( z \). The encoder is exclusively used to \emph{train} the decoder, and it is entirely disregarded at inference time.} + \caption{The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information, and a learned \texttt{[CLS]} token used to aggregate input level information, and predict the style variable \( z \). The encoder is exclusively used to \emph{train} the decoder, and it is entirely disregarded at inference time.} \label{fig:ch4-act-encoder} \end{figure} However, the authors claim that using a deterministic procedure to sample \( z \) benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. 
At test time,~\citet{zhaoLearningFineGrainedBimanual2023} propose simply using \( z = \mathbf{0} \), as the conditional prior on \( z \) used in training is set to be a standard Gaussian. -Further, conditioning on the observation \( o \) is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, \( p_\theta(a \vert z, o) \) at test time. -If at inference \( z \) is sampled from a standard Gaussian, during training \( z \) is sampled from an approximate posterior distribution \(q_\phi(z \vert o, a)\), which, however, disregards image observations and exclusively uses proprioperceptive states to form \( o \) for efficiency reasons. +Further, conditioning on the observation \( o \) is achieved through explicitly feeding proprioceptive and visual observations to the decoder, \( p_\theta(a \vert z, o) \) at test time. +If at inference \( z \) is sampled from a standard Gaussian, during training \( z \) is sampled from an approximate posterior distribution \(q_\phi(z \vert o, a)\), which, however, disregards image observations and exclusively uses proprioceptive states to form \( o \) for efficiency reasons. \begin{figure} \centering \includegraphics[width=0.75\textwidth]{figures/ch4/ch4-act-decoder.pdf} - \caption{The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all \( n \) camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable \( z \) retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices \( K,V \) with the decoder, and is trained to decode fixed position embeddings into action chunks.} + \caption{The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all \( n \) camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioceptive information and style variable \( z \) retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices \( K,V \) with the decoder, and is trained to decode fixed position embeddings into action chunks.} \label{fig:ch4-act-decoder} \end{figure} @@ -422,7 +422,7 @@ \subsubsection{Code Example: Training and Using ACT in Practice} \subsection{Diffusion Policy} DMs have proven very effective in approximating complex highly dimensional distributions, such as distributions over images~\citep{hoDenoisingDiffusionProbabilistic2020} or videos~\citep{polyakMovieGenCast2025}, thanks to their inherent capability to deal with multimodal data, and their training stability. -In Diffusion Policy (DP),~\citet{chiDiffusionPolicyVisuomotor2024} present an application of DMs the field of robot learning, leveraging diffusion to model expert demonstrations in a variety of simulated and real-world tasks. +In Diffusion Policy (DP),~\citet{chiDiffusionPolicyVisuomotor2024} present an application of DMs to the field of robot learning, leveraging diffusion to model expert demonstrations in a variety of simulated and real-world tasks. 
 Similarily to ACT~\citep{zhaoLearningFineGrainedBimanual2023},~\citet{chiDiffusionPolicyVisuomotor2024} (1) adopt a modified \emph{observation-conditioned target distribution} instead of the full joint \( p(o,a) \), and (2) predict multiple actions into the future instead of a single action.
 Besides the intractability of the observations' marginal \( p_\theta(o) \) given \(p_\theta(o,a) \), DP's choice to model the data distribution through \( p_\theta(a \vert o) \) also stems from the computational burden of diffusion at test time: generating actions together with observations would require a large number of denoising steps—an unnecessarily slow and ultimately unhelpful process, given that robotics focuses on producing controls rather than reconstructing observations.
@@ -454,7 +454,7 @@ \subsection{Diffusion Policy}
 Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, which are typically trained in simulation with priviledged information not directly available in real-world deployments.
 As high-frame rate RGB inputs naturally accomodate for dynamic, fast changing environments,~\citet{chiDiffusionPolicyVisuomotor2024}'s conclusion offers significant evidence for learning streamlined control policies directly from pixels.
 In their work,~\citet{chiDiffusionPolicyVisuomotor2024} also ablate the performance of DP against the size of the dataset collected, showing that DP reliably outperforms the considered baseline for all benchmark sizes considered.
-Further, in order accelerate inference,~\citet{chiDiffusionPolicyVisuomotor2024} employ Denoising Diffusion Implicit Models~\citep{songDenoisingDiffusionImplicit2022}, a variant of Denoising Diffusion Probabilistic Models~\citep{hoDenoisingDiffusionProbabilistic2020} (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM's natively stochastic one) inducing the same final distribution's as DDPM's, and yet resulting in 10x less denoising steps at inference time~\citep{chiDiffusionPolicyVisuomotor2024}.
+Further, in order to accelerate inference,~\citet{chiDiffusionPolicyVisuomotor2024} employ Denoising Diffusion Implicit Models~\citep{songDenoisingDiffusionImplicit2022}, a variant of Denoising Diffusion Probabilistic Models~\citep{hoDenoisingDiffusionProbabilistic2020} (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM's natively stochastic one) that induces the same final distribution as DDPM, and yet results in 10x fewer denoising steps at inference time~\citep{chiDiffusionPolicyVisuomotor2024}.
 Across a range of simulated and real-world tasks,~\citet{chiDiffusionPolicyVisuomotor2024} find DPs particularly performant when modeling \( \epsilon_\theta \) with a transformer-based network, although the authors note the increased sensitivity of transformer networks to hyperparameters.
 Thus,~\citet{chiDiffusionPolicyVisuomotor2024} explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure~\ref{fig:diffusion-policy-architecture}), which is however reported to be biased towards learning low-frequency components~\citep{tancikFourierFeaturesLet2020}, and thus may prove more challenging to train with non-smooth action sequences.
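To make the inference-time procedure concrete, the sketch below shows the generic structure of an observation-conditioned, deterministic (DDIM-style, \( \eta = 0 \)) denoising loop that turns Gaussian noise into an action chunk using a noise-prediction network \( \epsilon_\theta \) (here \texttt{eps\_model}). It is an illustrative sketch rather than~\citet{chiDiffusionPolicyVisuomotor2024}'s implementation: the noise schedule, the ten-step budget, and the \texttt{eps\_model} signature are placeholder assumptions.
\begin{verbatim}
import torch

def sample_action_chunk(eps_model, obs_emb, horizon, action_dim, n_steps=10):
    # alpha_bar increases along the sampling loop: from (almost) pure noise
    # towards a clean sample. Schedule and step count are illustrative only.
    alphas_bar = torch.linspace(0.02, 0.999, n_steps + 1)
    a = torch.randn(1, horizon, action_dim)      # start from Gaussian noise
    for i in range(n_steps):
        ab_t, ab_next = alphas_bar[i], alphas_bar[i + 1]
        t = torch.tensor(1.0 - i / n_steps)      # coarse "diffusion time" input
        eps = eps_model(a, t, obs_emb)           # predicted noise, given o_t
        a0 = (a - (1 - ab_t).sqrt() * eps) / ab_t.sqrt()      # predicted clean chunk
        a = ab_next.sqrt() * a0 + (1 - ab_next).sqrt() * eps  # deterministic update
    return a  # denoised action chunk a_{t:t+H_a}
\end{verbatim}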
@@ -483,8 +483,8 @@ \subsection{Optimized Inference} Sync inference allocates computation every \( H_a \) timesteps, resulting in a reduced computational burden (on average) at control time. In contrast, sync inference also inherently hinders the responsiveness of robot systems, introducing blind lags due to the robot being \emph{idle} while computing \( \actionchunk \). -One can use the fact that policies output multiple actions at the same time to directly (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk \emph{prediction} \( \actionchunk \) from action \emph{execution} \( a_t \gets \textsc{PopFront}(\actionchunk_t) \). -This decoupled stack, which we refer to as \emph{asynchronous} (async) inference (\ref{alg:async-inference}), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. +One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk \emph{prediction} \( \actionchunk \) from action \emph{execution} \( a_t \gets \textsc{PopFront}(\actionchunk_t) \). +This decoupled stack, which we refer to as \emph{asynchronous} (async) inference (alg.~\ref{alg:async-inference}), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a \( \textsc{RobotClient} \) sends an observation \( o_t \) to a \( \textsc{PolicyServer} \), receiving an action chunk \( \actionchunk_t \) once inference is complete (Figure~\ref{fig:ch4-async-inference}). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the \( \textsc{RobotClient} \). In turn, async-inference tightens the loop between action prediction and action execution efficienty, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. @@ -558,7 +558,7 @@ \subsection{Optimized Inference} If not for the aforementioned similarity filter, the \( \textsc{RobotClient} \) would send observations for processing every \( (1 - g) H_a \cdot \Delta t\) seconds, receiving a new chunk of actions every \( (1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S] \), on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure~\ref{fig:ch4-queues} results in a queue which is filled with incoming actions \emph{unless} near-duplicate observations are filtered out from the processing pipeline. -For clarity, the red arrow in~\ref{fig:ch4-queues} highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty. +For clarity, the red arrow in Figure~\ref{fig:ch4-queues} highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty. 
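As a back-of-the-envelope check of the timing expressions above, the snippet below plugs illustrative values into \( (1 - g) H_a \cdot \Delta t \) and \( (1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S] \); all numbers are made up for illustration and do not correspond to a specific \lerobot~configuration.
\begin{verbatim}
H_a = 50         # actions per predicted chunk
g = 0.7          # fraction of the queue still available when a new observation is sent
dt = 1 / 30      # control period in seconds (30 Hz control loop)
latency = 0.10   # E[l_S]: mean network + inference latency, in seconds

send_period = (1 - g) * H_a * dt      # observation sent every 0.50 s
chunk_period = send_period + latency  # fresh chunk available every 0.60 s
print(send_period, chunk_period)
\end{verbatim}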
\subsubsection{Code Example: Using Async Inference} diff --git a/sections/05_foundation_models.tex b/sections/05_foundation_models.tex index 884d4ac..d3d7d10 100644 --- a/sections/05_foundation_models.tex +++ b/sections/05_foundation_models.tex @@ -80,7 +80,7 @@ \subsection{Preliminaries: Models and Data} \subsection{VLAs} Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as \pizero~\citep{black$p_0$VisionLanguageActionFlow2024} leverage a \emph{unified transformer model} for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. -Crucially, modern VLAs including\pizero~\citep{black$p_0$VisionLanguageActionFlow2024} and SmolVLA~\citep{shukorSmolVLAVisionLanguageActionModel2025} adopt \emph{unified} transformer models employing disjoint set of weights (\emph{experts}) for both compute-efficient visual-semantic understanding as well as control. +Crucially, modern VLAs including \pizero~\citep{black$p_0$VisionLanguageActionFlow2024} and SmolVLA~\citep{shukorSmolVLAVisionLanguageActionModel2025} adopt \emph{unified} transformer models employing disjoint set of weights (\emph{experts}) for both compute-efficient visual-semantic understanding as well as control. Procedurally, VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized \emph{action experts} to model continuous actions distributions \( p (a_{t:t+H_a} \vert o_t) \)---avoiding discrete action tokens entirely---and (2) relying on~\emph{action chunking}~\citep[Section~\ref{sec:learning-imitation}]{zhaoLearningFineGrainedBimanual2023} as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data. These architectural and procedural innovations present three benefits over task-specific methods. @@ -113,16 +113,16 @@ \subsection{\( \pi_0 \)} \begin{figure} \centering \includegraphics[width=0.9\textwidth]{figures/ch5/ch5-pi0.pdf} - \caption{The \pizero~architecture, as in~\citet{black$p_0$VisionLanguageActionFlow2024}. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.} + \caption{The \pizero~architecture, as in~\citet{black$p_0$VisionLanguageActionFlow2024}. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.} \label{fig:ch5-pi0} \end{figure} Concretely, \( \pi_0 \) is a single, unified transformer with two disjoint sets of weights \( \phi, \theta\). 
A larger VLM backbone \( f_\phi \) initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points \( [\{ I_t \}_{t=1}^n] \), as well as a language instruction \([\ell_t]\) used to describe the task considered.
-Concurrently, a 300M-parameter \emph{action expert} based on a similar transformer architecture is used to process both the robot proprioperceptive state \(q_t\) and an action chunk \(a_{t:t+H_a}\) (Figure~\ref{fig:ch5-pi0}).
+Concurrently, a 300M-parameter \emph{action expert} based on a similar transformer architecture is used to process both the robot proprioceptive state \(q_t\) and an action chunk \(a_{t:t+H_a}\) (Figure~\ref{fig:ch5-pi0}).
 The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers.
 The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process.
-In particular, \pizero~uses a \emph{blockwise causal attention mask} over tokens belonging to three separate blocks: (1) image and language tokens \(\mathcal T_i \) obtained from \([\{ I_t \}_{t=1}^n, \ell_t]\), (2) proprioperceptive tokens \(\mathcal T_q \) obtained from \(q_t\), and (3) the action tokens \( \mathcal T_a \) for items in the chunk \(a^{\tau}_{t:t+H_a}\) at time \( \tau \) in the flow-matching process.
+In particular, \pizero~uses a \emph{blockwise causal attention mask} over tokens belonging to three separate blocks: (1) image and language tokens \(\mathcal T_i \) obtained from \([\{ I_t \}_{t=1}^n, \ell_t]\), (2) proprioceptive tokens \(\mathcal T_q \) obtained from \(q_t\), and (3) the action tokens \( \mathcal T_a \) for items in the chunk \(a^{\tau}_{t:t+H_a}\) at time \( \tau \) in the flow-matching process.
 Notably, \emph{within} each block the attention operations are bidirectional, while \emph{across} blocks, future blocks are masked out.
 Formally, this corresponds to using an attention mask like:
 \begin{equation*}
@@ -137,7 +137,7 @@ \subsection{\( \pi_0 \)}
 \end{equation*}
 Note how \emph{intra}-block directional attention allows tokens to communicate freely, while \emph{inter}-block communication is mediated by the attention mask \(\mathbf{A} \).
 \emph{Blockwise causal masking} effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data.
-Crucially, because communication is obstructed between image-language tokens, proprioperceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime time, incuring in a reduced computational footprint and faster inference.
+Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
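The blockwise pattern described above can be made concrete in a few lines: the sketch below builds a boolean attention mask over the three blocks \( \mathcal T_i, \mathcal T_q, \mathcal T_a \), with bidirectional attention within a block and attention restricted to earlier blocks across blocks. This is an illustrative sketch of the masking rule only, not the \pizero~implementation, and the block sizes are arbitrary.
\begin{verbatim}
import torch

def blockwise_causal_mask(n_img_lang, n_proprio, n_action):
    # True means the query token (row) may attend to the key token (column).
    sizes = [n_img_lang, n_proprio, n_action]
    starts = [0, sizes[0], sizes[0] + sizes[1]]
    mask = torch.zeros(sum(sizes), sum(sizes), dtype=torch.bool)
    for qi in range(3):            # query block index
        for ki in range(qi + 1):   # key blocks: own block and all earlier ones
            mask[starts[qi]:starts[qi] + sizes[qi],
                 starts[ki]:starts[ki] + sizes[ki]] = True
    return mask

# Toy sizes: 4 image-language tokens, 1 proprioceptive token, 3 action tokens.
print(blockwise_causal_mask(4, 1, 3).int())
\end{verbatim}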
In \pizero, both the VLM backbone and action expert are update using a \emph{flow matching} loss, and in particular are updated minimizing: \begin{align} @@ -165,7 +165,7 @@ \subsection{\( \pi_0 \)} In turn, the application of flow matching to large-scale datasets of multiple human behaviors across tasks and embodiments appears rather consequential, particularly considering how it can enable faster inference via a limited number of denoising steps at test time---as few as 10, in \pizero. In particular, the action expert is implemented as a conditional flow matching model. Each action token embeds a noisy action \(a_i^{\tau} \in a^\tau_{t:t+H_a}\), alongside a sinusoidal encoding of the \emph{flow process} timestep \(\tau\). -The action expert then leverages full bidirectional attention across the \(H_a\) action tokens provided, and also attends to previous proprioperceptive and image-language tokens. +The action expert then leverages full bidirectional attention across the \(H_a\) action tokens provided, and also attends to previous proprioceptive and image-language tokens. Interestingly, differently from a standard flow matching pipeline~\citep{lipmanFlowMatchingGenerative2023}, \(\tau\) is \emph{not} sampled from a uniform distribution \(\tau \sim \mathcal U([0,1]) \), but rather obtained from \(\tau \sim \textrm{Beta}(1.5,1) \) defined on the \( [0,s], s<1 \) support (Figure~\ref{fig:ch5-pi0-sampling-timesteps}). \begin{wrapfigure}{r}{0.4\textwidth} @@ -204,7 +204,7 @@ \subsection{SmolVLA} \begin{figure} \centering \includegraphics[width=0.9\textwidth]{figures/ch5/ch5-smolvla.pdf} - \caption{The SmolVLA architecture, as in~\citet{shukorSmolVLAVisionLanguageActionModel2025}. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than \pizero~(450M parameters vs. \pizero's 3.3B).} + \caption{The SmolVLA architecture, as in~\citet{shukorSmolVLAVisionLanguageActionModel2025}. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than \pizero~(450M parameters vs. \pizero's 3.3B).} \label{fig:ch5-smolvla} \end{figure} @@ -218,11 +218,11 @@ \subsection{SmolVLA} \citet{shukorSmolVLAVisionLanguageActionModel2025}'s design choices thus result in a much smaller size model compared to \pizero, consisting of ca. 450M parameters versus \pizero's 3.3B parameters. In practice, SmolVLA consumes multi-view RGB images, a natural-language instruction, and projected sensorimotor state token as inputs, together with the noised \emph{action chunk} \( \tilde{a}_{t:t+H_a} \) the action expert \( v_\theta \) is trained to denoise. -The robot proprioperceptive states are projected to a shared token space with the VLM to match \( d_{\text{VLM}} \), and successively projected into the expert's token space. 
+The robot proprioceptive states are projected to a shared token space with the VLM to match \( d_{\text{VLM}} \), and successively projected into the expert's token space.
 Similarily to \pizero, SmolVLA adopts separate experts communicating exclusively through self-attention layers, which however do not employ blockwise causal attention masking and rather favour simple causal masking.
 In contrast with \pizero, the action expert interleaves \emph{cross-attention} (CA) and \emph{self-attention} (SA) layers, a choice shown to yield higher success and smoother action chunks in practice.
-While in the expert SA layers tokens are used to obtain queries, keys and values, CA layers use action tokens only as queries, and instead project visual, language and proprioperceptive tokens from the VLM backbone to a shared embedding space to then obtain keys and values.
+While in the expert SA layers tokens are used to obtain queries, keys and values, CA layers use action tokens only as queries, and instead project visual, language and proprioceptive tokens from the VLM backbone to a shared embedding space to then obtain keys and values.
 Notably, keys and values can be cached here as well, resulting in performance gains at inference time.

 SmolVLA also trims down both token and layer compute.
diff --git a/sections/07_conclusions.tex b/sections/07_conclusions.tex
index 7368e26..908c322 100644
--- a/sections/07_conclusions.tex
+++ b/sections/07_conclusions.tex
@@ -5,7 +5,7 @@ \section{Conclusions}
 Our exploration traced a clear trajectory of progress, beginning with Reinforcement Learning (RL). While RL offers a powerful paradigm for learning through interaction, its application in robotics is complicated by challenges such as sample inefficiency, safety concerns in real-world training, and the complexities of reward design.
 We saw how modern approaches like HIL-SERL make real-world RL more feasible by incorporating training-time human guidance, datasets of previously collected data as well as learned reward classifiers.
-Nonetheless, the inherent difficulties of RL increasingly motivate approaches based on imitation learning, capable to safely learns from limited numbers of real-world, reward-free expert demonstrations. In turn, the wider adoption of imitation learning led to the development of single-task policies, where advanced Behavioral Cloning techniques---implemented as state-conditioned generative models like Action Chunking with Transformers and Diffusion Policy---have demonstrated the ability to learn complex, multimodal behaviors from human demonstrations. These advancements laid the groundwork for the current frontier: generalist, language-conditioned Vision-Language-Action models capable to perform few- and zero-shot a variety of different real-world tasks. By leveraging powerful pre-trained backbones and sophisticated generative methods like flow matching, models such as \pizero~and SmolVLA represent a significant leap towards foundational models for robotics capable of generalizing across diverse tasks, and even robot embodiments.
+Nonetheless, the inherent difficulties of RL increasingly motivate approaches based on imitation learning, capable of safely learning from limited numbers of real-world, reward-free expert demonstrations.
In turn, the wider adoption of imitation learning led to the development of single-task policies, where advanced Behavioral Cloning techniques---implemented as state-conditioned generative models like Action Chunking with Transformers and Diffusion Policy---have demonstrated the ability to learn complex, multimodal behaviors from human demonstrations. These advancements laid the groundwork for the current frontier: generalist, language-conditioned Vision-Language-Action models capable of performing few- and zero-shot inference on a variety of different real-world tasks. By leveraging powerful pre-trained backbones and sophisticated generative methods like flow matching, models such as \pizero~and SmolVLA represent a significant leap towards foundational models for robotics capable of generalizing across diverse tasks, and even robot embodiments.
 A central theme of this work is the critical role of openness in accelerating this progress. The recent explosion in capability is inseparable from the advent of large-scale, openly available datasets, standardized, stable and accessible model architectures, and accessible, open-source software like \lerobot.
 We argue this convergence on open-source robotics is not a mere trend but a fundamental enabler, democratizing access to research and unlocking the potential of large, decentralized efforts to advance the field.
diff --git a/sections/A_foreword.tex b/sections/A_foreword.tex
index 144c9da..1618f69 100644
--- a/sections/A_foreword.tex
+++ b/sections/A_foreword.tex
@@ -7,7 +7,7 @@ \section*{Foreword}


 Learning can play a pivotal role in the development of autonomous robots: we believe this to be the case.
-Nonetheless, we also hold that the wealth of research from both academia and industry in classical robotics over the past six decades is, simply put, too valuable to be cast aside in favor of purely learning-based methods.
+Nonetheless, we also hold the opinion that the wealth of research from both academia and industry in classical robotics over the past six decades is, simply put, too valuable to be cast aside in favor of purely learning-based methods.
 However, the interplay between classical robotics and modern machine learning is still in its nascent stages, and the path to integration yet to be clearly defined.
 In turn our goal here is to present what we consider to be the most relevant approaches within robot learning today, while warmly extending an invite to collaborate to expand the breadth of this work!
 Start contributing today \href{https://github.com/fracapuano/robot-learning-tutorial}{here}.