| Show Detail | 
                            
                                Timezone: Europe/Rome
                            
                         | 
                Filter Rooms:
                    
                     
                        
            
            
        SUN 29 SEP
            8 a.m.
                        
                            
                                (ends 6:00 PM)
                                    
                                9 a.m.
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                10:30 a.m.
                        
                            
                                
                            
                        
                    1 p.m.
                        
                            
                                
                            
                        
                    2 p.m.
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                3:30 p.m.
                        
                            
                                
                            
                        
                    MON 30 SEP
            8 a.m.
                        
                            
                                (ends 6:00 PM)
                                    
                                9 a.m.
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                Tutorial:
                                        
                                    
                                    
                                    
                                    (ends 1:00 PM)
                                    
                                10:30 a.m.
                        
                            
                                
                            
                        
                    1 p.m.
                        
                            
                                
                            
                        
                    2 p.m.
                        
                            
                                
                            
                        
                            
                                
                            
                        
                            
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                Workshop:
                                        
                                    
                                    
                                    
                                    (ends 6:00 PM)
                                    
                                3:30 p.m.
                        
                            
                                
                            
                        
                    TUE 1 OCT
            7 a.m.
                        
                            
                                (ends 6:30 PM)
                                    
                                8 a.m.
                        
                            
                                (ends 9:00 AM)
                                9 a.m.
                        
                            
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    Towards Scene Graph Anticipation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    OP-Align: Object-level and Part-level Alignment for Self-supervised Category-level Articulated Object Pose Estimation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    PDiscoFormer: Relaxing Part Discovery Constraints with Vision Transformers
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Bi-directional Contextual Attention for 3D Dense Captioning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    OmniNOCS: A unified NOCS dataset and model for 3D lifting of 2D objects
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    ABC Easy as 123: A Blind Counter for Exemplar-Free Multi-Class Class-agnostic Counting
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    A Fair Ranking and New Model for Panoptic Scene Graph Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Expanding Scene Graph Boundaries: Fully Open-vocabulary Scene Graph Generation via Visual-Concept Alignment and Retention
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    Making Large Language Models Better Planners with Reasoning-Decision Alignment
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    M^2Depth: Self-supervised Two-Frame Multi-camera Metric Depth Estimation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    H-V2X: A Large Scale Highway Dataset for BEV Perception
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Adaptive Bounding Box Uncertainties via Two-Step Conformal Prediction
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    DriveLM: Driving with Graph Visual Question Answering
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    RealGen: Retrieval Augmented Generation for Controllable Traffic Scenarios
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Mask2Map: Vectorized HD Map Construction Using Bird's Eye View Segmentation Masks
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    Integer-Valued Training and Spike-driven Inference Spiking Neural Network for High-performance and Energy-efficient Object Detection
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Latent Diffusion Prior Enhanced Deep Unfolding for Snapshot Spectral Compressive Imaging
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    SEA-RAFT: Simple, Efficient, Accurate RAFT for Optical Flow
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Photon Inhibition for Energy-Efficient Single-Photon Imaging
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Minimalist Vision with Freeform Pixels
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Flying with Photons: Rendering Novel Views of Propagating Light
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    A Simple Low-bit Quantization Framework for Video Snapshot Compressive Imaging
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    GazeXplain: Learning to Predict Natural Language Explanations of Visual Scanpaths
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Demonstrations 9:00-12:30
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 12:30 PM)
                                10:30 a.m.
                        
                            
                                (ends 12:30 PM)
                                noon
                        
                            
                                
                            
                        
                    12:30 p.m.
                        
                            
                                
                            
                        
                    1:30 p.m.
                        
                            
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    TexDreamer: Towards Zero-Shot High-Fidelity 3D Human Texture Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content Creation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    FlashTex: Fast Relightable Mesh Texturing with LightControlNet
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    TextDiffuser-2: Unleashing the Power of Language Models for Text Rendering
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    LLMGA: Multimodal Large Language Model based Generation Assistant
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Accelerating Image Generation with Sub-path Linear Approximation Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane Representation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    Bridging the Gap: Studio-like Avatar Creation from a Monocular Phone Capture
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    Zero-Shot Detection of AI-Generated Images
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    Action2Sound: Ambient-Aware Generation of  Action Sounds from Egocentric Videos
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    Efficient Bias Mitigation Without Privileged Information
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Fast Diffusion-Based Counterfactuals for Shortcut Removal and Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    MobileNetV4: Universal Models for the Mobile Ecosystem
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    Momentum Auxiliary Network for Supervised Local Learning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    From Fake to Real: Pretraining on Balanced Synthetic Images to Prevent Spurious Correlations in Image Recognition
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    Dataset Enhancement with Instance-Level Augmentations
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Adaptive Parametric Activation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    Relation DETR: Exploring Explicit Position Relation Prior for Object Detection
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    Projecting Points to Axes: Oriented Object Detection via Point-Axis Representation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    CLIFF: Continual Latent Diffusion for Open-Vocabulary Object Detection
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    On Calibration of Object Detectors: Pitfalls, Evaluation and Baselines
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    Physics-Free Spectrally Multiplexed Photometric Stereo under Unknown Spectral Composition
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    COMO: Compact Mapping and Odometry
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    Smoothness, Synthesis, and Sampling: Re-thinking Unsupervised Multi-View Stereo with DIV Loss
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    ADen: Adaptive Density Representations for Sparse-view Camera Pose Estimation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    SPVLoc: Semantic Panoramic Viewport Matching for 6D Camera Localization in Unseen Environments
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    Six-Point Method for Multi-Camera Systems with Reduced Solution Space
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Scene Coordinate Reconstruction: Posing of Image Collections via Incremental Learning of a Relocalizer
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    Grounding Image Matching in 3D with MASt3R
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    ConDense: Consistent 2D-3D Pre-training for Dense and Sparse Features from Multi-View Images
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    Correspondences of the Third Kind: Camera Pose Estimation from Object Reflection
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    Camera Calibration using a Collimator System
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                2:30 p.m.
                        
                            
                                Demonstrations 2:30-6:00
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 6:00 PM)
                                3:30 p.m.
                        
                            
                                Keynote:
                                        
                                    
                                    
                                    
                                        Lourdes Agapito · Vittorio Ferrari
                                    
                                    (ends 4:30 PM)
                                    
                                4:30 p.m.
                        
                            
                                
                            
                        
                            
                                Posters 4:30-6:30
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                                    
                                                                        
                                                                        
                                                                    
                                                                    
                                                                    SPHINX: A Mixer of Weights, Visual Embeddings and Image Scales for Multi-modal Large Language Models
                                                                    
                                                                        
                                                                    
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                                    
                                                                        
                                                                        
                                                                    
                                                                    
                                                                    Enhancing Source-Free Domain Adaptive Object Detection with Low-confidence Pseudo Label Distillation
                                                                    
                                                                        
                                                                    
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 6:30 PM)
                                6:30 p.m.
                        
                            
                                
                            
                        
                    WED 2 OCT
            8 a.m.
                        
                            
                                (ends 6:30 PM)
                                    
                                9 a.m.
                        
                            
                                (ends 12:30 PM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    PetFace: A Large-Scale Dataset and Benchmark for Animal Identification
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    UniIR: Training and Benchmarking Universal Multimodal Information Retrievers
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    Towards Model-Agnostic Dataset Condensation by Heterogeneous Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Parrot Captions Teach CLIP to Spot Text
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Towards Open-ended Visual Quality Comparison
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    VETRA: A Dataset for Vehicle Tracking in Aerial Imagery - New Challenges for Multi-Object Tracking
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    Insect Identification in the Wild: The AMI Dataset
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    MarineInst: A Foundation Model for Marine Image Analysis with Instance Visual Description
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding and Reasoning in Pathology
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Self-Supervised Video Desmoking for Laparoscopic Surgery
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    CardiacNet: Learning to Reconstruct Abnormalities for Cardiac Disease Assessment from Echocardiogram Videos
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Rethinking Deep Unrolled Model for Accelerated MRI Reconstruction
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Adaptive Correspondence Scoring for Unsupervised Medical Image Registration
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Revisiting Adaptive Cellular Recognition Under Domain Shifts: A Contextual Correspondence View
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    SparseSSP: 3D Subcellular Structure Prediction from Sparse-View Transmitted Light Images
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Knowledge-enhanced Visual-Language Pretraining for Computational Pathology
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    HGL: Hierarchical Geometry Learning for Test-time Adaptation in 3D Point Cloud Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    PointLLM: Empowering Large Language Models to Understand Point Clouds
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    RISurConv: Rotation Invariant Surface Attention-Augmented Convolutions for 3D Point Cloud Classification and Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    DVLO: Deep Visual-LiDAR Odometry with Local-to-Global Feature Fusion and Bi-Directional Structure Alignment
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    KeypointDETR: An End-to-End 3D Keypoint Detector
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Rethinking Data Augmentation for Robust LiDAR Semantic Segmentation in Adverse Weather
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    RAPiD-Seg: Range-Aware Pointwise Distance Distribution Networks for 3D LiDAR Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Equi-GSPR: Equivariant SE(3) Graph Network Model for Sparse Point Cloud Registration
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                10:30 a.m.
                        
                            
                                
                            
                        
                            
                                (ends 12:30 PM)
                                12:30 p.m.
                        
                            
                                
                            
                        
                    1:30 p.m.
                        
                            
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Gaussian Frosting: Editable Complex Radiance Fields with Real-Time Rendering
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic Integration
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    FisherRF: Active View Selection and Mapping with Radiance Fields using Fisher Information
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    RaFE: Generative Radiance Fields Restoration
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    Watch Your Steps: Local Image and Scene Editing by Text Instructions
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    MVSplat: Efficient 3D Gaussian Splatting from Sparse Multi-View Images
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    RPBG: Towards Robust Neural Point-based Graphics in the Wild
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    Omni-Recon: Harnessing Image-based Rendering for General-Purpose Neural Radiance Fields
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    Learning 3D-aware GANs from Unposed Images with Template Feature Field
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    MIGS: Multi-Identity Gaussian Splatting via Tensor Decomposition
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    LEGO: Learning EGOcentric Action Frame Generation via Visual Instruction Tuning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    SV3D: Novel Multi-view Synthesis and 3D Generation from a Single Image using Latent Video Diffusion
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    Efficient Neural Video Representation with Temporally Coherent Modulation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    Clearer Frames, Anytime: Resolving Velocity Ambiguity in Video Frame Interpolation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    Video Editing via Factorized Diffusion Distillation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    ReSyncer: Rewiring Style-based Generator for Unified Audio-Visually Synced Facial Performer
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Audio-Synchronized Visual Animation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    MotionDirector: Motion Customization of Text-to-Video Diffusion Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    ZoLA: Zero-Shot Creative Long Animation Generation with Short Video Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    Temporal Residual Guided Diffusion Framework for Event-Driven Video Reconstruction
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    AttentionHand: Text-driven Controllable Hand Image Generation for 3D Hand Reconstruction in the Wild
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Sapiens: Foundation for Human Vision Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    POET: Prompt Offset Tuning for Continual Human Action Adaptation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    Harnessing Text-to-Image Diffusion Models for Category-Agnostic Pose Estimation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    SemGrasp: Semantic Grasp Generation via Language Aligned Discretization
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    UGG: Unified Generative Grasping
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    NL2Contact: Natural Language Guided 3D Hand-Object Contact Modeling with Diffusion Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    Beyond the Contact: Discovering Comprehensive Affordance for 3D Objects from Pre-trained 2D Diffusion Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    LiveHPS++: Robust and Coherent Motion Capture in Dynamic Free Environment
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    Controllable Human-Object Interaction Synthesis
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    NeRMo: Learning Implicit Neural Representations for 3D Human Motion Prediction
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                2:30 p.m.
                        
                            
                                Demonstrations 2:30-6:00
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 6:00 PM)
                                3:30 p.m.
                        
                            
                                Keynote:
                                        
                                    
                                    
                                    
                                        Sandra Wachter
                                    
                                    (ends 4:30 PM)
                                    
                                THU 3 OCT
            8 a.m.
                        
                            
                                (ends 6:30 PM)
                                    
                                9 a.m.
                        
                            
                                Demonstrations 9:00-12:30
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 12:30 PM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    WPS-SAM: Towards Weakly-Supervised Part Segmentation with Foundation Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    AlignDiff: Aligning Diffusion Models for General Few-Shot Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    CAT-SAM: Conditional Tuning for Few-Shot Adaptation of Segment Anything Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Collaborative Vision-Text Representation Optimizing for Open-Vocabulary Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Efficient Active Domain Adaptation for Semantic Segmentation by Selecting Information-rich Superpixels
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    ActionVOS: Actions as Prompts for Video Object Segmentation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    Learning Modality-agnostic Representation for Semantic Segmentation from Any Modalities
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Diffusion Models for Open-Vocabulary Segmentation
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    Robust Fitting on a Gate Quantum Computer
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Geospecific View Generation - Geometry-Context Aware High-resolution Ground View Inference from Satellite Views
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    Language-Driven 6-DoF Grasp Detection Using Negative Prompt Guidance
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    MaxMI: A Maximal Mutual Information Criterion for Manipulation Concept Discovery
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Align before Collaborate: Mitigating Feature Misalignment for Robust Multi-Agent Perception
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Faceptor: A Generalist Model for Face Perception
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    A Geometric Distortion Immunized Deep Watermarking Framework with Robustness Generalizability
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    COHO: Context-Sensitive City-Scale Hierarchical Urban Layout Generation
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 9:00-10:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    PiTe: Pixel-Temporal Alignment for Large Video-Language Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Pose-Aware Self-Supervised Learning with Viewpoint Trajectory Regularization
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    Emergent Visual-Semantic Hierarchies in Image-Text Representations
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Learning Multimodal Latent Generative Models with Energy-Based Prior
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Decoupling Common and Unique Representations for Multimodal Self-supervised Learning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    SINDER: Repairing the Singular Defects of DINOv2
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    Denoising Vision Transformers
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:10]
                                                                        
                                                                    
                                                                    Exploring the Feature Extraction and Relation Modeling For Light-Weight Transformer Tracking
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                10:30 a.m.
                        
                            
                                (ends 12:30 PM)
                                12:30 p.m.
                        
                            
                                
                            
                        
                    1:30 p.m.
                        
                            
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    Controlling the World by Sleight of Hand
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Pyramid Diffusion for Fine 3D Large Scene Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    FMBoost: Boosting Latent Diffusion with Flow Matching
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    ConceptExpress: Harnessing Diffusion Models for Single-image Unsupervised Concept Extraction
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    Exact Diffusion Inversion via Bidirectional Integration Approximation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    Tackling Structural Hallucination in Image Translation with Local Diffusion
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Diffusion Prior-Based Amortized Variational Inference for Noisy Inverse Problems
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    Adversarial Diffusion Distillation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    Arc2Face: A Foundation Model for ID-Consistent Human Faces
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in Federated Class Continual Learning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable Diffusion Model
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    E3M: Zero-Shot Spatio-Temporal Video Grounding with Expectation-Maximization Multimodal Modulation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Animal Avatars: Reconstructing Animatable 3D Animals from Casual Videos
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    Made to Order: Discovering monotonic temporal changes via self-supervised video ordering
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    MAGR: Manifold-Aligned Graph Regularization for Continual Action Quality Assessment
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    C2C: Component-to-Composition Learning for Zero-Shot Compositional Action Recognition
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    LongVLM: Efficient Long Video Understanding via Large Language Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    Propose, Assess, Search: Harnessing LLMs for Goal-Oriented Planning in Instructional Videos
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    Towards Neuro-Symbolic Video Understanding
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    Classification Matters: Improving Video Action Detection with Class-Specific Attention
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    DEVIAS: Learning Disentangled Video Representations of Action and Scene
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                Orals 1:30-3:20
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:30]
                                                                        
                                                                    
                                                                    GiT: Towards Generalist Vision Transformer through Universal Language Interface
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:40]
                                                                        
                                                                    
                                                                    Omniview-Tuning: Boosting Viewpoint Invariance of Vision-Language Pre-training Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [1:50]
                                                                        
                                                                    
                                                                    Turbo: Informativity-Driven Acceleration Plug-In for Vision-Language Large Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:00]
                                                                        
                                                                    
                                                                    MMBENCH: Is Your Multi-Modal Model an All-around Player?
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:10]
                                                                        
                                                                    
                                                                    Strengthening Multimodal Large Language Model with Bootstrapped Preference Optimization
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:20]
                                                                        
                                                                    
                                                                    Beat-It: Beat-Synchronized Multi-Condition 3D Dance Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:30]
                                                                        
                                                                    
                                                                    A Simple Baseline for Spoken Language to Sign Language Translation with 3D Avatars
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:40]
                                                                        
                                                                    
                                                                    HYPE: Hyperbolic Entailment Filtering for Underspecified Images and Texts
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [2:50]
                                                                        
                                                                    
                                                                    An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference Acceleration for Large Vision-Language Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:00]
                                                                        
                                                                    
                                                                    uCAP: An Unsupervised Prompting Method for Vision-Language Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [3:10]
                                                                        
                                                                    
                                                                    BRAVE: Broadening the visual encoding of vision-language models
                                                                    
                                                                
                                                            
                                                        (ends 3:30 PM)
                                2:30 p.m.
                        
                            
                                Demonstrations 2:30-6:00
                                                            
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                                
                                                            
                                                        (ends 6:00 PM)
                                3:30 p.m.
                        
                            
                                
                            
                        
                    4:30 p.m.
                        
                            
                                
                            
                        
                            
                                
                            
                        
                    7:30 p.m.
                        
                            
                                
                            
                        
                    FRI 4 OCT
            8 a.m.
                        
                            
                                (ends 12:30 PM)
                                    
                                8:30 a.m.
                        
                            
                                Orals 8:30-10:10
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:30]
                                                                        
                                                                    
                                                                    On the Topology Awareness and Generalization Performance of Graph Neural Networks
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:40]
                                                                        
                                                                    
                                                                    Improving Knowledge Distillation via Regularizing Feature Direction and Norm
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:50]
                                                                        
                                                                    
                                                                    Spline-based Transformers
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    Anytime Continual Learning for Open Vocabulary Classification
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Weighted Ensemble Models Are Strong Continual Learners
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    COD: Learning Conditional Invariant Representation for Domain Adaptation Regression
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Echoes of the Past: Boosting Long-tail Recognition via Reflective Learning
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in the Wild
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Mamba-ND: Selective State Space Modeling for Multi-Dimensional Data
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    HiT-SR: Hierarchical Transformer for Efficient Image Super-Resolution
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 8:30-10:00
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:30]
                                                                        
                                                                    
                                                                    Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:40]
                                                                        
                                                                    
                                                                    Adversarial Robustification via Text-to-Image Diffusion Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:50]
                                                                        
                                                                    
                                                                    Flatness-aware Sequential Learning Generates Resilient Backdoors
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    A Closer Look at GAN Priors: Exploiting Intermediate Features for Enhanced Model Inversion Attacks
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Learning a Dynamic Privacy-preserving Camera Robust to Inversion Attacks
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    R.A.C.E.: Robust Adversarial Concept Erasure for Secure Text-to-Image Diffusion Model
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Privacy-Preserving Adaptive Re-Identification without Image Transfer
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    Images are Achilles' Heel of Alignment: Exploiting Visual Vulnerabilities for Jailbreaking Multimodal Large Language Models
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Concept Arithmetics for Circumventing Concept Inhibition in Diffusion Models
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                Orals 8:30-10:10
                                                            
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:30]
                                                                        
                                                                    
                                                                    A Direct Approach to Viewing Graph Solvability
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:40]
                                                                        
                                                                    
                                                                    Convex Relaxations for Manifold-Valued Markov Random Fields with Approximation Guarantees
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [8:50]
                                                                        
                                                                    
                                                                    Flash Cache: Reducing Bias in Radiance Cache Based Inverse Rendering
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:00]
                                                                        
                                                                    
                                                                    A Riemannian Approach for Spatiotemporal Analysis and Generation of 4D Tree-shaped Structures
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:10]
                                                                        
                                                                    
                                                                    Physics-Based Interaction with 3D Objects via Video Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:20]
                                                                        
                                                                    
                                                                    Shape from Heat Conduction
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:30]
                                                                        
                                                                    
                                                                    Rasterized Edge Gradients: Handling Discontinuities Differentially
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:40]
                                                                        
                                                                    
                                                                    ControlNet-XS: Rethinking the Control of Text-to-Image Diffusion Models as Feedback-Control Systems
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [9:50]
                                                                        
                                                                    
                                                                    Parrot: Pareto-optimal Multi-Reward Reinforcement Learning Framework for Text-to-Image Generation
                                                                    
                                                                
                                                            
                                                                
                                                                    
                                                                    
                                                                        
                                                                            [10:00]
                                                                        
                                                                    
                                                                    Model Stock: All we need is just a few fine-tuned models
                                                                    
                                                                
                                                            
                                                        (ends 10:30 AM)
                                10:30 a.m.
                        
                            
                                (ends 12:30 PM)