We propose Uni-Hand, a unified framework for multimodal hand motion forecasting that jointly addresses (1) 2D/3D waypoint prediction, (2) hand-head motion synergy in egocentric vision, and (3) task-aware affordance learning. Our approach integrates vision-language fusion with the injection of global context and text instructions through a novel dual-branch diffusion architecture. Beyond conventional center-point forecasting, the framework supports targeted joint-level prediction (wrist/fingers) and additionally infers hand-object interaction states (contact/separation) to benefit downstream applications.
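To make the prediction targets concrete, below is a minimal sketch of what a forecaster with these outputs might look like. It is not the released Uni-Hand code: the class names (`ToyUniHandHead`, `HandForecast`), tensor shapes, and the simple fused-feature regression head are illustrative assumptions standing in for the actual dual-branch diffusion model.

```python
# Illustrative sketch only -- names, shapes, and the regression head are
# assumptions; the real Uni-Hand model uses a dual-branch diffusion architecture.
from dataclasses import dataclass
import torch

@dataclass
class HandForecast:
    waypoints_2d: torch.Tensor   # (B, T, H, 2)  future 2D hand waypoints
    waypoints_3d: torch.Tensor   # (B, T, H, 3)  future 3D hand waypoints
    joints_3d: torch.Tensor      # (B, T, H, J, 3)  joint-level (wrist/finger) predictions
    contact_prob: torch.Tensor   # (B, T, H)  hand-object contact probability

class ToyUniHandHead(torch.nn.Module):
    """Toy head that fuses visual, text, and global-context features and
    regresses the multi-granularity outputs described above."""
    def __init__(self, d_model=256, horizon=8, num_hands=2, num_joints=21):
        super().__init__()
        self.T, self.H, self.J = horizon, num_hands, num_joints
        per_step = 2 + 3 + num_joints * 3 + 1  # 2D + 3D + joints + contact logit
        self.fuse = torch.nn.Linear(3 * d_model, d_model)
        self.head = torch.nn.Linear(d_model, horizon * num_hands * per_step)

    def forward(self, vis_feat, text_feat, ctx_feat):
        # Each input: (B, d_model) pooled features from one modality.
        h = torch.relu(self.fuse(torch.cat([vis_feat, text_feat, ctx_feat], dim=-1)))
        out = self.head(h).view(-1, self.T, self.H, 2 + 3 + self.J * 3 + 1)
        return HandForecast(
            waypoints_2d=out[..., :2],
            waypoints_3d=out[..., 2:5],
            joints_3d=out[..., 5:5 + self.J * 3].view(-1, self.T, self.H, self.J, 3),
            contact_prob=torch.sigmoid(out[..., -1]),
        )

if __name__ == "__main__":
    B, d = 1, 256
    model = ToyUniHandHead(d_model=d)
    pred = model(torch.randn(B, d), torch.randn(B, d), torch.randn(B, d))
    print(pred.joints_3d.shape)  # torch.Size([1, 8, 2, 21, 3])
```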
@misc{ma2025mmtwin,
title={Novel Diffusion Models for Multimodal 3D Hand Trajectory Prediction},
author={Junyi Ma and Wentao Bao and Jingyi Xu and Guanzhong Sun and Xieyuanli Chen and Hesheng Wang},
year={2025},
eprint={2504.07375},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.07375},
}
@misc{ma2025unihand,
title={Uni-Hand: Universal Hand Motion Forecasting in Egocentric Views},
author={Junyi Ma and Wentao Bao and Jingyi Xu and Guanzhong Sun and Yu Zheng and Erhang Zhang and Xieyuanli Chen and Hesheng Wang},
year={2025},
eprint={2511.12878},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2511.12878},
}