We would like to thank Qiushan Guo for assistance in setting up the project. We also thank the NVIDIA VILA team for their efforts in developing a robust framework that greatly facilitated our research.
% Omni-RGPT arXiv preprint. The journal field is kept so that classic BibTeX
% styles, which ignore the eprint fields, still print the arXiv identifier;
% eprint/archiveprefix carry the same identifier in machine-readable form.
@article{OmniRGPT,
  title         = {{Omni-RGPT}: Unifying Image and Video Region-level Understanding via Token Marks},
  author        = {Heo, Miran and Chen, Min-Hung and Huang, De-An and Liu, Sifei and Radhakrishnan, Subhashree and Kim, Seon Joo and Wang, Yu-Chiang Frank and Hachiuma, Ryo},
  journal       = {arXiv preprint arXiv:2501.08326},
  eprint        = {2501.08326},
  archiveprefix = {arXiv},
  year          = {2025},
}