Skip to content

Commit 82467fb

Browse files
kew6688 and wangyukai authored
[Fix] Update dataset conversion for InternData-N1 VLN-PE v0.5 dataset format (#288)
* update lerobot dataset
* add compatibility doc
* remove chinese comment

Co-authored-by: wangyukai <wangyukai@pjlab.org.cn>
1 parent 554fd97 commit 82467fb

5 files changed

Lines changed: 392 additions & 7 deletions

File tree

docs/compatibility.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Compatibility
2+
3+
## v0.3.1
4+
5+
### InternData-N1 update to v0.5
6+
7+
The InternData-N1 VLN-PE trajectory training dataset has been upgraded from `v0.1` to `v0.5`. This update introduces minor structural changes in the dataset layout and updates the LeRobot-to-LMDB conversion logic to match the new `v0.5` data structure.
8+
9+
The training pipeline now uses the new key name:
10+
- `instruction_text` → `task`
11+
12+
The updated conversion logic is **not compatible** with InternData-N1 `v0.1`.
13+

internnav/dataset/cma_lerobot_dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from internnav.dataset.base import BaseDataset, ObservationsDict, _block_shuffle
88
from internnav.model.utils.feature_extract import extract_instruction_tokens
9-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
9+
from internnav.utils.loader import LerobotAsLmdb
1010

1111

1212
class CMALerobotDataset(BaseDataset):
@@ -38,8 +38,9 @@ def __init__(
3838
self.camera_name = self.config.il.camera_name
3939

4040
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
41-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
41+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
4242
self.length = len(self.lmdb_keys)
43+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
4344

4445
# For CMA-CLIP
4546
self.use_clip_encoders = False

internnav/dataset/rdp_lerobot_dataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from internnav.model.basemodel.LongCLIP.model import longclip
2727
from internnav.model.utils.feature_extract import extract_instruction_tokens
2828
from internnav.utils.geometry_utils import get_delta, normalize_data, to_local_coords
29-
from internnav.utils.lerobot_as_lmdb import LerobotAsLmdb
29+
from internnav.utils.loader import LerobotAsLmdb
3030

3131

3232
def _convert_image_to_rgb(image):
@@ -103,8 +103,9 @@ def __init__(
103103
self.to_pil = ToPILImage()
104104
self.image_processor = _transform(n_px=224) # copy from clip-long
105105
self.lerobot_as_lmdb = LerobotAsLmdb(self.lerobot_features_dir)
106-
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys()
106+
self.lmdb_keys = self.lerobot_as_lmdb.get_all_keys(allow_scan_list=['r2r']) # r2r / r2r_aliengo / r2r_flash
107107
self.length = len(self.lmdb_keys)
108+
print(f"total keys in traj_data: {len(self.lmdb_keys)}")
108109

109110
self.start = 0
110111
self.end = self.length
@@ -192,7 +193,7 @@ def _load_next(self): # noqa: C901
192193
episodes_in_json = data_to_load['episodes_in_json']
193194

194195
instructions = [
195-
episodes_in_json[ep_idx]['instruction_text'][: self.config.model.text_encoder.max_length]
196+
episodes_in_json[ep_idx]['task'][: self.config.model.text_encoder.max_length]
196197
for ep_idx in range(len(episodes_in_json))
197198
]
198199

0 commit comments

Comments
 (0)