From abf5efbdea645585d0913d771a333680ebb480cb Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 26 Sep 2024 11:05:39 -0400 Subject: [PATCH 1/2] Fix more HDF4 - NDG groups - null terminated strings --- kerchunk/hdf4.py | 66 +++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py index 449e4e0..9624397 100644 --- a/kerchunk/hdf4.py +++ b/kerchunk/hdf4.py @@ -105,9 +105,12 @@ def translate(self, filename=None, storage_options=None): # dtype = dtypes[info["types"][0]] inf2 = self.tags[("VS", ref)] self.f.seek(inf2["offset"]) - data = self.f.read(inf2["length"]) + # remove zero padding + data = self.f.read(inf2["length"]).split(b"\x00", 1)[0] # NASA conventions - if info["name"].startswith(("CoreMetadata.", "ArchiveMetadata.")): + if info["name"].startswith( + ("CoreMetadata.", "ArchiveMetadata.", "StructMetadata.") + ): obj = None for line in data.decode().split("\n"): if "OBJECT" in line: @@ -132,7 +135,7 @@ def translate(self, filename=None, storage_options=None): roots.add((tag, ref)) # hierarchical output - output = self._descend_vg(*list(roots)[0]) + output = self._descend_vg(*sorted(roots, key=lambda t: t[1])[-1]) prot = fo.fs.protocol prot = prot[0] if isinstance(prot, tuple) else prot fs = fsspec.filesystem( @@ -167,9 +170,14 @@ def translate(self, filename=None, storage_options=None): ) ) for r in v.get("refs", []): + if r[0] == "DEFLATE": + continue refs[f"{k}/{r[0]}"] = [self.path, r[1], r[2]] else: - attrs[k] = v + if not k.startswith( + ("CoreMetadata.", "ArchiveMetadata.", "StructMetadata.") + ): + attrs[k] = v fs.references.update(refs) g.attrs.update(attrs) @@ -185,7 +193,7 @@ def _descend_vg(self, tag, ref): inf2 = self.tags[(t, r)] if t == "VG": tmp = self._descend_vg(t, r) - if list(tmp)[0] == inf2["name"]: + if tmp and list(tmp)[0] == inf2["name"]: tmp = tmp[inf2["name"]] out[inf2["name"]] = tmp elif t == "VH": @@ -196,19 +204,27 @@ def _descend_vg(self, tag, ref): self.f.seek(inf2["offset"]) data = self.f.read(inf2["length"]) if dtype == "str": - out[name] = data.decode().lstrip('"').rstrip('"') # decode() ? + out[name] = ( + data.split(b"\x00", 1)[0].decode().lstrip('"').rstrip('"') + ) # decode() ? else: out[name] = np.frombuffer(data, dtype)[0] elif t == "NT": out["dtype"] = inf2["typ"] elif t == "SD": - out["refs"] = inf2["data"][:-1] - out["chunks"] = [_["chunk_length"] for _ in inf2["data"][-1]] + if isinstance(inf2["data"][-1], (tuple, list)): + out["refs"] = inf2["data"][:-1] + out["chunks"] = [_["chunk_length"] for _ in inf2["data"][-1]] + else: + out["refs"] = [inf2["data"]] + out["chunks"] = True elif t == "SDD": out["dims"] = inf2["dims"] - else: - # NDGs contain same info as NT, SD and SDD - pass + elif t == "NDG": + out.setdefault("extra", []).append(_dec_ndg(self, inf2)) + if out.get("chunks") is True: + out["chunks"] = out["dims"] + out["refs"] = [".".join(["0"] * len(out["dims"]))] + out["refs"] return out def _dec(self, tag, ref): @@ -326,13 +342,14 @@ def _dec_comp(self): @reg("NDG") def _dec_ndg(self, info): - # links together these things as a Data Group - return { - "tags": [ - (tags[self.read_int(2)], self.read_int(2)) - for _ in range(0, info["length"], 4) - ] - } + if "tags" not in info: + return { + "tags": [ + (tags[self.read_int(2)], self.read_int(2)) + for _ in range(0, info["length"], 4) + ] + } + return info["tags"] @reg("SDD") @@ -365,11 +382,14 @@ def _dec_vh(self, info): isize = [self.read_int(2) for _ in range(nfields)] offsets = [self.read_int(2) for _ in range(nfields)] order = [self.read_int(2) for _ in range(nfields)] - names = [self.f.read(self.read_int(2)).decode() for _ in range(nfields)] + names = [ + self.f.read(self.read_int(2)).split(b"\x00", 1)[0].decode() + for _ in range(nfields) + ] namelen = self.read_int(2) - name = self.f.read(namelen).decode() + name = self.f.read(namelen).split(b"\x00", 1)[0].decode() classlen = self.read_int(2) - cls = self.f.read(classlen).decode() + cls = self.f.read(classlen).split(b"\x00", 1)[0].decode() ref = (self.read_int(2), self.read_int(2)) return _pl(locals()) @@ -379,8 +399,8 @@ def _dec_vg(self, info): nelt = self.read_int(2) tag = [tags[self.read_int(2)] for _ in range(nelt)] refs = [self.read_int(2) for _ in range(nelt)] - name = self.f.read(self.read_int(2)).decode() - cls = self.f.read(self.read_int(2)).decode() + name = self.f.read(self.read_int(2)).split(b"\x00", 1)[0].decode() + cls = self.f.read(self.read_int(2)).split(b"\x00", 1)[0].decode() return _pl(locals()) From 825ec20df2b5b1ff4bc6b22125b4254c6e191fe7 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 26 Sep 2024 11:34:57 -0400 Subject: [PATCH 2/2] fix references --- kerchunk/hdf4.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py index 9624397..483ffba 100644 --- a/kerchunk/hdf4.py +++ b/kerchunk/hdf4.py @@ -221,10 +221,13 @@ def _descend_vg(self, tag, ref): elif t == "SDD": out["dims"] = inf2["dims"] elif t == "NDG": - out.setdefault("extra", []).append(_dec_ndg(self, inf2)) + pass # out.setdefault("extra", []).append(_dec_ndg(self, inf2)) if out.get("chunks") is True: out["chunks"] = out["dims"] - out["refs"] = [".".join(["0"] * len(out["dims"]))] + out["refs"] + out["refs"] = [ + [".".join(["0"] * len(out["dims"]))] + + [out["refs"][0][1], out["refs"][0][2], out["refs"][0][0]] + ] return out def _dec(self, tag, ref):