Merge pull request #14 from catalystneuro/add_convert_all_sessions

Add convert all sessions script
catalystneuro · Jul 11, 2024 · 77c9573 · 77c9573
2 parents 7a67ae8 + d811ce2
commit 77c9573
Show file tree

Hide file tree

Showing 3 changed files with 234 additions and 11 deletions.
diff --git a/src/howe_lab_to_nwb/vu2024/extractors/cxdimagingextractor.py b/src/howe_lab_to_nwb/vu2024/extractors/cxdimagingextractor.py
@@ -163,12 +163,15 @@ def __init__(
 
         with aicsimageio.readers.bioformats_reader.BioFile(file_path) as reader:
             self._video = reader.to_dask()
-            if frame_indices is not None:
-                assert len(frame_indices) > 0, "frame_indices must be a non-empty list."
-                assert (
-                    len(frame_indices) <= self._num_frames
-                ), "frame_indices must be less than or equal to the number of frames."
-                self._video = self._video[frame_indices, ...]
+
+        if frame_indices is not None:
+            assert len(frame_indices) > 0, "frame_indices must be a non-empty list."
+            assert (
+                len(frame_indices) <= self._num_frames
+            ), "frame_indices must be less than or equal to the number of frames."
+            self._video = self._video[frame_indices, ...]
+            self._times = self._times[frame_indices]
+            self._num_frames = len(frame_indices)
 
         super().__init__(file_path=file_path, channel_name=channel_name, plane_name=plane_name)
 

diff --git a/src/howe_lab_to_nwb/vu2024/vu2024_convert_all_sessions.py b/src/howe_lab_to_nwb/vu2024/vu2024_convert_all_sessions.py
@@ -0,0 +1,202 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+from neuroconv.utils import FilePathType, FolderPathType
+from nwbinspector import inspect_all
+from nwbinspector.inspector_tools import save_report, format_messages
+from tqdm import tqdm
+
+from howe_lab_to_nwb.vu2024.vu2024_convert_dual_wavelength_session import (
+    dual_wavelength_session_to_nwb,
+    single_wavelength_session_to_nwb,
+)
+
+
+def convert_all_sessions(
+    data_table_path: FilePathType,
+    folder_path: FolderPathType,
+    nwbfile_folder_path: FolderPathType,
+    stub_test: bool = False,
+    overwrite: bool = False,
+):
+    """
+    Convert all sessions from the Vu 2024 dataset to NWB format.
+
+    Parameters
+    ----------
+    data_table_path : FilePathType
+        The path to the XLSX file containing info for all the sessions.
+    folder_path : FolderPathType
+        The root folder path to search for the filenames in the data table.
+    nwbfile_folder_path : FolderPathType
+        The folder path to save the NWB files.
+    stub_test : bool, optional
+        Whether to run the conversion as a stub test.
+        When set to True, write only a subset of the data for each session.
+        When set to False, write the entire data for each session.
+    overwrite : bool, optional
+        Whether to overwrite existing NWB files, default is False.
+
+    """
+
+    data_table = pd.read_excel(data_table_path)
+    folder_path = Path(folder_path)
+
+    columns_to_group = ["Mouse", "Date (YYYYMMDD)"]
+    total_file_paths = data_table.groupby(columns_to_group).count().shape[0]
+    progress_bar = tqdm(
+        data_table.groupby(columns_to_group),
+        desc=f"Converting {total_file_paths} sessions to NWB ...",
+        position=0,
+        total=total_file_paths,
+        dynamic_ncols=True,
+    )
+
+    for (subject_id, date), table in progress_bar:
+        experiment_type = "single-wavelength" if len(table) == 1 else "dual-wavelength"
+        subject_id = str(subject_id).replace("-", "")
+
+        subject_folder_path = folder_path / subject_id
+
+        behavior_file_name = table["Raw behavior file"].values[0]
+        ttl_file_paths = list(subject_folder_path.rglob(behavior_file_name))
+        if not len(ttl_file_paths):
+            raise FileNotFoundError(f"Behavior file not found: {behavior_file_name}")
+
+        session_folder_path = ttl_file_paths[0].parent
+
+        imaging_file_paths = list(session_folder_path.glob("*.cxd"))
+        if not len(imaging_file_paths):
+            raise FileNotFoundError(f"No .cxd files found in {session_folder_path}")
+
+        ttl_stream_names = []
+        behavior_file_paths = []
+        for processed_behavior_file_name in table["Processed behavior file"]:
+            pattern = re.compile(r"(ttlIn1|ttlIn2)")
+            # Search for the pattern in the provided string
+            match = pattern.search(processed_behavior_file_name)
+            ttl_stream_name = match.group(0) if match else None
+            if ttl_stream_name is None:
+                raise ValueError(f"TTL stream name (ttlIn1, ttlIn2) not found in {processed_behavior_file_name}")
+            ttl_stream_names.append(ttl_stream_name)
+
+            behavior_file_paths.append(session_folder_path / processed_behavior_file_name)
+
+        fiber_photometry_file_paths = list(session_folder_path.glob("*_ROIs*.mat"))
+
+        ttl_file_path = imaging_file_paths[0].parent / table["Raw behavior file"].values[0]
+
+        ttl_stream_names = []
+        behavior_file_paths = []
+        for processed_behavior_file_name in table["Processed behavior file"]:
+            pattern = re.compile(r"(ttlIn1|ttlIn2)")
+            match = pattern.search(processed_behavior_file_name)
+            ttl_stream_name = match.group(0) if match else None
+            if ttl_stream_name is None:
+                raise ValueError(f"TTL stream name (ttlIn1, ttlIn2) not found in {processed_behavior_file_name}")
+            ttl_stream_names.append(ttl_stream_name)
+
+            behavior_file_paths.append(imaging_file_paths[0].parent / processed_behavior_file_name)
+
+        if len(imaging_file_paths) < len(table):
+            # In this case we have dual wavelength imaging data in the same file
+            # We will use the frame indices in dual_wavelength_session_to_nwb to determine which excitation wavelength the frame belongs to
+            imaging_file_paths = imaging_file_paths * len(table)
+        motion_corrected_file_paths = list(session_folder_path.glob("*.tif"))
+        if not len(motion_corrected_file_paths):
+            motion_corrected_file_paths = [None] * len(imaging_file_paths)
+        if len(motion_corrected_file_paths) != len(imaging_file_paths):
+            raise ValueError("The number of motion corrected imaging files must match the number of raw imaging files.")
+
+        fiber_locations_file_path = imaging_file_paths[0].parent.parent / f"{subject_id}_fiber_locations.xlsx"
+        if not fiber_locations_file_path.exists():
+            raise FileNotFoundError(f"Fiber locations file not found: {fiber_locations_file_path}")
+
+        excitation_wavelengths_in_nm = table["LED excitation wavelength (nm)"].values
+        indicators = []
+        for aav_string in table["Relevant injected sensor"].values:
+            pattern = re.compile(r"(dLight1\.3b|GCaMP7f|Ach3\.0|jRGECO1a)")
+            # Search for the pattern in the provided string
+            match = pattern.search(aav_string)
+            # If a match is found, return the matched string; otherwise, return None
+            indicator = match.group(0) if match else None
+            if indicator is None:
+                raise ValueError(f"Indicator (dLight1.3b, GCaMP7f, Ach3.0, jRGECO1a) not found in {aav_string}")
+            indicators.append(indicator)
+
+        nwbfile_path = nwbfile_folder_path / f"{subject_id}-{date}.nwb"
+        if nwbfile_path.exists() and not overwrite:
+            progress_bar.update(1)
+            continue
+
+        progress_bar.set_description(f"Converting subject '{subject_id}' session '{date}' session to NWB ...")
+        if experiment_type == "dual-wavelength":
+            dual_wavelength_session_to_nwb(
+                raw_imaging_file_paths=imaging_file_paths,
+                motion_corrected_imaging_file_paths=motion_corrected_file_paths,
+                fiber_photometry_file_paths=fiber_photometry_file_paths,
+                fiber_locations_file_path=fiber_locations_file_path,
+                excitation_wavelengths_in_nm=excitation_wavelengths_in_nm,
+                indicators=indicators,
+                ttl_file_path=ttl_file_path,
+                ttl_stream_names=ttl_stream_names,
+                behavior_file_paths=behavior_file_paths,
+                nwbfile_path=nwbfile_path,
+                stub_test=stub_test,
+            )
+        elif experiment_type == "single-wavelength":
+            single_wavelength_session_to_nwb(
+                raw_imaging_file_path=imaging_file_paths[0],
+                motion_corrected_imaging_file_path=motion_corrected_file_paths[0],
+                raw_fiber_photometry_file_path=fiber_photometry_file_paths[0],
+                fiber_locations_file_path=fiber_locations_file_path,
+                excitation_wavelength_in_nm=excitation_wavelengths_in_nm[0],
+                indicator=indicators[0],
+                ttl_file_path=ttl_file_path,
+                ttl_stream_name=ttl_stream_names[0],
+                behavior_file_path=behavior_file_paths[0],
+                nwbfile_path=nwbfile_path,
+                stub_test=stub_test,
+            )
+        progress_bar.update(1)
+
+    report_path = nwbfile_folder_path / "inspector_result.txt"
+    if not report_path.exists():
+        results = list(inspect_all(path=nwbfile_folder_path))
+        save_report(
+            report_file_path=report_path,
+            formatted_messages=format_messages(
+                results,
+                levels=["importance", "file_path"],
+            ),
+        )
+
+
+if __name__ == "__main__":
+    # Parameters for conversion
+
+    # The path to the XLSX file containing info for all the sessions
+    data_table_excel_file_path = Path("/Volumes/t7-ssd/Howe/data table.xlsx")
+
+    # The root folder path to search for the filenames in the data table
+    folder_path = Path("/Volumes/t7-ssd/Howe")
+
+    # The folder path to save the NWB files
+    nwbfile_folder_path = Path("/Volumes/t7-ssd/Howe/nwbfiles")
+
+    # Whether to overwrite existing NWB files, default is False
+    overwrite = False
+
+    # Whether to run the conversion as a stub test
+    # When set to True, write only a subset of the data for each session
+    # When set to False, write the entire data for each session
+    stub_test = False
+
+    convert_all_sessions(
+        data_table_path=data_table_excel_file_path,
+        folder_path=folder_path,
+        nwbfile_folder_path=nwbfile_folder_path,
+        stub_test=stub_test,
+        overwrite=overwrite,
+    )
diff --git a/src/howe_lab_to_nwb/vu2024/vu2024nwbconverter.py b/src/howe_lab_to_nwb/vu2024/vu2024nwbconverter.py
@@ -1,10 +1,12 @@
 """Primary NWBConverter class for this dataset."""
+from typing import Optional
 
 from neuroconv import NWBConverter
 from neuroconv.datainterfaces import TiffImagingInterface, VideoInterface
 from neuroconv.tools.signal_processing import get_rising_frames_from_ttl
 from neuroconv.utils import DeepDict
 from pymatreader import read_mat
+from pynwb import NWBFile
 
 from howe_lab_to_nwb.vu2024.interfaces import (
     CxdImagingInterface,
@@ -17,6 +19,8 @@
 class Vu2024NWBConverter(NWBConverter):
     """Primary conversion class for the Vu 2024 fiber photometry dataset."""
 
+    aligned = False
+
     data_interface_classes = dict(
         Imaging=CxdImagingInterface,
         ProcessedImaging=TiffImagingInterface,
@@ -49,11 +53,19 @@ def get_metadata(self) -> DeepDict:
         return metadata
 
     def temporally_align_data_interfaces(self):
+        if self.aligned:
+            return
         imaging = self.data_interface_objects["Imaging"]
         fiber_photometry = self.data_interface_objects["FiberPhotometry"]
         # Use the timestamps from the fiber photometry interface for the imaging data
         # The timestamps from the fiber photometry data is from the TTL signals
         fiber_photometry_timestamps = fiber_photometry.get_timestamps()
+
+        if imaging.source_data["frame_indices"] is not None:
+            # we must fix the timestamps for fiber photometry
+            frame_indices = imaging.source_data["frame_indices"]
+            fiber_photometry_timestamps = fiber_photometry_timestamps[frame_indices]
+            fiber_photometry.set_aligned_timestamps(aligned_timestamps=fiber_photometry_timestamps)
         imaging.set_aligned_timestamps(aligned_timestamps=fiber_photometry_timestamps)
 
         video_interfaces = [self.data_interface_objects[key] for key in self.data_interface_objects if "Video" in key]
@@ -66,10 +78,16 @@ def temporally_align_data_interfaces(self):
             else:
                 raise ValueError(f"Could not determine TTL stream for video file {video_file_path}.")
 
-            video_timestamps = video_interface.get_timestamps()
             ttl_file_path = fiber_photometry.source_data["ttl_file_path"]
             ttl_data = read_mat(filename=ttl_file_path)
-            first_ttl_frame = get_rising_frames_from_ttl(trace=ttl_data[ttl_stream_name])[0]
-            video_interface.set_aligned_segment_starting_times(
-                aligned_segment_starting_times=[video_timestamps[0][first_ttl_frame]]
-            )
+            rising_frames = get_rising_frames_from_ttl(trace=ttl_data[ttl_stream_name])
+            video_timestamps = ttl_data["timestamp"][rising_frames]
+            if len(video_timestamps) > len(fiber_photometry_timestamps):
+                video_timestamps = video_timestamps[: len(fiber_photometry_timestamps)]
+            video_interface.set_aligned_timestamps(aligned_timestamps=[video_timestamps])
+
+        self.aligned = True
+
+    def add_to_nwbfile(self, nwbfile: NWBFile, metadata, conversion_options: Optional[dict] = None) -> None:
+        self.temporally_align_data_interfaces()
+        return super().add_to_nwbfile(nwbfile=nwbfile, metadata=metadata, conversion_options=conversion_options)