Source code for rushd.ddpcr

"""
Common functions for analyzing ddPCR data in Pandas Dataframes.

Extracts data and metadata from .ddpcr files.
Allows users to specify custom metadata applied via well mapping.
"""

import json
import re
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional, Union

import numpy as np
import pandas as pd
import py7zr

from . import flow


class YamlError(RuntimeError):
    """Error raised when there is an issue with the provided .yaml file."""
class DataPathError(RuntimeError):
    """Error raised when the path to the data is not specified correctly."""
def load_ddpcr_metadata(unzipped_path: Path) -> Dict[Any, Any]:
    """
    Load well metadata from an unzipped .ddpcr file.

    Generates a metadata dict in the same format as the YAML well mapping,
    i.e., key -> {well -> value}. The columns are a subset of the metadata
    associated with each well in the BioRad software, namely sample names
    (numbered 'Sample description' fields, returned as numbered
    'sample_description' keys) and targets for each channel/dye (returned
    as '[channel]_target' keys).

    Parameters
    ----------
    unzipped_path: Path
        Path to unzipped .ddpcr file

    Returns
    -------
    dict
        A dictionary that contains a well mapping for metadata extracted
        from the .ddpcr experiment.

    Raises
    ------
    FileNotFoundError
        If ``unzipped_path`` contains no .ddplt plate file.
    ValueError
        If a file in the ``PeakMetaData`` folder is not named after a well
        (e.g. ``A01.ddmetajson``).
    """
    # Compiled once, outside the loop; matched against each file's full path.
    well_file_regex = re.compile(r"^.*[\\/](?P<well>[A-P]\d+)\.dd.*json")

    # Create map of well index -> ID
    well_id_map = {}
    for meta_path in (unzipped_path / "PeakMetaData").glob("*.ddmetajson"):
        match = well_file_regex.match(str(meta_path))
        if match is None:
            raise ValueError(
                f"Cannot determine the well ID from metadata file '{meta_path}'."
            )
        with open(meta_path, "r") as file:
            well_id_map[json.load(file)["WellIndex"]] = match.group("well")

    # Use the most recently modified .ddplt file as the plate file
    plate_path = max(
        unzipped_path.glob("*.ddplt"),
        key=lambda p: p.stat().st_mtime,
        default=None,
    )
    if plate_path is None:
        # Fail with a clear message instead of trying to open the directory itself
        raise FileNotFoundError(f"No .ddplt plate file found in '{unzipped_path}'.")

    # Load metadata from plate file
    with open(plate_path, "r") as file:
        plate = json.load(file)

    metadata_from_plt = {}
    for well_sample in plate["WellSamples"]:
        well = well_id_map[well_sample["WellIndex"]]
        condition_map = {
            f"sample_description_{i+1}": val
            for i, val in enumerate(well_sample["SampleIds"])
        }
        target_map = {
            target["Dye"]["DyeName"] + "_target": target["TargetName"]
            for target in well_sample["Panel"]["Targets"]
        }
        metadata_from_plt[well] = {**condition_map, **target_map}

    # Pivot well -> {key -> value} into key -> {well -> value}
    return pd.DataFrame.from_dict(metadata_from_plt, orient="index").to_dict()
def load_ddpcr(
    data_path: Union[str, Path],
    yaml_path: Optional[Union[str, Path]] = None,
    *,
    extract_metadata: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Load ddPCR data into DataFrame with associated metadata.

    Generates a pandas DataFrame from a .ddpcr file, which is the file type
    for experiments on the BioRad QX100/QX200 machines. Adds columns for
    metadata encoded by a given .yaml file. Metadata is associated with the
    data based on well IDs extracted from the experiment data.

    Parameters
    ----------
    data_path: str or Path
        Path to .ddpcr file.
    yaml_path: str or Path (optional)
        Path to .yaml file to use for associating metadata with well IDs.
        All metadata must be contained under the header 'metadata'.
    extract_metadata: bool, default ``True``
        Whether to extract metadata from the .ddpcr file. If ``True``, adds a
        subset of the metadata associated with each well in the BioRad
        software, namely sample names (numbered 'Sample description' fields,
        returned as numbered 'sample_description' keys) and targets for each
        channel/dye (returned as '[channel]_target' keys). Extracted keys
        take precedence over keys of the same name from the .yaml file.

    Returns
    -------
    DataFrame
        A single pandas DataFrame containing all data with associated metadata.

    Raises
    ------
    DataPathError
        If ``data_path`` does not have the ``.ddpcr`` suffix.
    YamlError
        If ``yaml_path`` is given but the file does not exist.
    ValueError
        If no well in the file has acquired data (raised by ``pd.concat``
        when there is nothing to concatenate).
    """
    if not isinstance(data_path, Path):
        data_path = Path(data_path)

    if data_path.suffix != ".ddpcr":
        raise DataPathError("'data_path' must be a .ddpcr file.")

    # The context manager deletes the unzipped files even if loading fails
    # part-way (bad archive, missing YAML, malformed JSON, ...).
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_data_path = Path(tmp_dir)

        # Unzip .ddpcr file (a password-protected 7z archive; the password is
        # hard-coded here -- presumably fixed by the instrument software)
        with py7zr.SevenZipFile(
            data_path, "r", password="1b53402e-503a-4303-bf86-71af1f3178dd"
        ) as experiment:
            experiment.extractall(path=tmp_data_path)

        metadata_map = {}

        # Load metadata from .yaml file
        if yaml_path is not None:
            try:
                metadata_map = flow.load_well_metadata(yaml_path)
            except FileNotFoundError as err:
                raise YamlError("Specified metadata YAML file does not exist!") from err

        # Load metadata from .ddpcr file (overrides YAML keys with the same name)
        if extract_metadata:
            metadata_map = {**metadata_map, **load_ddpcr_metadata(tmp_data_path)}

        # Load data for each well
        data_list = []
        for peak_path in (tmp_data_path / "PeakData").glob("*.ddpeakjson"):
            with open(peak_path, "r") as file:
                d = json.load(file)

            # Ignore wells for which no data was collected
            if not d["DataAcquisitionInfo"]["WasAcquired"]:
                continue

            # Extract raw data (channel amplitude) and channel names.
            # Channel numbers minus one are the integer column labels of the
            # transposed amplitude array, which are renamed to the dye names.
            channel_map = {
                c["Channel"] - 1: c["Dye"] for c in d["DataAcquisitionInfo"]["ChannelMap"]
            }
            df = pd.DataFrame(np.transpose(d["PeakInfo"]["Amplitudes"])).rename(
                columns=channel_map
            )

            # The well ID is the file name without its extension
            well = peak_path.stem
            df.insert(0, "well", [well] * len(df))

            # Add metadata to DataFrame; the columns are placed before 'well',
            # in the order of the metadata keys
            for index, (k, v) in enumerate(metadata_map.items()):
                # Replace custom metadata keys with <NA> if not present
                df.insert(index, k, v[well] if well in v else [pd.NA] * len(df))

            data_list.append(df)

        # Fill empty values with <NA> and drop empty columns
        data = (
            pd.concat(data_list, ignore_index=True)
            .replace([float("nan"), np.nan, ""], pd.NA)
            .dropna(axis="columns", how="all")
        )

    return data