tools/prepare_prostate.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file structure is as follows:
MRSpineSeg
|--MRI_train.zip
|--MRI_spine_seg_raw
│   └── MRI_train
│       └── train
│           ├── Mask
│           └── MR
├── MRI_spine_seg_phase0
│   ├── images
│   ├── labels
│   │   ├── Case129.npy
│   │   ├── ...
│   ├── train_list.txt
│   └── val_list.txt
└── MRI_train.zip

This script supports:
1. Downloading and uncompressing the dataset archive.
2. Saving the normalized data in the format shown above.
3. Splitting the training data and saving the split result in train_list.txt and val_list.txt (all the data is used for training, since this is the training split).

"""
import os
import sys
import zipfile
import functools
import numpy as np

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))

from prepare import Prep
from preprocess_utils import resample, normalize, label_remap
from medicalseg.utils import wrapped_partial

# Download URLs, keyed by dataset name; each inner dict maps a name to a URL
# string. Both URLs are currently empty strings.
# Prostate_mri: https://drive.google.com/file/d/1TtrjnlnJ1yqr5m4LUGMelKTQXtvZaru-/view?usp=sharing
urls = {
    "Promise12": {"Promise12": ""},
    "Prostate_mri": {"Prostate_mri": ""},
}

# Sub-directories of each raw dataset that are listed for both images and
# labels. The per-kind "filter_key" entries below carry a
# {"segmentation": bool} flag (presumably a file-name filter applied by Prep —
# confirm against prepare.Prep).
_PROMISE12_TRAIN_DIRS = (
    "prostate/TrainingData_Part1",
    "prostate/TrainingData_Part2",
    "prostate/TrainingData_Part3",
)
_PROSTATE_MRI_SITE_DIRS = (
    "Processed_data_nii/BIDMC",
    "Processed_data_nii/BMC",
    "Processed_data_nii/HK",
    "Processed_data_nii/I2CVB",
    "Processed_data_nii/RUNMC",
    "Processed_data_nii/UCL",
)

# Keyword arguments for Prep_prostate, one entry per dataset.
# The mutable values (filter_key dicts, uncompress_params) are written out per
# dataset on purpose so that the two configurations never share an object.
dataset_addr = {
    "Promise12": {
        "dataset_root": "data/Promise12",
        "raw_dataset_dir": "Promise12_raw",
        "images_dir": _PROMISE12_TRAIN_DIRS,
        "labels_dir": _PROMISE12_TRAIN_DIRS,
        "images_dir_test": "prostate/TestData",
        "phase_dir": "Promise12_phase0/",
        "urls": urls["Promise12"],
        "valid_suffix": ("mhd", "mhd"),
        "filter_key": ({"segmentation": False}, {"segmentation": True}),
        "uncompress_params": {"format": "zip", "num_files": 1},
    },
    "Prostate_mri": {
        "dataset_root": "data/Prostate_mri",
        "raw_dataset_dir": "Prostate_mri_raw",
        "images_dir": _PROSTATE_MRI_SITE_DIRS,
        "labels_dir": _PROSTATE_MRI_SITE_DIRS,
        "phase_dir": "Prostate_mri_phase0/",
        "urls": urls["Prostate_mri"],
        "valid_suffix": ("nii.gz", "nii.gz"),
        "filter_key": ({"segmentation": False}, {"segmentation": True}),
        "uncompress_params": {"format": "zip", "num_files": 1},
    },
}

# Metadata for the Promise12 dataset, passed as keyword arguments to
# generate_dataset_json.
_promise12_profile = {
    "modalities": ("MRI-T2", ),
    "labels": {0: "Background", 1: "prostate"},
    "dataset_name": "Promise12",
    "dataset_description":
    "These cases include a transversal T2-weighted MR image of the prostate. The training set is a representative set of the types of MR images acquired in a clinical setting. The data is multi-center and multi-vendor and has different acquistion protocols (e.g. differences in slice thickness, with/without endorectal coil). The set is selected such that there is a spread in prostate sizes and appearance. For each of the cases in the training set, a reference segmentation is also included.",
    "license_desc": "",
    "dataset_reference": "https://promise12.grand-challenge.org/Details/",
}

# Metadata for the multi-site Prostate_mri dataset (same keys as above).
_prostate_mri_profile = {
    "modalities": ("MRI-T2", ),
    "labels": {0: "Background", 1: "prostate"},
    "dataset_name": "Prostate_mri",
    "dataset_description":
    "This is a well-organized multi-site dataset for prostate MRI segmentation, which contains prostate T2-weighted MRI data (with segmentation mask) collected from six different data sources out of three public datasets. ",
    "license_desc": "",
    "dataset_reference": "https://liuquande.github.io/SAML/",
}

# Dataset name -> metadata dict.
dataset_profile = {
    "Promise12": _promise12_profile,
    "Prostate_mri": _prostate_mri_profile,
}


class Prep_prostate(Prep):
    """Dataset preparation for the prostate MRI datasets configured in this file.

    Extends ``Prep`` with a fixed preprocessing recipe (``self.preprocess``)
    and with list-file generation for the prostate naming scheme, in which the
    label of ``<case>.npy`` is ``<case>_segmentation.npy``.
    """

    def __init__(self,
                 dataset_root="data/TemDataSet",
                 raw_dataset_dir="TemDataSet_seg_raw/",
                 images_dir="train_imgs",
                 labels_dir="train_labels",
                 phase_dir="phase0",
                 urls=None,
                 valid_suffix=("nii.gz", "nii.gz"),
                 filter_key=(None, None),
                 uncompress_params={"format": "zip",
                                    "num_files": 1},
                 images_dir_test=""):
        """Forward the dataset layout to ``Prep`` and define the preprocessing.

        Every argument is passed on unchanged, in the same order, to
        ``Prep.__init__``; see that class for their meaning.

        NOTE: the ``uncompress_params`` default is a single dict object shared
        by all calls that rely on the default, so it must be treated as
        read-only.
        """
        super().__init__(dataset_root, raw_dataset_dir, images_dir, labels_dir,
                         phase_dir, urls, valid_suffix, filter_key,
                         uncompress_params, images_dir_test)

        # todo: make params set automatically
        # Callables listed per data kind, in the order given here:
        #   images      -> normalize, then resample to 512x512x24 with order=1
        #   labels      -> resample to 512x512x24 with order=0 (presumably
        #                  nearest-neighbour so label ids stay intact — confirm
        #                  against preprocess_utils.resample)
        #   images_test -> normalize only
        self.preprocess = {
            "images": [
                normalize,
                wrapped_partial(
                    resample, new_shape=[512, 512, 24], order=1),
            ],
            "labels": [
                wrapped_partial(
                    resample, new_shape=[512, 512, 24], order=0),
            ],
            "images_test": [normalize, ],
        }

    def generate_txt(self, split=1.0):
        """Generate train_list.txt, val_list.txt and, if present, test_list.txt.

        Args:
            split: forwarded as ``split`` to ``self.split_files_txt`` for both
                the train and the val list.

        The list files are created inside ``self.phase_path``. The test list
        is only written when ``self.image_files_test`` is non-empty, so
        datasets without a test directory no longer fail here.
        """
        train_txt = os.path.join(self.phase_path, 'train_list.txt')
        val_txt = os.path.join(self.phase_path, 'val_list.txt')

        image_files_npy = os.listdir(self.image_path)
        # Derive the label names from the image names so that both lists are
        # in the same order.
        label_files_npy = [
            name.replace(".npy", "_segmentation.npy")
            for name in image_files_npy
        ]

        self.split_files_txt(
            train_txt, image_files_npy, label_files_npy, split=split)
        self.split_files_txt(
            val_txt, image_files_npy, label_files_npy, split=split)

        # Previously this call ran unconditionally and crashed for datasets
        # without test images (missing third list name, unbound file list).
        if self.image_files_test:
            test_txt = os.path.join(self.phase_path, 'test_list.txt')
            test_file_npy = os.listdir(self.image_path_test)
            self.split_files_txt(test_txt, test_file_npy)


if __name__ == "__main__":
    # TODO: Prostate_mri has files with the same name in different dirs, which causes a file overlap problem.
    # TODO: MSD_prostate is not supported yet, because it has four channels and resample will have a bug.
    dataset_name = "Promise12"

    # Build the preparer from the address config, write the dataset json from
    # the matching profile, then convert the data and emit the list files.
    prep = Prep_prostate(**dataset_addr[dataset_name])
    prep.generate_dataset_json(**dataset_profile[dataset_name])
    prep.load_save()
    prep.generate_txt()