Source code for histolab.data

# encoding: utf-8

# ================This module is completely inspired by scikit-image================
# https://github.com/scikit-image/scikit-image/blob/master/skimage/data/__init__.py
# ==================================================================================

import os
import shutil
from typing import Tuple

import openslide
import PIL
from requests.exceptions import HTTPError

from .. import __version__
from ._registry import legacy_registry, registry, registry_urls

# Absolute path of the directory that contains this module; sample files
# shipped with the package are looked up relative to it.
legacy_data_dir = os.path.dirname(os.path.abspath(__file__))
# Parent of the data directory (kept un-normalized, i.e. ending in "..").
histolab_distribution_dir = os.path.join(legacy_data_dir, os.pardir)

try:
    from pooch import file_hash
except ImportError:
    # ImportError (the parent of ModuleNotFoundError) is caught on purpose:
    # it covers both "pooch is not installed" and "pooch is installed but
    # does not export ``file_hash``", so the local fallback is used in
    # either situation.
    #
    # Function taken from
    # https://github.com/fatiando/pooch/blob/master/pooch/utils.py
    def file_hash(fname: str, alg: str = "sha256") -> str:
        """Calculate the hash of a given file.

        Useful for checking if a file has changed or been corrupted.

        Parameters
        ----------
        fname : str
            The name of the file.
        alg : str
            The type of the hashing algorithm

        Returns
        -------
        hash : str
            The hash of the file.

        Raises
        ------
        ValueError
            If ``alg`` is not listed in ``hashlib.algorithms_available``.

        Examples
        --------
        >>> fname = "test-file-for-hash.txt"
        >>> with open(fname, "w") as f:
        ...     __ = f.write("content of the file")
        >>> print(file_hash(fname))
        0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
        >>> import os
        >>> os.remove(fname)
        """
        import hashlib

        if alg not in hashlib.algorithms_available:
            raise ValueError("Algorithm '{}' not available in hashlib".format(alg))
        # Calculate the hash in chunks to avoid overloading the memory
        chunksize = 65536
        hasher = hashlib.new(alg)
        with open(fname, "rb") as fin:
            # ``read`` returns b"" at end of file, which stops the iteration.
            for buff in iter(lambda: fin.read(chunksize), b""):
                hasher.update(buff)
        return hasher.hexdigest()
def _create_image_fetcher():
    """Create the pooch-based fetcher used to retrieve the sample data.

    Returns
    -------
    image_fetcher : object or None
        The object returned by ``pooch.create``, or ``None`` when pooch cannot
        be imported.
    data_dir : str
        Directory holding the data: the ``data`` folder under the fetcher's
        ``abspath`` when pooch is available, ``legacy_data_dir`` otherwise.
    """
    try:
        from pooch import create, os_cache
    except ImportError:
        # Without pooch, fallback on the standard data directory
        # which for now, includes a few limited data samples
        return None, legacy_data_dir

    # The development suffix is rewritten (".dev" -> "+") before the version
    # is handed to pooch.
    pooch_version = __version__.replace(".dev", "+")
    url = "https://github.com/histolab/histolab/raw/{version}/histolab/"

    # Create a new friend to manage your sample data storage
    image_fetcher = create(
        # Pooch uses appdirs to select an appropriate directory for the cache
        # on each platform.
        # https://github.com/ActiveState/appdirs
        # On linux this converges to
        # '$HOME/.cache/histolab-images'
        # With a version qualifier
        path=os_cache("histolab-images"),
        base_url=url,
        version=pooch_version,
        env="HISTOLAB_DATADIR",
        registry=registry,
        urls=registry_urls,
    )

    data_dir = os.path.join(str(image_fetcher.abspath), "data")
    return image_fetcher, data_dir


image_fetcher, data_dir = _create_image_fetcher()

# True only when a pooch fetcher could be created.
HAS_POOCH = image_fetcher is not None


def _has_hash(path: str, expected_hash: str) -> bool:
    """Check if the provided path has the expected hash.

    Parameters
    ----------
    path: str
        Location of the file to check.
    expected_hash: str
        Hash the file is expected to have, as computed by ``file_hash`` with
        its default algorithm.

    Returns
    -------
    bool
        True if the file exists and its hash equals the expected one,
        False otherwise (including when the file does not exist).
    """
    if not os.path.exists(path):
        return False
    return file_hash(path) == expected_hash


def _fetch(data_filename: str) -> str:
    """Fetch a given data file from either the local cache or the repository.

    This function provides the path location of the data file given
    its name in the histolab repository.

    Parameters
    ----------
    data_filename: str
        Name of the file in the histolab repository. e.g.
        'breast/sample1.svs'.

    Returns
    -------
    resolved_path: str
        Path of the local file

    Raises
    ------
    KeyError:
        If the filename is not known to the histolab distribution.
    ModuleNotFoundError:
        If the filename is known to the histolab distribution but pooch is not
        installed.
    HTTPError:
        If the download attempted by pooch fails with an HTTP error.
    ConnectionError:
        If the dataset has not been downloaded yet and histolab is unable to
        connect to the internet
    """
    resolved_path = os.path.join(data_dir, "..", data_filename)
    expected_hash = registry[data_filename]

    # Case 1:
    # The file may already be in the data_dir.
    # We may have decided to ship it in the histolab distribution.
    if _has_hash(resolved_path, expected_hash):
        # Nothing to be done, file is where it is expected to be
        return resolved_path

    # Case 2:
    # The user is using a cloned version of the github repo, which
    # contains both the publicly shipped data, and test data.
    # In this case, the file would be located relative to the
    # histolab_distribution_dir
    gh_repository_path = os.path.join(histolab_distribution_dir, data_filename)
    if _has_hash(gh_repository_path, expected_hash):
        parent = os.path.dirname(resolved_path)
        os.makedirs(parent, exist_ok=True)
        shutil.copy2(gh_repository_path, resolved_path)
        return resolved_path

    # Case 3:
    # Pooch not found.
    if image_fetcher is None:
        raise ModuleNotFoundError(
            "The requested file is part of the histolab distribution, "
            "but requires the installation of an optional dependency, pooch. "
            "To install pooch, use your preferred python package manager. "
            "Follow installation instruction found at "
            "https://www.fatiando.org/pooch/latest/install.html"
        )

    # Case 4:
    # Pooch needs to download the data. Let the image fetcher search for
    # our data. A ConnectionError is raised if no internet connection is
    # available.
    try:
        resolved_path = image_fetcher.fetch(data_filename)
    except HTTPError as httperror:
        # Re-raised with the same message; the original error is kept as the
        # explicit cause.
        raise HTTPError(f"{httperror}") from httperror
    except ConnectionError:  # pragma: no cover
        # If we decide in the future to suppress the underlying 'requests'
        # error, change this to `raise ... from None`. See PEP 3134.
        raise ConnectionError(
            "Tried to download a histolab dataset, but no internet "
            "connection is available."
        )
    return resolved_path


def _init_pooch() -> None:
    """Create ``data_dir`` and fetch every file listed in ``legacy_registry``."""
    os.makedirs(data_dir, exist_ok=True)

    # Fetch all legacy data so that it is available by default
    for filename in legacy_registry:
        _fetch(filename)


if HAS_POOCH:
    _init_pooch()


def _load_svs(filename: str) -> Tuple[openslide.OpenSlide, str]:
    """Load an image file located in the data directory.

    Parameters
    ----------
    filename : str
        Name of the file in the histolab repository

    Returns
    -------
    slide : openslide.OpenSlide
        An OpenSlide object representing a whole-slide image.
    path : str
        Path where the slide is saved; this is the path the slide was opened
        from.

    Raises
    ------
    PIL.UnidentifiedImageError:
        If the fetched file cannot be identified as an image while opening
        it. Any other error raised while fetching or opening the file is
        propagated unchanged.
    """
    # Resolve the file once: every ``_fetch`` call re-hashes the file, which
    # is expensive for whole-slide images.
    path = _fetch(filename)
    try:
        svs = openslide.open_slide(path)
    except PIL.UnidentifiedImageError as err:
        raise PIL.UnidentifiedImageError(
            "Your wsi has something broken inside, a doctor is needed"
        ) from err
    return svs, path
def aorta_tissue() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """aorta_tissue() -> Tuple[openslide.OpenSlide, str]

    Sample slide of aorta tissue (brightfield, JPEG 2000, YCbCr).

    The image can be downloaded from
    http://openslide.cs.cmu.edu/download/openslide-testdata/Aperio/

    It is free to use and distribute, with or without modification.

    Returns
    -------
    aorta_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of aortic tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = "aperio/JP2K-33003-1.svs"
    return _load_svs(slide_relpath)
def breast_tissue() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """breast_tissue() -> Tuple[openslide.OpenSlide, str]

    Sample slide of breast tissue from the TCGA-BRCA dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/ad9ed74a-2725-49e6-bf7a-ef100e299989
    or through the API
    https://api.gdc.cancer.gov/data/ad9ed74a-2725-49e6-bf7a-ef100e299989

    TCGA file name:
    `TCGA-A8-A082-01A-01-TS1.3cad4a77-47a6-4658-becf-d8cffa161d3a.svs`

    Access: open

    Returns
    -------
    breast_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of breast tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/breast/TCGA-A8-A082-01A-01-TS1.3cad4a77-47a6-4658-becf-d8cffa161d3a.svs"
    )
    return _load_svs(slide_relpath)
def breast_tissue_diagnostic_green_pen() -> Tuple[
    openslide.OpenSlide, str
]:  # pragma: no cover
    """breast_tissue_diagnostic_green_pen() -> Tuple[openslide.OpenSlide, str]

    Diagnostic slide of breast tissue with green pen marks, TCGA-BRCA dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/3845b8bd-cbe0-49cf-a418-a8120f6c23db
    or through the API
    https://api.gdc.cancer.gov/data/3845b8bd-cbe0-49cf-a418-a8120f6c23db

    TCGA file name:
    `TCGA-A1-A0SH-01Z-00-DX1.90E71B08-E1D9-4FC2-85AC-062E56DDF17C.svs`

    Access: open

    Returns
    -------
    breast_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of breast tissue with green pen marks.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/breast/TCGA-A1-A0SH-01Z-00-DX1.90E71B08-E1D9-4FC2-85AC-062E56DDF17C.svs"
    )
    return _load_svs(slide_relpath)
def breast_tissue_diagnostic_red_pen() -> Tuple[
    openslide.OpenSlide, str
]:  # pragma: no cover
    """breast_tissue_diagnostic_red_pen() -> Tuple[openslide.OpenSlide, str]

    Diagnostic slide of breast tissue with red pen marks, TCGA-BRCA dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/682e4d74-2200-4f34-9e96-8dee968b1568
    or through the API
    https://api.gdc.cancer.gov/data/682e4d74-2200-4f34-9e96-8dee968b1568

    TCGA file name:
    `TCGA-E9-A24A-01Z-00-DX1.F0342837-5750-4172-B60D-5F902E2A02FD.svs`

    Access: open

    Returns
    -------
    breast_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of breast tissue with red pen marks.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/breast/TCGA-E9-A24A-01Z-00-DX1.F0342837-5750-4172-B60D-5F902E2A02FD.svs"
    )
    return _load_svs(slide_relpath)
def breast_tissue_diagnostic_black_pen() -> Tuple[
    openslide.OpenSlide, str
]:  # pragma: no cover
    """breast_tissue_diagnostic_black_pen() -> Tuple[openslide.OpenSlide, str]

    Diagnostic slide of breast tissue with black pen marks, TCGA-BRCA dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/e70c89a5-1c2f-43f8-b6be-589beea55338
    or through the API
    https://api.gdc.cancer.gov/data/e70c89a5-1c2f-43f8-b6be-589beea55338

    TCGA file name:
    `TCGA-BH-A201-01Z-00-DX1.6D6E3224-50A0-45A2-B231-EEF27CA7EFD2.svs`

    Access: open

    Returns
    -------
    breast_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of breast tissue with black pen marks.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/breast/TCGA-BH-A201-01Z-00-DX1.6D6E3224-50A0-45A2-B231-EEF27CA7EFD2.svs"
    )
    return _load_svs(slide_relpath)
def cmu_small_region() -> Tuple[openslide.OpenSlide, str]:
    """cmu_small_region() -> Tuple[openslide.OpenSlide, str]

    Carnegie Mellon University MRXS sample tissue

    The image can be downloaded from
    http://openslide.cs.cmu.edu/download/openslide-testdata/Aperio/

    Licensed under a CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

    Returns
    -------
    cmu_mrxs_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of small tissue region.
    path : str
        Path where the slide is saved
    """
    slide_relpath = "data/cmu_small_region.svs"
    return _load_svs(slide_relpath)
def heart_tissue() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """heart_tissue() -> Tuple[openslide.OpenSlide, str]

    Sample slide of heart tissue (brightfield, JPEG 2000, YCbCr).

    The image can be downloaded from
    http://openslide.cs.cmu.edu/download/openslide-testdata/Aperio/

    It is free to use and distribute, with or without modification.

    Returns
    -------
    heart_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of heart tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = "aperio/JP2K-33003-2.svs"
    return _load_svs(slide_relpath)
def ihc_breast() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """ihc_breast() -> Tuple[openslide.OpenSlide, str]

    Breast cancer resection, staining CD3 (brown) and CD20 (red).

    The image can be found at https://idr.openmicroscopy.org/
    under accession number idr0073, ID `breastCancer12`.

    Returns
    -------
    ihc_breast : openslide.OpenSlide
        IHC-stained Whole-Slide-Image of Breast tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = "9798433/?format=tif"
    return _load_svs(slide_relpath)
def ihc_kidney() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """ihc_kidney() -> Tuple[openslide.OpenSlide, str]

    Renal allograft, staining CD3 (brown) and CD20 (red).

    The image can be found at https://idr.openmicroscopy.org/
    under accession number idr0073, ID `kidney_46_4`.

    Returns
    -------
    ihc_kidney : openslide.OpenSlide
        IHC-stained Whole-Slide-Image of kidney tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = "9798554/?format=tif"
    return _load_svs(slide_relpath)
def ovarian_tissue() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """ovarian_tissue() -> Tuple[openslide.OpenSlide, str]

    Sample slide of Ovarian Serous Cystadenocarcinoma tissue, TCGA-OV dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/e968375e-ef58-4607-b457-e6818b2e8431
    or through the API
    https://api.gdc.cancer.gov/data/e968375e-ef58-4607-b457-e6818b2e8431

    TCGA file name:
    `TCGA-13-1404-01A-01-TS1.cecf7044-1d29-4d14-b137-821f8d48881e.svs`

    Access: open

    Returns
    -------
    ovarian_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of ovarian tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/ovarian/TCGA-13-1404-01A-01-TS1.cecf7044-1d29-4d14-b137-821f8d48881e.svs"
    )
    return _load_svs(slide_relpath)
def prostate_tissue() -> Tuple[openslide.OpenSlide, str]:  # pragma: no cover
    """prostate_tissue() -> Tuple[openslide.OpenSlide, str]

    Sample slide of Prostate Adenocarcinoma tissue, TCGA-PRAD dataset.

    The image can be downloaded from
    https://portal.gdc.cancer.gov/files/5a8ce04a-0178-49e2-904c-30e21fb4e41e
    or through the API
    https://api.gdc.cancer.gov/data/5a8ce04a-0178-49e2-904c-30e21fb4e41e

    TCGA file name:
    `TCGA-CH-5753-01A-01-BS1.4311c533-f9c1-4c6f-8b10-922daa3c2e3e.svs`

    Access: open

    Returns
    -------
    prostate_tissue : openslide.OpenSlide
        H&E-stained Whole-Slide-Image of prostate tissue.
    path : str
        Path where the slide is saved
    """
    slide_relpath = (
        "tcga/prostate/TCGA-CH-5753-01A-01-BS1.4311c533-f9c1-4c6f-8b10-922daa3c2e3e.svs"
    )
    return _load_svs(slide_relpath)