Module `earthvision.datasets.l7irish`

Landsat 7 Irish Cloud Dataset.

Expand source code

"""Landsat 7 Irish Cloud Dataset."""
from PIL import Image
import os
import shutil
import posixpath
import numpy as np
import pandas as pd
import glob
import requests

from typing import Any, Callable, Optional, Tuple
from .vision import VisionDataset
from .utils import _urlretrieve, _load_img
from bs4 import BeautifulSoup


class L7Irish(VisionDataset):
    """Landsat 7 Irish Cloud.

    <https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data>

    Constructing the dataset always requests the page above to discover the
    archive names, regardless of the value of ``download``.

    Args:
        root (string): Root directory of dataset.
        transform (callable, optional): A function/transform that takes in a PIL image and
            returns a transformed version. E.g., transforms.RandomCrop
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
    """

    # Base URL that each archive file name is appended to when downloading.
    mirrors = "http://landsat.usgs.gov/cloud-validation/cca_irish_2015/"

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:

        super(L7Irish, self).__init__(root, transform=transform, target_transform=target_transform)

        self.root = root
        self.download_urls = self.get_download_url()
        # Archive file names are the last path segment of each URL; the same
        # names without ".tar.gz" are used as sub-directory names under root.
        self.resources = [url.split("/")[-1] for url in self.download_urls]
        self.data_modes = [filename.split(".tar.gz")[0] for filename in self.resources]

        if download:
            # Evaluate the check once instead of once per branch.
            if self._check_exists():
                print("file already exists.")
            else:
                self.download()
                self.extract_file()

        self.img_labels = self.get_path_and_label()

    def get_download_url(self):
        """Get the urls to download the files.

        Returns:
            list: The ``href`` value of every anchor on the dataset page that
            ends with ".gz".
        """
        page = requests.get(
            "https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data"
        )
        soup = BeautifulSoup(page.content, "html.parser")

        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        # Return a real list: a lazy ``filter`` object can be iterated only
        # once, which would leave ``self.download_urls`` empty after
        # ``__init__`` derives the resource names from it.
        return [href for href in hrefs if href and href.endswith(".gz")]

    def download(self):
        """Download every archive in ``self.resources`` into the root directory."""
        for resource in self.resources:
            file_url = posixpath.join(self.mirrors, resource)
            _urlretrieve(file_url, os.path.join(self.root, resource))

    def extract_file(self):
        """Unpack each downloaded archive into the root directory, then delete the archive."""
        for resource in self.resources:
            archive_path = os.path.join(self.root, resource)
            shutil.unpack_archive(archive_path, self.root)
            os.remove(archive_path)

    def _check_exists(self):
        """Report whether every expected data directory exists under root.

        The root directory is created if it is missing.

        Returns:
            bool: True when ``root/<data_mode>`` exists for every data mode.
        """
        # makedirs also creates missing parent directories, which os.mkdir
        # would reject with FileNotFoundError.
        os.makedirs(self.root, exist_ok=True)

        return all(
            os.path.exists(os.path.join(self.root, data_mode)) for data_mode in self.data_modes
        )

    def get_path_and_label(self):
        """Get the path of the images and labels (masks) in a dataframe.

        Returns:
            pandas.DataFrame: Columns ``image`` and ``label`` holding file paths.
        """
        image_path, label = [], []

        for data_mode in self.data_modes:
            for image in glob.glob(os.path.join(self.root, data_mode, "L7*.TIF")):
                image_path.append(image)

                # Every "*mask*" match in the directory is appended for each
                # image. NOTE(review): the two columns only end up the same
                # length when a directory holds exactly one mask - confirm
                # against the extracted layout.
                label.extend(glob.glob(os.path.join(self.root, data_mode, "*mask*")))

        df = pd.DataFrame({"image": image_path, "label": label})
        return df

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """
        Args:
            idx (int): Index
        Returns:
            tuple: (img, mask). Each element is a numpy array unless the
            corresponding transform is set, in which case it is whatever that
            transform returns.
        """
        img_path = self.img_labels.iloc[idx, 0]
        mask_path = self.img_labels.iloc[idx, 1]

        img = np.array(_load_img(img_path))
        mask = np.array(_load_img(mask_path))

        # Transforms receive PIL images, so convert only when one is configured.
        if self.transform is not None:
            img = Image.fromarray(img)
            img = self.transform(img)

        if self.target_transform is not None:
            mask = Image.fromarray(mask)
            mask = self.target_transform(mask)
        return img, mask

    def __len__(self) -> int:
        """Return the number of rows in the image/label dataframe."""
        return len(self.img_labels)

Classes

class L7Irish (root: str, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False)

Landsat 7 Irish Cloud.

https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data

Args

root : string: Root directory of dataset.
transform : callable, optional: A function/transform that takes in a PIL image and returns a transformed version. E.g., transforms.RandomCrop
target_transform : callable, optional: A function/transform that takes in the target and transforms it.
download : bool, optional: If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again.

Expand source code

class L7Irish(VisionDataset):
    """Landsat 7 Irish Cloud.

    <https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data>

    Constructing the dataset always requests the page above to discover the
    archive names, regardless of the value of ``download``.

    Args:
        root (string): Root directory of dataset.
        transform (callable, optional): A function/transform that takes in a PIL image and
            returns a transformed version. E.g., transforms.RandomCrop
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
    """

    # Base URL that each archive file name is appended to when downloading.
    mirrors = "http://landsat.usgs.gov/cloud-validation/cca_irish_2015/"

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:

        super(L7Irish, self).__init__(root, transform=transform, target_transform=target_transform)

        self.root = root
        self.download_urls = self.get_download_url()
        # Archive file names are the last path segment of each URL; the same
        # names without ".tar.gz" are used as sub-directory names under root.
        self.resources = [url.split("/")[-1] for url in self.download_urls]
        self.data_modes = [filename.split(".tar.gz")[0] for filename in self.resources]

        if download:
            # Evaluate the check once instead of once per branch.
            if self._check_exists():
                print("file already exists.")
            else:
                self.download()
                self.extract_file()

        self.img_labels = self.get_path_and_label()

    def get_download_url(self):
        """Get the urls to download the files.

        Returns:
            list: The ``href`` value of every anchor on the dataset page that
            ends with ".gz".
        """
        page = requests.get(
            "https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data"
        )
        soup = BeautifulSoup(page.content, "html.parser")

        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        # Return a real list: a lazy ``filter`` object can be iterated only
        # once, which would leave ``self.download_urls`` empty after
        # ``__init__`` derives the resource names from it.
        return [href for href in hrefs if href and href.endswith(".gz")]

    def download(self):
        """Download every archive in ``self.resources`` into the root directory."""
        for resource in self.resources:
            file_url = posixpath.join(self.mirrors, resource)
            _urlretrieve(file_url, os.path.join(self.root, resource))

    def extract_file(self):
        """Unpack each downloaded archive into the root directory, then delete the archive."""
        for resource in self.resources:
            archive_path = os.path.join(self.root, resource)
            shutil.unpack_archive(archive_path, self.root)
            os.remove(archive_path)

    def _check_exists(self):
        """Report whether every expected data directory exists under root.

        The root directory is created if it is missing.

        Returns:
            bool: True when ``root/<data_mode>`` exists for every data mode.
        """
        # makedirs also creates missing parent directories, which os.mkdir
        # would reject with FileNotFoundError.
        os.makedirs(self.root, exist_ok=True)

        return all(
            os.path.exists(os.path.join(self.root, data_mode)) for data_mode in self.data_modes
        )

    def get_path_and_label(self):
        """Get the path of the images and labels (masks) in a dataframe.

        Returns:
            pandas.DataFrame: Columns ``image`` and ``label`` holding file paths.
        """
        image_path, label = [], []

        for data_mode in self.data_modes:
            for image in glob.glob(os.path.join(self.root, data_mode, "L7*.TIF")):
                image_path.append(image)

                # Every "*mask*" match in the directory is appended for each
                # image. NOTE(review): the two columns only end up the same
                # length when a directory holds exactly one mask - confirm
                # against the extracted layout.
                label.extend(glob.glob(os.path.join(self.root, data_mode, "*mask*")))

        df = pd.DataFrame({"image": image_path, "label": label})
        return df

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """
        Args:
            idx (int): Index
        Returns:
            tuple: (img, mask). Each element is a numpy array unless the
            corresponding transform is set, in which case it is whatever that
            transform returns.
        """
        img_path = self.img_labels.iloc[idx, 0]
        mask_path = self.img_labels.iloc[idx, 1]

        img = np.array(_load_img(img_path))
        mask = np.array(_load_img(mask_path))

        # Transforms receive PIL images, so convert only when one is configured.
        if self.transform is not None:
            img = Image.fromarray(img)
            img = self.transform(img)

        if self.target_transform is not None:
            mask = Image.fromarray(mask)
            mask = self.target_transform(mask)
        return img, mask

    def __len__(self) -> int:
        """Return the number of rows in the image/label dataframe."""
        return len(self.img_labels)

Ancestors

VisionDataset
torch.utils.data.dataset.Dataset
typing.Generic

Class variables

var functions : Dict[str, Callable]
var mirrors

Methods

def download(self)

Download file

Expand source code

def download(self):
    """Download every archive named in ``self.resources`` into the root directory."""
    for archive_name in self.resources:
        # The remote location is the mirror base URL plus the archive name;
        # the local copy keeps the same file name under root.
        destination = os.path.join(self.root, archive_name)
        source_url = posixpath.join(self.mirrors, archive_name)
        _urlretrieve(source_url, destination)

def extract_file(self)

Extract the downloaded .tar.gz archives

Expand source code

def extract_file(self):
    """Unpack each downloaded archive into the root directory, then delete the archive."""
    for archive_name in self.resources:
        archive_path = os.path.join(self.root, archive_name)
        # The archive format is inferred by shutil from the file name.
        shutil.unpack_archive(archive_path, self.root)
        os.remove(archive_path)

def get_download_url(self)

Get the urls to download the files.

Expand source code

def get_download_url(self):
    """Get the urls to download the files.

    Returns:
        list: The ``href`` value of every anchor on the dataset page that
        ends with ".gz".
    """
    page = requests.get(
        "https://landsat.usgs.gov/landsat-7-cloud-cover-assessment-validation-data"
    )
    soup = BeautifulSoup(page.content, "html.parser")

    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Return a real list: a lazy ``filter`` object can be iterated only once,
    # so a caller that stores the result and reads it a second time would
    # find it empty.
    return [href for href in hrefs if href and href.endswith(".gz")]

def get_path_and_label(self)

Get the path of the images and labels (masks) in a dataframe

Expand source code

def get_path_and_label(self):
    """Collect image paths and mask paths into a two-column dataframe.

    Returns:
        pandas.DataFrame: Columns ``image`` and ``label`` holding file paths.
    """
    images = []
    masks = []

    for mode in self.data_modes:
        image_pattern = os.path.join(self.root, mode, "L7*.TIF")
        for image_file in glob.glob(image_pattern):
            images.append(image_file)
            # Each image contributes every "*mask*" match from its directory.
            mask_pattern = os.path.join(self.root, mode, "*mask*")
            masks.extend(glob.glob(mask_pattern))

    return pd.DataFrame({"image": images, "label": masks})