Module earthvision.datasets.spacenet7
SpaceNet 7 Dataset: Multi-Temporal Urban Development Challenge - Instance Segmentation.
Expand source code
"""SpaceNet 7 Dataset: Multi-Temporal Urban Development Challenge - Instance Segmentation."""
from PIL import Image
import os
import shutil
import numpy as np
import pandas as pd
import multiprocessing
from typing import Any, Callable, Optional, Tuple
from .utils import downloader, _load_img
from .vision import VisionDataset
from .spacenet7_utils import map_wrapper, make_geojsons_and_masks
class SpaceNet7(VisionDataset):
"""SpaceNet7 (SN7): Multi-Temporal Urban Development Challenge
<https://spacenet.ai/sn7-challenge/>
Args:
root (string): Root directory of dataset.
train (bool, optional): If True, creates dataset from training set, otherwise
creates from test set.
transform (callable, optional): A function/transform that takes in an PIL image and
returns a transformed version. E.g, transforms.RandomCrop
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If true, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
"""
resources = {
"train": "s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz",
"test": "s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz",
}
def __init__(
self,
root: str,
train: bool = True,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
download: bool = False,
) -> None:
super(SpaceNet7, self).__init__(
root, transform=transform, target_transform=target_transform
)
self.root = root
self.data_mode = "train" if train else "test"
self.filename = self.resources.get(self.data_mode, "NULL").split("/")[-1]
self.dataset_path = os.path.join(root, self.filename)
data_mode_folder = {"train": "train", "test": "test_public"}
self.folder_name = data_mode_folder.get(self.data_mode, "NULL")
if not os.path.exists(self.root):
os.makedirs(self.root)
if download and self._check_exists(self.dataset_path):
print("file already exists.")
if download and not self._check_exists(os.path.join(self.root, self.folder_name)):
self.download()
self.extract_file()
if self.data_mode == "train":
aois = sorted(
[
f
for f in os.listdir(os.path.join(self.root, "train"))
if os.path.isdir(os.path.join(self.root, "train", f))
]
)
aois_without_mask = []
for aoi in aois:
mask_dir = os.path.join(self.root, "train", aoi, "masks/")
if not self._check_exists(mask_dir):
aois_without_mask.append(aoi)
if aois_without_mask:
print("Generating masks...")
self.generate_mask(aois_without_mask)
self.img_labels = self.get_path_and_label()
def _check_exists(self, obj) -> bool:
if os.path.exists(obj):
return True
else:
return False
def download(self):
"""Download dataset and extract it"""
if self.data_mode not in self.resources.keys():
raise ValueError("Unrecognized data_mode")
downloader(self.resources[self.data_mode], self.root)
def extract_file(self):
shutil.unpack_archive(self.dataset_path, self.root)
def generate_mask(self, aois):
"""
Create Training Masks
Multi-thread to increase speed
We'll only make a 1-channel mask for now, but Solaris supports a multi-channel mask as well, see
https://github.com/CosmiQ/solaris/blob/master/docs/tutorials/notebooks/api_masks_tutorial.ipynb
"""
make_fbc = False
input_args = []
for i, aoi in enumerate(aois):
print(i, "aoi:", aoi)
im_dir = os.path.join(self.root, "train", aoi, "images_masked/")
json_dir = os.path.join(self.root, "train", aoi, "labels_match/")
out_dir_mask = os.path.join(self.root, "train", aoi, "masks/")
out_dir_mask_fbc = os.path.join(self.root, "train", aoi, "masks_fbc/")
os.makedirs(out_dir_mask, exist_ok=True)
if make_fbc:
os.makedirs(out_dir_mask_fbc, exist_ok=True)
json_files = sorted(
[
f
for f in os.listdir(os.path.join(json_dir))
if f.endswith("Buildings.geojson") and os.path.exists(os.path.join(json_dir, f))
]
)
for j, f in enumerate(json_files):
# print(i, j, f)
name_root = f.split(".")[0]
json_path = os.path.join(json_dir, f)
image_path = (
os.path.join(im_dir, name_root + ".tif")
.replace("labels", "images")
.replace("_Buildings", "")
)
output_path_mask = os.path.join(out_dir_mask, name_root + ".tif")
if make_fbc:
output_path_mask_fbc = os.path.join(out_dir_mask_fbc, name_root + ".tif")
else:
output_path_mask_fbc = None
if os.path.exists(output_path_mask):
continue
else:
input_args.append(
[
make_geojsons_and_masks,
name_root,
image_path,
json_path,
output_path_mask,
output_path_mask_fbc,
]
)
p = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
out = p.map(map_wrapper, input_args)
p.close()
p.join()
def get_path_and_label(self):
"""Return dataframe type consist of image path and corresponding label (for train data),
or image path only (for test data)."""
pops = ["train", "test_public"]
for pop in pops:
d = os.path.join(self.root, pop)
im_list, mask_list = [], []
subdirs = sorted([f for f in os.listdir(d) if os.path.isdir(os.path.join(d, f))])
for subdir in subdirs:
if pop == "train":
im_files = [
os.path.join(d, subdir, "images_masked", f)
for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked")))
if f.endswith(".tif")
and os.path.exists(
os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif")
)
]
mask_files = [
os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif")
for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked")))
if f.endswith(".tif")
and os.path.exists(
os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif")
)
]
im_list.extend(im_files)
mask_list.extend(mask_files)
elif pop == "test_public":
im_files = [
os.path.join(d, subdir, "images_masked", f)
for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked")))
if f.endswith(".tif")
]
im_list.extend(im_files)
if self.data_mode == "train":
df = pd.DataFrame({"image": im_list, "label": mask_list})
elif self.data_mode == "test":
df = pd.DataFrame({"image": im_list})
return df
def __getitem__(self, idx: int) -> Tuple[Any, Any]:
"""
Args:
idx (int): Index
Returns:
tuple: (img, mask) or (img)
"""
img_path = self.img_labels.iloc[idx, 0]
img = np.array(_load_img(img_path))
if self.transform is not None:
img = Image.fromarray(img)
img = self.transform(img)
if self.data_mode == "train":
mask_path = self.img_labels.iloc[idx, 1]
mask = np.array(_load_img(mask_path))
if self.target_transform is not None:
mask = Image.fromarray(mask)
mask = self.target_transform(mask)
sample = (img, mask)
elif self.data_mode == "test":
sample = img
return sample
def __len__(self) -> int:
return len(self.img_labels)
Classes
class SpaceNet7 (root: str, train: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False)
-
SpaceNet7 (SN7): Multi-Temporal Urban Development Challenge
https://spacenet.ai/sn7-challenge/
Args
root
:string
- Root directory of dataset.
train
:bool
, optional- If True, creates dataset from training set, otherwise creates from test set.
transform
:callable
, optional- A function/transform that takes in an PIL image and returns a transformed version. E.g, transforms.RandomCrop
target_transform
:callable
, optional- A function/transform that takes in the target and transforms it.
download
:bool
, optional- If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again.
Expand source code
class SpaceNet7(VisionDataset): """SpaceNet7 (SN7): Multi-Temporal Urban Development Challenge <https://spacenet.ai/sn7-challenge/> Args: root (string): Root directory of dataset. train (bool, optional): If True, creates dataset from training set, otherwise creates from test set. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, transforms.RandomCrop target_transform (callable, optional): A function/transform that takes in the target and transforms it. download (bool, optional): If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again. """ resources = { "train": "s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz", "test": "s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz", } def __init__( self, root: str, train: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, ) -> None: super(SpaceNet7, self).__init__( root, transform=transform, target_transform=target_transform ) self.root = root self.data_mode = "train" if train else "test" self.filename = self.resources.get(self.data_mode, "NULL").split("/")[-1] self.dataset_path = os.path.join(root, self.filename) data_mode_folder = {"train": "train", "test": "test_public"} self.folder_name = data_mode_folder.get(self.data_mode, "NULL") if not os.path.exists(self.root): os.makedirs(self.root) if download and self._check_exists(self.dataset_path): print("file already exists.") if download and not self._check_exists(os.path.join(self.root, self.folder_name)): self.download() self.extract_file() if self.data_mode == "train": aois = sorted( [ f for f in os.listdir(os.path.join(self.root, "train")) if os.path.isdir(os.path.join(self.root, "train", f)) ] ) aois_without_mask = [] for aoi in aois: mask_dir = os.path.join(self.root, "train", aoi, "masks/") if not self._check_exists(mask_dir): aois_without_mask.append(aoi) if aois_without_mask: print("Generating masks...") self.generate_mask(aois_without_mask) self.img_labels = self.get_path_and_label() def _check_exists(self, obj) -> bool: if os.path.exists(obj): return True else: return False def download(self): """Download dataset and extract it""" if self.data_mode not in self.resources.keys(): raise ValueError("Unrecognized data_mode") downloader(self.resources[self.data_mode], self.root) def extract_file(self): shutil.unpack_archive(self.dataset_path, self.root) def generate_mask(self, aois): """ Create Training Masks Multi-thread to increase speed We'll only make a 1-channel mask for now, but Solaris supports a multi-channel mask as well, see https://github.com/CosmiQ/solaris/blob/master/docs/tutorials/notebooks/api_masks_tutorial.ipynb """ make_fbc = False input_args = [] for i, aoi in enumerate(aois): print(i, "aoi:", aoi) im_dir = os.path.join(self.root, "train", aoi, "images_masked/") json_dir = os.path.join(self.root, "train", aoi, "labels_match/") out_dir_mask = os.path.join(self.root, "train", aoi, "masks/") out_dir_mask_fbc = os.path.join(self.root, "train", aoi, "masks_fbc/") os.makedirs(out_dir_mask, exist_ok=True) if make_fbc: os.makedirs(out_dir_mask_fbc, exist_ok=True) json_files = sorted( [ f for f in os.listdir(os.path.join(json_dir)) if f.endswith("Buildings.geojson") and os.path.exists(os.path.join(json_dir, f)) ] ) for j, f in enumerate(json_files): # print(i, j, f) name_root = f.split(".")[0] json_path = os.path.join(json_dir, f) image_path = ( os.path.join(im_dir, name_root + ".tif") .replace("labels", "images") .replace("_Buildings", "") ) output_path_mask = os.path.join(out_dir_mask, name_root + ".tif") if make_fbc: output_path_mask_fbc = os.path.join(out_dir_mask_fbc, name_root + ".tif") else: output_path_mask_fbc = None if os.path.exists(output_path_mask): continue else: input_args.append( [ make_geojsons_and_masks, name_root, image_path, json_path, output_path_mask, output_path_mask_fbc, ] ) p = multiprocessing.Pool(multiprocessing.cpu_count() - 1) out = p.map(map_wrapper, input_args) p.close() p.join() def get_path_and_label(self): """Return dataframe type consist of image path and corresponding label (for train data), or image path only (for test data).""" pops = ["train", "test_public"] for pop in pops: d = os.path.join(self.root, pop) im_list, mask_list = [], [] subdirs = sorted([f for f in os.listdir(d) if os.path.isdir(os.path.join(d, f))]) for subdir in subdirs: if pop == "train": im_files = [ os.path.join(d, subdir, "images_masked", f) for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") and os.path.exists( os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") ) ] mask_files = [ os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") and os.path.exists( os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") ) ] im_list.extend(im_files) mask_list.extend(mask_files) elif pop == "test_public": im_files = [ os.path.join(d, subdir, "images_masked", f) for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") ] im_list.extend(im_files) if self.data_mode == "train": df = pd.DataFrame({"image": im_list, "label": mask_list}) elif self.data_mode == "test": df = pd.DataFrame({"image": im_list}) return df def __getitem__(self, idx: int) -> Tuple[Any, Any]: """ Args: idx (int): Index Returns: tuple: (img, mask) or (img) """ img_path = self.img_labels.iloc[idx, 0] img = np.array(_load_img(img_path)) if self.transform is not None: img = Image.fromarray(img) img = self.transform(img) if self.data_mode == "train": mask_path = self.img_labels.iloc[idx, 1] mask = np.array(_load_img(mask_path)) if self.target_transform is not None: mask = Image.fromarray(mask) mask = self.target_transform(mask) sample = (img, mask) elif self.data_mode == "test": sample = img return sample def __len__(self) -> int: return len(self.img_labels)
Ancestors
- VisionDataset
- torch.utils.data.dataset.Dataset
- typing.Generic
Class variables
var functions : Dict[str, Callable]
var resources
Methods
def download(self)
-
Download dataset and extract it
Expand source code
def download(self): """Download dataset and extract it""" if self.data_mode not in self.resources.keys(): raise ValueError("Unrecognized data_mode") downloader(self.resources[self.data_mode], self.root)
def extract_file(self)
-
Expand source code
def extract_file(self): shutil.unpack_archive(self.dataset_path, self.root)
def generate_mask(self, aois)
-
Create Training Masks Multi-thread to increase speed We'll only make a 1-channel mask for now, but Solaris supports a multi-channel mask as well, see https://github.com/CosmiQ/solaris/blob/master/docs/tutorials/notebooks/api_masks_tutorial.ipynb
Expand source code
def generate_mask(self, aois): """ Create Training Masks Multi-thread to increase speed We'll only make a 1-channel mask for now, but Solaris supports a multi-channel mask as well, see https://github.com/CosmiQ/solaris/blob/master/docs/tutorials/notebooks/api_masks_tutorial.ipynb """ make_fbc = False input_args = [] for i, aoi in enumerate(aois): print(i, "aoi:", aoi) im_dir = os.path.join(self.root, "train", aoi, "images_masked/") json_dir = os.path.join(self.root, "train", aoi, "labels_match/") out_dir_mask = os.path.join(self.root, "train", aoi, "masks/") out_dir_mask_fbc = os.path.join(self.root, "train", aoi, "masks_fbc/") os.makedirs(out_dir_mask, exist_ok=True) if make_fbc: os.makedirs(out_dir_mask_fbc, exist_ok=True) json_files = sorted( [ f for f in os.listdir(os.path.join(json_dir)) if f.endswith("Buildings.geojson") and os.path.exists(os.path.join(json_dir, f)) ] ) for j, f in enumerate(json_files): # print(i, j, f) name_root = f.split(".")[0] json_path = os.path.join(json_dir, f) image_path = ( os.path.join(im_dir, name_root + ".tif") .replace("labels", "images") .replace("_Buildings", "") ) output_path_mask = os.path.join(out_dir_mask, name_root + ".tif") if make_fbc: output_path_mask_fbc = os.path.join(out_dir_mask_fbc, name_root + ".tif") else: output_path_mask_fbc = None if os.path.exists(output_path_mask): continue else: input_args.append( [ make_geojsons_and_masks, name_root, image_path, json_path, output_path_mask, output_path_mask_fbc, ] ) p = multiprocessing.Pool(multiprocessing.cpu_count() - 1) out = p.map(map_wrapper, input_args) p.close() p.join()
def get_path_and_label(self)
-
Return dataframe type consist of image path and corresponding label (for train data), or image path only (for test data).
Expand source code
def get_path_and_label(self): """Return dataframe type consist of image path and corresponding label (for train data), or image path only (for test data).""" pops = ["train", "test_public"] for pop in pops: d = os.path.join(self.root, pop) im_list, mask_list = [], [] subdirs = sorted([f for f in os.listdir(d) if os.path.isdir(os.path.join(d, f))]) for subdir in subdirs: if pop == "train": im_files = [ os.path.join(d, subdir, "images_masked", f) for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") and os.path.exists( os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") ) ] mask_files = [ os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") and os.path.exists( os.path.join(d, subdir, "masks", f.split(".")[0] + "_Buildings.tif") ) ] im_list.extend(im_files) mask_list.extend(mask_files) elif pop == "test_public": im_files = [ os.path.join(d, subdir, "images_masked", f) for f in sorted(os.listdir(os.path.join(d, subdir, "images_masked"))) if f.endswith(".tif") ] im_list.extend(im_files) if self.data_mode == "train": df = pd.DataFrame({"image": im_list, "label": mask_list}) elif self.data_mode == "test": df = pd.DataFrame({"image": im_list}) return df