Data module

The data module prepares data for training. It relies heavily on the FiftyOne package and its integrations.

Submodules

  1. annotations - send data for annotation in CVAT and fetch the results.
  2. brain - commands for the fiftyone.brain module.
  3. display - print dataset-related stats to the console.
  4. export - export datasets to formats that are missing from the original fiftyone CLI.
  5. tag - tag dataset samples that meet certain criteria.
  6. transforms - perform changes on datasets.
  7. zoo - perform operations with the fiftyone.zoo module.

finegrained.data.annotations

Send samples for annotation, query annotation runs, and fetch the results.

annotate(dataset, annotation_key, label_field, backend, overwrite=False, label_type=None, project_id=None, segment_size=10, task_name=None, image_quality=75, task_asignee=None, organization=None, classes=None, **kwargs)

Send samples for annotation.

Parameters:

dataset (str, required): fiftyone dataset with samples
annotation_key (str, required): assign this key for annotation run
label_field (str, required): if exists, upload labels
label_type (Optional[str], default None): if label_field does not exist, this has to be specified
backend (Any, required): backend name or filepath to configs
overwrite (bool, default False): overwrite existing annotation run if True
classes (Optional[str], default None): list of classes or path to labels.txt file
image_quality (int, default 75): image upload quality
task_name (Optional[str], default None): custom task name, by default dataset name + annotation key
segment_size (int, default 10): number of frames/images per one job
project_id (Optional[int], default None): which cvat project to connect to
task_asignee (Optional[str], default None): assignee for the task
**kwargs: dataset loading filters

Source code in finegrained/data/annotations.py
def annotate(
    dataset: str,
    annotation_key: str,
    label_field: str,
    backend: Any,
    overwrite: bool = False,
    label_type: Optional[str] = None,
    project_id: Optional[int] = None,
    segment_size: int = 10,
    task_name: Optional[str] = None,
    image_quality: int = 75,
    task_asignee: Optional[str] = None,
    organization: Optional[str] = None,
    classes: Optional[str] = None,
    **kwargs,
):
    """Send samples to annotations

    Args:
        dataset: fiftyone dataset with samples
        annotation_key: assign this key for annotation run
        label_field: if exists, upload labels
        label_type: if label_field does not exist, this has to be specified
        backend: backend name or filepath to configs
        overwrite: overwrite existing annotation run if True
        classes: list of classes or path to labels.txt file
        image_quality: image upload quality
        task_name: custom task name, by default dataset name + annotation key
        segment_size: number of frames/images per one job
        project_id: which cvat project to connect to
        task_asignee: assignee for the task
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    backend_conf = _load_backend_config(backend)
    if dataset.has_annotation_run(annotation_key) and overwrite:
        dataset.delete_annotation_run(annotation_key)
    if not dataset.has_sample_field(label_field) and label_type is None:
        raise ValueError(
            f"{label_field=} does not exist in {dataset.name}. Specify 'label_type'"
        )
    if classes:
        classes = _parse_classes(classes)
    if task_name is None:
        task_name = dataset.name + " - " + annotation_key
    dataset.annotate(
        annotation_key,
        label_field=label_field,
        label_type=label_type,
        project_id=project_id,
        segment_size=segment_size,
        task_name=task_name,
        image_quality=image_quality,
        classes=classes,
        task_asignee=task_asignee,
        organization=organization,
        **backend_conf,
    )
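
For reference, a minimal call might look like the sketch below; the dataset name, label field, annotation key, and backend config path are all hypothetical and assume a CVAT backend described in a local yaml file.

from finegrained.data.annotations import annotate

# hypothetical names and paths; adjust to your own dataset and backend config
annotate(
    dataset="my_dataset",
    annotation_key="review_round_1",
    label_field="ground_truth",
    backend="cvat_backend.yaml",
    segment_size=25,
)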

delete_key(dataset, key)

Delete an annotation key.

Parameters:

dataset (str, required): fiftyone dataset name
key (str, required): annotation key

Returns:

none

Source code in finegrained/data/annotations.py
def delete_key(dataset: str, key: str):
    """Delete an annotation key.

    Args:
        dataset: fiftyone dataset name
        key: annotation key

    Returns:
        none
    """
    dataset = load_fiftyone_dataset(dataset)
    dataset.delete_annotation_run(key)

list_keys(dataset)

List annotation keys attributed to the dataset

Parameters:

dataset (str, required): fiftyone dataset name

Returns:

types.LIST_STR: a list of keys

Source code in finegrained/data/annotations.py
def list_keys(dataset: str) -> types.LIST_STR:
    """List annotation keys attributed to the dataset

    Args:
        dataset: fiftyone dataset name

    Returns:
        a list of keys
    """
    dataset = load_fiftyone_dataset(dataset)
    keys = dataset.list_annotation_runs()
    return keys

load(dataset, annotation_key, backend, dest_field=None, dataset_kwargs=None)

Download annotations from an annotation backend.

Parameters:

dataset (str, required): fiftyone dataset name
annotation_key (str, required): annotation key used to send for annotations
backend (Any, required): annotation backend name or filepath with configs
dest_field (str, default None): if given, annotations will be stored in a new field
dataset_kwargs (Optional[dict], default None): dataset loading filters

Returns:

none

Source code in finegrained/data/annotations.py
def load(
    dataset: str,
    annotation_key: str,
    backend: Any,
    dest_field: str = None,
    dataset_kwargs: Optional[dict] = None,
):
    """Download annotations from an annotation backend.

    Args:
        dataset: fiftyone dataset name
        annotation_key: annotation key used to send for annotations
        backend: annotation backend name or filepath with configs
        dest_field: if given, annotations will be stored in a new field
        dataset_kwargs: dataset loading filters

    Returns:
        none
    """
    backend_conf = _load_backend_config(backend)
    backend_conf.pop("backend")
    dataset = load_fiftyone_dataset(
        dataset, **dataset_kwargs if bool(dataset_kwargs) else {}
    )
    dataset.load_annotations(annotation_key, dest_field=dest_field, **backend_conf)
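
Once annotation is complete, the results can be pulled back into the dataset. A minimal sketch, reusing the hypothetical names from the annotate example above:

from finegrained.data.annotations import load

# fetch completed annotations into a new field (names are hypothetical)
load(
    dataset="my_dataset",
    annotation_key="review_round_1",
    backend="cvat_backend.yaml",
    dest_field="ground_truth_v2",
)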

finegrained.data.brain

Run fiftyone.brain operations on a dataset.

compute_hardness(dataset, predictions, **kwargs)

Estimate how difficult each sample is to predict.

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): field with predictions
**kwargs: dataset filters

Returns:

None

Source code in finegrained/data/brain.py
def compute_hardness(dataset: str, predictions: str, **kwargs):
    """Estimate how difficult is this sample to predict.

    Args:
        dataset: fiftyone dataset name
        predictions: field with predictions
        **kwargs: dataset filters

    Returns:
        None
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    fob.compute_hardness(dataset, predictions)

compute_mistakenness(dataset, predictions, gt_field='ground_truth', **kwargs)

Estimate a probability that a ground truth label is wrong

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): a field that contains model predictions
gt_field (str, default 'ground_truth'): a field that contains ground truth data
**kwargs: dataset loading filters

Returns:

none

Source code in finegrained/data/brain.py
def compute_mistakenness(
    dataset: str, predictions: str, gt_field: str = "ground_truth", **kwargs
):
    """Estimate a probability that a ground truth label is wrong

    Args:
        dataset: fiftyone dataset name
        predictions: a field that contains model predictions
        gt_field: a field that contains ground truth data
        **kwargs: dataset loading filters

    Returns:
        none
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    fob.compute_mistakenness(dataset, predictions, label_field=gt_field)
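
Both brain helpers expect a dataset that already contains model predictions. A minimal sketch with hypothetical dataset and field names:

from finegrained.data.brain import compute_hardness, compute_mistakenness

# hypothetical dataset and field names
compute_hardness("my_dataset", predictions="predictions")
compute_mistakenness("my_dataset", predictions="predictions", gt_field="ground_truth")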

finegrained.data.display

Display various data about datasets.

compute_area(dataset, field='area', average_size=False, overwrite_metadata=False, overwrite=False, **kwargs)

Calculate area of an image based on metadata

Parameters:

dataset (str, required): fiftyone dataset name
field (str, default 'area'): field where to assign area values
average_size (bool, default False): if True, calculate (width + height)/2 instead
overwrite_metadata (bool, default False): whether to overwrite metadata
overwrite (bool, default False): delete field if already exists
**kwargs: dataset loading filters

Returns:

tuple[int, int]: area bounds

Source code in finegrained/data/display.py
def compute_area(
    dataset: str,
    field: str = "area",
    average_size: bool = False,
    overwrite_metadata: bool = False,
    overwrite: bool = False,
    **kwargs,
) -> tuple[int, int]:
    """Calculate area of an image based on metadata

    Args:
        dataset: fiftyone dataset name
        field: field where to assign area values
        average_size: if True, calculate (width + height)/2 instead
        overwrite_metadata: whether to overwrite metadata
        overwrite: delete field if already exists
        **kwargs: dataset loading filters

    Returns:
        area bounds
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    if dataset.has_sample_field(field):
        if overwrite:
            delete_field(dataset.name, field)
        else:
            raise ValueError(f"{field=} already exists.")

    dataset.compute_metadata(overwrite=overwrite_metadata)
    for smp in dataset.select_fields("metadata"):
        val = (
            (smp.metadata.width + smp.metadata.height) / 2
            if average_size
            else smp.metadata.width * smp.metadata.height
        )
        smp[field] = val
        smp.save()

    return dataset.bounds(field)

eval_report(dataset, predictions, gt_field='ground_truth', cmat=False, eval_kwargs={}, **kwargs)

Print evaluation report: compare prediction field against ground_truth field.

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): a field with predictions
gt_field (str, default 'ground_truth'): a field with ground truth labels
cmat (bool, default False): if True, plot a confusion matrix
eval_kwargs (dict, default {}): if passed, these params will be passed to the evaluation function
**kwargs: dataset loading filters

Source code in finegrained/data/display.py
def eval_report(
    dataset: str,
    predictions: str,
    gt_field: str = "ground_truth",
    cmat: bool = False,
    eval_kwargs: dict = {},
    **kwargs,
):
    """Print evaluation report: compare prediction field against ground_truth field.

    Args:
        dataset: fiftyone dataset name
        predictions: a field with predictions
        gt_field: a field with ground truth labels
        cmat: if True, plot a confusion matrix
        eval_kwargs: if passed, these params will be passed to the evaluation function
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, gt_field)
    label_type = dataset.get_field(gt_field).document_type

    eval_fn = None
    match label_type:
        case fo.Classification:
            eval_fn = dataset.evaluate_classifications
        case fo.Detections | fo.Keypoints | fo.Polylines:
            eval_fn = dataset.evaluate_detections
        case fo.Segmentation:
            eval_fn = dataset.evaluate_segmentations
    if not eval_fn:
        raise NotImplementedError(f"evaluation for {label_type=} not implemented.")

    results = eval_fn(predictions, gt_field=gt_field, classes=labels, **eval_kwargs)
    results.print_report()
    if hasattr(results, "mAP") and (mAP := results.mAP()):
        iou = eval_kwargs.get("iou", 0.5)
        print(f"> {mAP=:.4f} @ {iou=}")
    if cmat:
        cm = results.plot_confusion_matrix(backend="matplotlib")
        cm.show()
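
For example, comparing a predictions field against ground truth and plotting a confusion matrix could look like the sketch below (dataset and field names are hypothetical); the iou value is forwarded to the evaluation function via eval_kwargs.

from finegrained.data.display import eval_report

# hypothetical detection dataset with both prediction and ground truth fields
eval_report(
    "my_dataset",
    predictions="predictions",
    gt_field="ground_truth",
    cmat=True,
    eval_kwargs={"iou": 0.5},
)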

label_diff(dataset, label_field, tags_left, tags_right)

Compute difference between two sets of labels.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field with labels
tags_left (types.LIST_STR_STR, required): list of tags for base list of labels
tags_right (types.LIST_STR_STR, required): list of tags for intersection comparison

Source code in finegrained/data/display.py
def label_diff(
    dataset: str,
    label_field: str,
    tags_left: types.LIST_STR_STR,
    tags_right: types.LIST_STR_STR,
):
    """Compute difference between two sets of labels.

    Args:
        dataset: fiftyone dataset name
        label_field: field with labels
        tags_left: list of tags for base list of labels
        tags_right: list of tags for intersection comparison
    """
    # TODO test this
    dataset = load_fiftyone_dataset(dataset)
    assert dataset.has_sample_field(label_field)
    assert len(tags_left) > 0
    assert len(tags_right) > 0

    left_labels = get_unique_labels(dataset.match_tags(tags_left), label_field)
    right_labels = get_unique_labels(dataset.match_tags(tags_right), label_field)

    diff = find_diff(left_labels, right_labels)
    return diff

print_labels(dataset, label_field, **kwargs)

Print all classes in the dataset.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
**kwargs: dataset loading filters

Source code in finegrained/data/display.py
def print_labels(dataset: str, label_field: str, **kwargs) -> None:
    """Print all classes in the dataset.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, label_field)
    print("\n".join(labels))

finegrained.data.export

Dataset conversion and export utilities.

to_csv(dataset, label_field, export_path, extra_fields=None, **kwargs)

Export a dataset into CSV format for uploading to external sources.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels (will be mapped to 'label')
export_path (str, required): where to write csv file
extra_fields (Optional[list[str]], default None): extra fields to be added to csv
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_csv(
    dataset: str,
    label_field: str,
    export_path: str,
    extra_fields: Optional[list[str]] = None,
    **kwargs,
):
    """Export a dataset into CSV format for uploading to external sources.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels (will be mapped to 'label')
        export_path: where to write csv file
        extra_fields: extra fields to be added to csv
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    label_field = f"{label_field}.label"
    fields = {"filepath": "image", label_field: "label"}
    if extra_fields:
        fields.update({k: k for k in extra_fields})
    dataset.export(
        dataset_type=fot.CSVDataset,
        abs_paths=True,
        export_media=False,
        labels_path=export_path,
        fields=fields,
    )

to_cvat(dataset, label_field, export_dir, **kwargs)

Export a dataset into CVAT format for annotation.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
export_dir (str, required): where to write data
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_cvat(dataset: str, label_field: str, export_dir: str, **kwargs):
    """Export a dataset into CVAT format for annotation.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        export_dir: where to write data
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.export(
        export_dir=export_dir,
        dataset_type=fot.CVATImageDataset,
        label_field=label_field,
    )

to_yolov5(dataset, label_field, export_dir, splits, **kwargs)

Export a dataset into yolov5 format for training

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
export_dir (str, required): where to write data
splits (List[str], required): which splits to export
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_yolov5(
    dataset: str,
    label_field: str,
    export_dir: str,
    splits: List[str],
    **kwargs,
):
    """Export a dataset into yolov5 format for training

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        export_dir: where to write data
        splits: which splits to export
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, label_field)
    for tag in splits:
        subset = dataset.match_tags(tag)
        assert len(subset) > 0, f"No samples in the subset with {tag=}"
        subset.export(
            export_dir=export_dir,
            split=tag,
            dataset_type=fot.YOLOv5Dataset,
            label_field=label_field,
            classes=labels,
        )
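
Assuming the dataset has already been tagged with split names, an export call might look like this sketch (dataset name, field, and path are hypothetical):

from finegrained.data.export import to_yolov5

# export 'train' and 'val' tagged samples into a YOLOv5 folder structure
to_yolov5(
    dataset="my_dataset",
    label_field="ground_truth",
    export_dir="/data/my_dataset_yolo",
    splits=["train", "val"],
)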

finegrained.data.tag

Tag or untag samples based on specific filters or conditions.

retag_missing_labels(dataset, label_field, from_tags, to_tags)

Remove from_tags and add to_tags for labels that are present in from_tags but absent in to_tags.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a label field
from_tags (types.LIST_STR_STR, required): tags with base list of class labels
to_tags (types.LIST_STR_STR, required): tags with intersection of class labels

Returns:

dict: a count of sample tags for a subset

Source code in finegrained/data/tag.py
def retag_missing_labels(
    dataset: str,
    label_field: str,
    from_tags: types.LIST_STR_STR,
    to_tags: types.LIST_STR_STR,
) -> dict:
    """Remove from_tags and add to_tags for labels that are present in
        from_tags but absent in to_tags.

    Args:
        dataset: fiftyone dataset name
        label_field: a label field
        from_tags: tags with base list of class labels
        to_tags: tags with intersection of class labels

    Returns:
        a count of sample tags for a subset
    """
    # TODO test this
    diff = label_diff(dataset, label_field, tags_left=from_tags, tags_right=to_tags)
    assert len(diff) > 0, "No samples to retag"

    dataset = load_fiftyone_dataset(dataset, include_labels={label_field: diff})
    dataset.untag_samples(from_tags)
    dataset.tag_samples(to_tags)

    return dataset.count_sample_tags()

split_classes(dataset, label_field, train_size=0.5, val_size=0.5, min_samples=3, split_names=('train', 'val'), overwrite=False)

Split classes in a dataset into train and val.

Used for meta-learning.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): which field to use for classes
train_size (float, default 0.5): fraction of classes to tag as train
val_size (float, default 0.5): fraction of classes to tag as val
min_samples (int, default 3): minimum number of samples per class to include a class into a split
split_names (tuple[str, str], default ('train', 'val')): splits will be tagged with these names
overwrite (bool, default False): if True, existing tags are removed

Returns:

types.DICT_STR_FLOAT: a dict of tag counts

Source code in finegrained/data/tag.py
def split_classes(
    dataset: str,
    label_field: str,
    train_size: float = 0.5,
    val_size: float = 0.5,
    min_samples: int = 3,
    split_names: tuple[str, str] = ("train", "val"),
    overwrite: bool = False,
) -> types.DICT_STR_FLOAT:
    """Split classes in a dataset into train and val.

    Used for meta-learning.

    Args:
        dataset: fiftyone dataset name
        label_field: which field to use for classes
        train_size: fraction of classes to tag as train
        val_size: fraction of classes to tag as val
        min_samples: minimum number of samples
            per class to include a class into a split
        split_names: splits will be tagged with these names
        overwrite: if True, existing tags are removed

    Returns:
        a dict of tag counts
    """
    dataset = load_fiftyone_dataset(dataset)
    label_counts = dataset.count_values(f"{label_field}.label")
    labels = list(filter(lambda x: label_counts[x] >= min_samples, label_counts))
    train_labels, val_labels = train_test_split(
        labels, test_size=val_size, train_size=train_size, shuffle=True
    )
    if overwrite:
        dataset.untag_samples(split_names)
    train_view = dataset.filter_labels(label_field, F("label").is_in(train_labels))
    train_view.tag_samples(split_names[0])
    val_view = dataset.filter_labels(label_field, F("label").is_in(val_labels))
    val_view.tag_samples(split_names[1])
    return dataset.count_sample_tags()

split_dataset(dataset, splits={'train': 0.8, 'val': 0.1, 'test': 0.1}, **kwargs)

Create data split tags for a dataset

Parameters:

dataset (str, required): fiftyone dataset
splits (types.DICT_STR_FLOAT, default {'train': 0.8, 'val': 0.1, 'test': 0.1}): a dict of split names and relative sizes
kwargs: dataset loading filters

Returns:

a dict of split counts

Source code in finegrained/data/tag.py
def split_dataset(
    dataset: str,
    splits: types.DICT_STR_FLOAT = {"train": 0.8, "val": 0.1, "test": 0.1},
    **kwargs,
):
    """Create data split tags for a dataset

    Args:
        dataset: fiftyone dataset
        splits: a dict of split names and relative sizes
        kwargs: dataset loading filters

    Returns:
        a dict of split counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    four.random_split(dataset, splits)
    return dataset.count_sample_tags()
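
A minimal sketch, assuming a dataset named my_dataset:

from finegrained.data.tag import split_dataset

# tag samples with train/val/test split tags and print the resulting counts
counts = split_dataset("my_dataset", splits={"train": 0.8, "val": 0.1, "test": 0.1})
print(counts)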

tag_alignment(dataset, vertical=True, tag=None, **kwargs)

Add a vertical/horizontal tag to each sample.

Parameters:

dataset (str, required): fiftyone dataset name
vertical (bool, default True): if True, vertical images are tagged. If False, horizontal images are tagged.
tag (Optional[str], default None): overwrite default 'vertical' or 'horizontal' tag.
**kwargs: dataset filter kwargs

Returns:

dict: a dict with sample tag counts

Source code in finegrained/data/tag.py
def tag_alignment(
    dataset: str, vertical: bool = True, tag: Optional[str] = None, **kwargs
) -> dict:
    """Add a vertical/horizontal tag each sample.

    Args:
        dataset: fiftyone dataset name
        vertical: if True, vertical images are tagged.
            If False, horizontal images are tagged.
        tag: overwrite default 'vertical' or 'horizontal' tag.
        **kwargs: dataset filter kwargs

    Returns:
        a dict with sample tag counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.compute_metadata()
    if vertical:
        tag = "vertical" if tag is None else tag
        tag_view = dataset.match(F("metadata.height") > F("metadata.width"))
    else:
        tag = "horizontal" if tag is None else tag
        tag_view = dataset.match(F("metadata.width") >= F("metadata.height"))
    tag_view.tag_samples(tag)
    return tag_view.count_sample_tags()

tag_labels(dataset, label_field, labels, tags)

Tag labels with given tags.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a label field
labels (types.LIST_STR_STR, required): labels to filter, can be a txt file with labels
tags (types.LIST_STR_STR, required): tags to apply

Returns:

dict: a count of label tags for a subset

Source code in finegrained/data/tag.py
def tag_labels(
    dataset: str,
    label_field: str,
    labels: types.LIST_STR_STR,
    tags: types.LIST_STR_STR,
) -> dict:
    """Tag labels with given tags.

    Args:
        dataset: fiftyone dataset name
        label_field: a label field
        labels: labels to filter, can be a txt file with labels
        tags: tags to apply

    Returns:
        a count of label tags for a subset
    """
    if (lab := Path(labels)).is_file():
        labels = lab.read_text().strip().split("\n")
    dataset = load_fiftyone_dataset(dataset, include_labels={label_field: labels})
    dataset.tag_labels(tags, label_fields=label_field)
    return dataset.count_label_tags(label_fields=label_field)

tag_samples(dataset, tags, **kwargs)

Tag each sample in dataset with given tags

Parameters:

dataset (str, required): fiftyone dataset name
tags (types.LIST_STR_STR, required): tags to apply
kwargs: dataset loading kwargs, i.e. filters

Returns:

dict: a dict of sample tag counts

Source code in finegrained/data/tag.py
def tag_samples(dataset: str, tags: types.LIST_STR_STR, **kwargs) -> dict:
    """Tag each sample in dataset with given tags

    Args:
        dataset: fiftyone dataset name
        tags: tags to apply
        kwargs: dataset loading kwargs, i.e. filters

    Returns:
        a dict of sample tag counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.tag_samples(parse_list_str(tags))
    return dataset.count_sample_tags()
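
As a sketch, tagging whole samples and then tagging specific labels could be combined as follows (dataset, field, class, and tag names are hypothetical):

from finegrained.data.tag import tag_labels, tag_samples

# tag every loaded sample, then tag only the 'cat' and 'dog' labels
tag_samples("my_dataset", tags=["reviewed"])
tag_labels("my_dataset", label_field="ground_truth", labels=["cat", "dog"], tags=["keep"])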

finegrained.data.transforms

Data transforms on top of fiftyone datasets.

combine_datasets(dest_name, label_field, cfg, persistent=True, overwrite=False)

Create a new dataset by adding samples from multiple datasets.

List of datasets and filters are specified in a yaml config file. Source label fields will be renamed to a destination label field.

Parameters:

dest_name (str, required): a new dataset name
label_field (str, required): a new label field
cfg (str, required): path to yaml config
persistent (bool, default True): whether to persist destination dataset (False for testing)
overwrite (bool, default False): if dataset exists, overwrite it

Returns:

a dataset instance

Source code in finegrained/data/transforms.py
def combine_datasets(
    dest_name: str,
    label_field: str,
    cfg: str,
    persistent: bool = True,
    overwrite: bool = False,
):
    """Create a new dataset by adding samples from multiple datasets.

    List of datasets and filters are specified in a yaml config file.
    Source label fields will be renamed to a destination label field.

    Args:
        dest_name: a new dataset name
        label_field: a new label field
        cfg: path to yaml config
        persistent: whether to persist destination dataset (False for testing)
        overwrite: if dataset exists, overwrite it

    Returns:
        a dataset instance
    """
    cfg = read_yaml(cfg)
    assert "datasets" in cfg and isinstance(dataset_cfg := cfg["datasets"], list)
    assert len(dataset_cfg) > 0

    dataset = create_fiftyone_dataset(
        dest_name, src=None, persistent=persistent, overwrite=overwrite
    )
    for one in dataset_cfg:
        assert "name" in one and isinstance(one["name"], str)
        if "filters" in one:
            assert isinstance(one["filters"], dict)
        else:
            one["filters"] = {}
        assert "label_field" in one and isinstance(one["label_field"], str)

        temp_name = f"{dest_name}_{one['name']}"
        temp = load_fiftyone_dataset(one["name"], **one["filters"]).clone(
            name=temp_name, persistent=False
        )
        temp.clone_sample_field(one["label_field"], label_field)
        if "tags" in one:
            temp.tag_samples(one["tags"])

        dataset.add_samples(temp.select_fields([label_field, "tags"]))
        fo.delete_dataset(temp_name)

    return dataset
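
As the code above shows, the yaml config is expected to contain a datasets list where each entry defines a name and a label_field, plus optional filters (passed to the dataset loader) and tags. A hypothetical config and call might look like the sketch below; all dataset, field, and filter values are made up for illustration.

from pathlib import Path

from finegrained.data.transforms import combine_datasets

# hypothetical config: 'name' and 'label_field' are required per entry,
# 'filters' and 'tags' are optional
Path("combine.yaml").write_text(
    """
datasets:
  - name: dataset_a
    label_field: ground_truth
    tags: [train]
  - name: dataset_b
    label_field: detections
    filters:
      include_labels:
        detections: [cat, dog]
    tags: [val]
"""
)

combined = combine_datasets("combined_dataset", label_field="ground_truth", cfg="combine.yaml")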

delete_field(dataset, fields)

Delete one or more fields from a dataset

Parameters:

dataset (str, required): fiftyone dataset name
fields (types.LIST_STR_STR, required): fields to delete

Returns:

a fiftyone dataset

Source code in finegrained/data/transforms.py
def delete_field(dataset: str, fields: types.LIST_STR_STR):
    """Delete one or more fields from a dataset

    Args:
        dataset: fiftyone dataset name
        fields: fields to delete

    Returns:
        a fiftyone dataset
    """
    dataset = load_fiftyone_dataset(dataset)
    fields = parse_list_str(fields)
    for field in fields:
        dataset.delete_sample_field(field)
        print(f"{field=} deleted from {dataset.name=}")
    return dataset

delete_samples(dataset, **kwargs)

Delete samples and associated files from a dataset

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset filters to select samples for deletion (must be provided)

Returns:

None

Source code in finegrained/data/transforms.py
def delete_samples(dataset: str, **kwargs):
    """Delete samples and associated files from a dataset

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset filters to select samples for deletion
            (must be provided)

    Returns:
        None
    """
    assert bool(kwargs), "Danger: provide dataset filters to select a subset"
    subset = load_fiftyone_dataset(dataset, **kwargs)
    delete_ids = []
    for smp in subset.select_fields(["id", "filepath"]):
        Path(smp.filepath).unlink()
        delete_ids.append(smp.id)

    full_dataset = fo.load_dataset(dataset)
    full_dataset.delete_samples(delete_ids)
    print(f"{len(delete_ids)} files deleted and removed from {dataset=}")

exif_transpose(dataset, **kwargs)

Rotate images that have an EXIF orientation tag (applied via PIL).

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset loading filters

Returns:

None

Source code in finegrained/data/transforms.py
def exif_transpose(dataset: str, **kwargs):
    """Rotate images that have a PIL rotate tag

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset loading filters

    Returns:
        None
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    for smp in tqdm(dataset.select_fields("filepath"), desc="transposing"):
        try:
            orig = Image.open(smp.filepath)
            transposed = ImageOps.exif_transpose(orig)
            transposed.save(smp.filepath)
        except UnidentifiedImageError as e:
            print(e, "at", smp.filepath)

fix_filepath(src, from_dir, to_dir)

Replace the from_dir part of each sample's filepath with to_dir in the samples.json file.

The samples.json file is updated in place.

Parameters:

src (str, required): samples.json file export for fiftyone.types.FiftyOneDataset
from_dir (str, required): relative directory to replace
to_dir (str, required): new relative directory

Source code in finegrained/data/transforms.py
def fix_filepath(src: str, from_dir: str, to_dir: str) -> None:
    """Replace from_dir part to to_dir in each sample's filepath in samples.json file.

    Samples.json file is updated inplace.

    Args:
        src: sample.json file export for fiftyone.types.FiftyOneDataset
        from_dir: relative directory to replace
        to_dir: new relative directory
    """
    # TODO test this
    src = Path(src)
    assert src.exists() and src.suffix == ".json"
    samples = read_json(src)

    to_dir = Path(to_dir)

    def fix_path(path):
        return str(to_dir / Path(path).relative_to(from_dir))

    for smp in samples["samples"]:
        smp["filepath"] = fix_path(smp["filepath"])

    write_json(samples, src)

from_label_tag(dataset, label_field, label_tag, **kwargs)

Update a label_field label with its label_tag.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field that contains detections labels
label_tag (str, required): labels that contain this tag will be renamed to it
**kwargs: dataset loading filters

Returns:

dict: updated label values

Source code in finegrained/data/transforms.py
def from_label_tag(dataset: str, label_field: str, label_tag: str, **kwargs) -> dict:
    """Update a label_field label with its label_tag.

    Args:
        dataset: fiftyone dataset name
        label_field: a field that contains detections labels.
        label_tag: labels that contain this tag, will be renamed to it.
        **kwargs: dataset loading filters

    Returns:
        updated label values
    """
    kwargs = kwargs | {"label_tags": label_tag}
    dataset = load_fiftyone_dataset(dataset, **kwargs)

    for smp in tqdm(dataset.select_fields(label_field), desc="updating samples"):
        for det in smp[label_field].detections:
            if label_tag in det.tags:
                det.label = label_tag
                smp.save()

    return dataset.count_values(f"{label_field}.detections.label")

from_labels(dataset, label_field, from_field, **kwargs)

Re-assign classification label to detection labels.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field with detections to be updated
from_field (str, required): a field with classification to get labels from
**kwargs: dataset loading filters

Source code in finegrained/data/transforms.py
def from_labels(dataset: str, label_field: str, from_field: str, **kwargs):
    """Re-assign classification label to detection labels.

    Args:
        dataset: fiftyone dataset name
        label_field: a field with detections to be updated
        from_field: a field with classification to get labels from
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset = dataset.exists(label_field)

    assert dataset.has_sample_field(
        label_field
    ), f"Dataset does not contain {label_field=}."
    assert (
        doc_type := dataset.get_field(label_field).document_type
    ) == fo.Detections, f"{label_field=} has to be of type Detections, got {doc_type=}."
    assert dataset.has_sample_field(
        from_field
    ), f"Dataset does not contain {from_field=}."
    assert (
        doc_type := dataset.get_field(from_field).document_type
    ) == fo.Classification, (
        f"{from_field=} has to be of type Detections, got {doc_type=}."
    )

    for smp in tqdm(dataset.select_fields([label_field, from_field])):
        _update_labels(smp[label_field], smp[from_field].label)
        smp.save()

map_labels(dataset, from_field, to_field, label_mapping=None, overwrite=False, **kwargs)

Create a new dataset field with mapped labels.

Parameters:

dataset (str, required): fiftyone dataset name
from_field (str, required): source label field
to_field (str, required): a new label field
label_mapping (Optional[dict], default None): label mapping (use {}/None for creating a field copy)
overwrite (bool, default False): if to_field already exists, then overwrite it
**kwargs: dataset loading kwargs

Returns:

fo.DatasetView: dataset view

Source code in finegrained/data/transforms.py
def map_labels(
    dataset: str,
    from_field: str,
    to_field: str,
    label_mapping: Optional[dict] = None,
    overwrite: bool = False,
    **kwargs,
) -> fo.DatasetView:
    """Create a new dataset field with mapped labels.

    Args:
        dataset: fiftyone dataset name
        from_field: source label field
        to_field: a new label field
        label_mapping: label mapping (use {}/None for creating a field copy)
        overwrite: if to_field already exists, then overwrite it
        **kwargs: dataset loading kwargs

    Returns:
        dataset view
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)

    if overwrite and dataset.has_sample_field(to_field):
        delete_field(dataset.dataset_name, to_field)
    elif not overwrite and dataset.has_sample_field(to_field):
        raise ValueError(f"{to_field=} already exists")

    dataset.clone_sample_field(from_field, to_field)
    if bool(label_mapping):
        dataset = dataset.map_labels(to_field, label_mapping)
        dataset.save(to_field)
    return dataset
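
As a sketch (dataset, field, and label names are hypothetical), mapping fine-grained labels onto coarser classes:

from finegrained.data.transforms import map_labels

# clone ground_truth into a new field and collapse two labels into 'car'
map_labels(
    "my_dataset",
    from_field="ground_truth",
    to_field="ground_truth_coarse",
    label_mapping={"sedan": "car", "suv": "car"},
)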

merge_diff(dataset, image_dir, tags=None, recursive=True)

Merge new files into an existing dataset.

Existing files will be skipped, and no labels are expected for new files. Merging is based on the absolute filepath.

Parameters:

dataset (str, required): existing fiftyone dataset
image_dir (str, required): a folder with new files
tags (types.LIST_STR_STR, default None): tag new samples
recursive (bool, default True): search for files in subfolders as well

Returns:

an updated fiftyone dataset

Source code in finegrained/data/transforms.py
def merge_diff(
    dataset: str,
    image_dir: str,
    tags: types.LIST_STR_STR = None,
    recursive: bool = True,
):
    """Merge new files into an existing dataset.

    Existing files will be skipped.
    No labels for new files are expected.
    Merger happens based on an absolute filepath.

    Args:
        dataset: existing fiftyone dataset
        image_dir: a folder with new files
        tags: tag new samples
        recursive: search for files in subfolders as well

    Returns:
        an updated fiftyone dataset
    """
    dataset = load_fiftyone_dataset(dataset)
    second = fo.Dataset.from_images_dir(image_dir, tags=tags, recursive=recursive)
    dataset.merge_samples(second, skip_existing=True)
    return dataset

prefix_label(dataset, label_field, dest_field, prefix)

Prepend each label with given prefix

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field with class labels
dest_field (str, required): a new field to create with '<prefix>_<label>' values
prefix (str, required): a prefix value

Returns:

fiftyone dataset object

Source code in finegrained/data/transforms.py
def prefix_label(dataset: str, label_field: str, dest_field: str, prefix: str):
    """Prepend each label with given prefix

    Args:
        dataset: fiftyone dataset name
        label_field: a field with class labels
        dest_field: a new field to create with '<prefix>_<label>' values
        prefix: a prefix value

    Returns:
        fiftyone dataset object
    """
    dataset = load_fiftyone_dataset(dataset)
    values = [
        fo.Classification(label=f"{prefix}_{smp[label_field].label}")
        for smp in dataset.select_fields(label_field)
    ]
    dataset.set_values(dest_field, values)
    return dataset

to_patches(dataset, label_field, to_name, export_dir, overwrite=False, splits=None, **kwargs)

Crop out patches from a dataset and create a new one.

Parameters:

dataset (str, required): a fiftyone dataset with detections
label_field (str | list[str], required): label field(s) with detection, classification or polylines
to_name (str, required): a new dataset name for patches
export_dir (str, required): where to save crops
overwrite (bool, default False): if True and that name already exists, delete it
splits (Optional[list[str]], default None): if provided, these tags will be used to split patches into subsets
**kwargs: dataset filters

Returns:

fo.Dataset: fiftyone dataset object

Source code in finegrained/data/transforms.py
def to_patches(
    dataset: str,
    label_field: str | list[str],
    to_name: str,
    export_dir: str,
    overwrite: bool = False,
    splits: Optional[list[str]] = None,
    **kwargs,
) -> fo.Dataset:
    """Crop out patches from a dataset and create a new one.

    Args:
        dataset: a fiftyone dataset with detections
        label_field: label field(s) with detection, classification or polylines
        to_name: a new dataset name for patches
        export_dir: where to save crops
        overwrite: if True and that name already exists, delete it
        splits: if provided, these tags will be used to split patches into subsets
        **kwargs: dataset filters

    Returns:
        fiftyone dataset object
    """
    export_dir = Path(export_dir)

    # prompt overwriting if dataset or folder exist
    if not overwrite:
        if fo.dataset_exists(to_name):
            raise ValueError(
                f"{to_name=} dataset already exists. Use --overwrite or delete it."
            )
        if export_dir.exists():
            raise ValueError(
                f"{str(export_dir)=} already exists. "
                "User --overwrite or delete it manually"
            )
    else:
        if export_dir.exists():
            shutil.rmtree(export_dir)

    dataset = load_fiftyone_dataset(dataset, **kwargs)
    label_field = parse_list_str(label_field)

    # make sure splits are present if given
    if splits:
        splits = parse_list_str(splits)
        tag_counts = dataset.count_sample_tags()
        assert all(
            [s in tag_counts for s in splits]
        ), f"{dataset.name=} does not contain all {splits=}"

    # export each label field
    for field in label_field:
        assert dataset.has_sample_field(
            field
        ), f"{dataset.name=} does not contain {field=}"
        _export_patches(dataset, field, export_dir, splits)

    # import all together, tag if needed
    new = create_fiftyone_dataset(
        name=to_name,
        src=export_dir if splits is None else None,
        dataset_type=ImageClassificationDirectoryTree,
        overwrite=overwrite,
    )
    if splits:
        for tag in splits:
            new.add_dir(
                dataset_dir=str(export_dir / tag),
                dataset_type=ImageClassificationDirectoryTree,
                tags=tag,
            )
    return new
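
A possible invocation, assuming the dataset already carries train/val tags (all names and paths are hypothetical):

from finegrained.data.transforms import to_patches

# crop labeled regions into an image-classification dataset of patches
patches = to_patches(
    dataset="my_dataset",
    label_field="ground_truth",
    to_name="my_dataset_patches",
    export_dir="/data/my_dataset_patches",
    splits=["train", "val"],
)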

transpose_images(dataset, **kwargs)

Rotate images 90 degrees.

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset loading filters

Returns:

fo.DatasetView: a dataset view instance

Source code in finegrained/data/transforms.py
def transpose_images(dataset: str, **kwargs) -> fo.DatasetView:
    """Rotate images 90 degrees.

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset loading filters

    Returns:
        a dataset view instance
    """
    assert len(kwargs) > 0, "Danger: provide dataset filters"

    dataset = load_fiftyone_dataset(dataset, **kwargs)

    for smp in tqdm(dataset.select_fields("filepath"), desc="transposing"):
        Image.open(smp.filepath).transpose(Image.ROTATE_90).save(smp.filepath)

    return dataset

finegrained.data.zoo

Base constructs to use torchvision models.

object_detection(dataset, label_field, conf=0.25, image_size=None, device=None, **kwargs)

Detect COCO objects with mask-rcnn-v2 from torchvision

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): which field to write predictions to
conf (float, default 0.25): box confidence threshold
image_size (default None): if specified, this will be a max image size (to save memory)
**kwargs: dataset loading filters

Returns:

None

Source code in finegrained/data/zoo.py
def object_detection(
    dataset: str,
    label_field: str,
    conf: float = 0.25,
    image_size=None,
    device=None,
    **kwargs
):
    """Detect COCO objects with mask-rcnn-v2 from torchvision

    Args:
        dataset: fiftyone dataset name
        label_field: which field to write predictions to
        conf: box confidence threshold
        image_size: if specified, this will be a max image size
            (to save memory)
        **kwargs: dataset loading filters

    Returns:
        None
    """
    # prepare the model
    weights = MaskRCNN_ResNet50_FPN_V2_Weights.DEFAULT
    model = maskrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=conf)
    device = get_device()[0] if device is None else torch.device(device)
    model.eval().to(device)
    preprocess = weights.transforms()

    dataset = load_fiftyone_dataset(dataset, **kwargs)

    with torch.no_grad():
        for smp in tqdm(dataset.select_fields("filepath"), desc="detecting"):
            # prepare image input
            img = read_image(smp.filepath)
            if image_size:
                img = _resize_image(img, target_size=image_size)
            # predict
            batch = [preprocess(img).to(device)]
            prediction, *_ = model(batch)
            # parse detection and save results
            img_h, img_w = img.size(1), img.size(2)
            detections = _parse_torchvision_detections(
                prediction, weights.meta["categories"], (img_h, img_w)
            )
            smp[label_field] = detections
            smp.save()
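
A minimal sketch (dataset and field names are hypothetical); image_size sets a maximum image size to save memory, as described above:

from finegrained.data.zoo import object_detection

# run torchvision Mask R-CNN v2 and write detections to the 'predictions' field
object_detection(
    "my_dataset",
    label_field="predictions",
    conf=0.4,
    image_size=1024,
)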