Data module

The data module prepares data for training. It relies heavily on the FiftyOne package and its integrations.

Submodules

  1. annotations - send data for annotation in CVAT and fetch the results.
  2. brain - commands for the fiftyone.brain module.
  3. display - print dataset-related stats to the console.
  4. export - export datasets to formats that are missing from the original fiftyone CLI.
  5. tag - tag dataset samples that meet certain criteria.
  6. transforms - perform changes on datasets.
  7. zoo - perform operations with the fiftyone.zoo module.

finegrained.data.annotations

Send samples for annotation, query annotation runs, and fetch the results.

annotate(dataset, annotation_key, label_field, backend, overwrite=False, label_type=None, project_id=None, segment_size=10, task_name=None, image_quality=75, task_asignee=None, organization=None, classes=None, **kwargs)

Send samples for annotation.

Parameters:

dataset (str, required): fiftyone dataset with samples
annotation_key (str, required): assign this key for annotation run
label_field (str, required): if exists, upload labels
label_type (Optional[str], default None): if label_field does not exist, this has to be specified
backend (Any, required): backend name or filepath to configs
overwrite (bool, default False): overwrite existing annotation run if True
classes (Optional[str], default None): list of classes or path to labels.txt file
image_quality (int, default 75): image upload quality
task_name (Optional[str], default None): custom task name, by default dataset name + annotation key
segment_size (int, default 10): number of frames/images per one job
project_id (Optional[int], default None): which cvat project to connect to
task_asignee (Optional[str], default None): assignee for the task
**kwargs: dataset loading filters

Source code in finegrained/data/annotations.py
def annotate(
    dataset: str,
    annotation_key: str,
    label_field: str,
    backend: Any,
    overwrite: bool = False,
    label_type: Optional[str] = None,
    project_id: Optional[int] = None,
    segment_size: int = 10,
    task_name: Optional[str] = None,
    image_quality: int = 75,
    task_asignee: Optional[str] = None,
    organization: Optional[str] = None,
    classes: Optional[str] = None,
    **kwargs,
):
    """Send samples to annotations

    Args:
        dataset: fiftyone dataset with samples
        annotation_key: assign this key for annotation run
        label_field: if exists, upload labels
        label_type: if label_field does not exist, this has to be specified
        backend: backend name or filepath to configs
        overwrite: overwrite existing annotation run if True
        classes: list of classes or path to labels.txt file
        image_quality: image upload quality
        task_name: custom task name, by default dataset name + annotation key
        segment_size: number of frames/images per one job
        project_id: which cvat project to connect to
        task_asignee: assignee for the task
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    backend_conf = _load_backend_config(backend)
    if dataset.has_annotation_run(annotation_key) and overwrite:
        dataset.delete_annotation_run(annotation_key)
    if not dataset.has_sample_field(label_field) and label_type is None:
        raise ValueError(
            f"{label_field=} does not exist in {dataset.name}. Specify 'label_type'"
        )
    if classes:
        classes = _parse_classes(classes)
    if task_name is None:
        task_name = dataset.name + " - " + annotation_key
    dataset.annotate(
        annotation_key,
        label_field=label_field,
        label_type=label_type,
        project_id=project_id,
        segment_size=segment_size,
        task_name=task_name,
        image_quality=image_quality,
        classes=classes,
        task_asignee=task_asignee,
        organization=organization,
        **backend_conf,
    )
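
For reference, a minimal call might look like the sketch below; the dataset name, label field, annotation key, and backend config path are all hypothetical and assume a CVAT backend described in a local yaml file.

from finegrained.data.annotations import annotate

# hypothetical names and paths; adjust to your own dataset and backend config
annotate(
    dataset="my_dataset",
    annotation_key="review_round_1",
    label_field="ground_truth",
    backend="cvat_backend.yaml",
    segment_size=25,
)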

delete_key(dataset, key)

Delete an annotation key.

Parameters:

dataset (str, required): fiftyone dataset name
key (str, required): annotation key

Returns:

none

Source code in finegrained/data/annotations.py
def delete_key(dataset: str, key: str):
    """Delete an annotation key.

    Args:
        dataset: fiftyone dataset name
        key: annotation key

    Returns:
        none
    """
    dataset = load_fiftyone_dataset(dataset)
    dataset.delete_annotation_run(key)

list_keys(dataset)

List annotation keys attributed to the dataset

Parameters:

dataset (str, required): fiftyone dataset name

Returns:

types.LIST_STR: a list of keys

Source code in finegrained/data/annotations.py
def list_keys(dataset: str) -> types.LIST_STR:
    """List annotation keys attributed to the dataset

    Args:
        dataset: fiftyone dataset name

    Returns:
        a list of keys
    """
    dataset = load_fiftyone_dataset(dataset)
    keys = dataset.list_annotation_runs()
    return keys

load(dataset, annotation_key, backend, dest_field=None, dataset_kwargs=None)

Download annotations from an annotation backend.

Parameters:

dataset (str, required): fiftyone dataset name
annotation_key (str, required): annotation key used to send for annotations
backend (Any, required): annotation backend name or filepath with configs
dest_field (str, default None): if given, annotations will be stored in a new field
dataset_kwargs (Optional[dict], default None): dataset loading filters

Returns:

none

Source code in finegrained/data/annotations.py
def load(
    dataset: str,
    annotation_key: str,
    backend: Any,
    dest_field: str = None,
    dataset_kwargs: Optional[dict] = None,
):
    """Download annotations from an annotation backend.

    Args:
        dataset: fiftyone dataset name
        annotation_key: annotation key used to send for annotations
        backend: annotation backend name or filepath with configs
        dest_field: if given, annotations will be stored in a new field
        dataset_kwargs: dataset loading filters

    Returns:
        none
    """
    backend_conf = _load_backend_config(backend)
    backend_conf.pop("backend")
    dataset = load_fiftyone_dataset(
        dataset, **dataset_kwargs if bool(dataset_kwargs) else {}
    )
    dataset.load_annotations(annotation_key, dest_field=dest_field, **backend_conf)
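
Once annotation is complete, the results can be pulled back into the dataset. A minimal sketch, reusing the hypothetical names from the annotate example above:

from finegrained.data.annotations import load

# fetch completed annotations into a new field (names are hypothetical)
load(
    dataset="my_dataset",
    annotation_key="review_round_1",
    backend="cvat_backend.yaml",
    dest_field="ground_truth_v2",
)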

finegrained.data.brain

Run fiftyone.brain operations on a dataset.

compute_hardness(dataset, predictions, **kwargs)

Estimate how difficult each sample is to predict.

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): field with predictions
**kwargs: dataset filters

Returns:

None

Source code in finegrained/data/brain.py
def compute_hardness(dataset: str, predictions: str, **kwargs):
    """Estimate how difficult is this sample to predict.

    Args:
        dataset: fiftyone dataset name
        predictions: field with predictions
        **kwargs: dataset filters

    Returns:
        None
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    fob.compute_hardness(dataset, predictions)

compute_mistakenness(dataset, predictions, gt_field='ground_truth', **kwargs)

Estimate a probability that a ground truth label is wrong

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): a field that contains model predictions
gt_field (str, default 'ground_truth'): a field that contains ground truth data
**kwargs: dataset loading filters

Returns:

none

Source code in finegrained/data/brain.py
def compute_mistakenness(
    dataset: str, predictions: str, gt_field: str = "ground_truth", **kwargs
):
    """Estimate a probability that a ground truth label is wrong

    Args:
        dataset: fiftyone dataset name
        predictions: a field that contains model predictions
        gt_field: a field that contains ground truth data
        **kwargs: dataset loading filters

    Returns:
        none
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    fob.compute_mistakenness(dataset, predictions, label_field=gt_field)
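
Both brain helpers expect a dataset that already contains model predictions. A minimal sketch with hypothetical dataset and field names:

from finegrained.data.brain import compute_hardness, compute_mistakenness

# hypothetical dataset and field names
compute_hardness("my_dataset", predictions="predictions")
compute_mistakenness("my_dataset", predictions="predictions", gt_field="ground_truth")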

finegrained.data.display

Display various data about datasets.

compute_area(dataset, field='area', average_size=False, overwrite_metadata=False, overwrite=False, **kwargs)

Calculate area of an image based on metadata

Parameters:

dataset (str, required): fiftyone dataset name
field (str, default 'area'): field where to assign area values
average_size (bool, default False): if True, calculate (width + height)/2 instead
overwrite_metadata (bool, default False): whether to overwrite metadata
overwrite (bool, default False): delete field if already exists
**kwargs: dataset loading filters

Returns:

tuple[int, int]: area bounds

Source code in finegrained/data/display.py
def compute_area(
    dataset: str,
    field: str = "area",
    average_size: bool = False,
    overwrite_metadata: bool = False,
    overwrite: bool = False,
    **kwargs,
) -> tuple[int, int]:
    """Calculate area of an image based on metadata

    Args:
        dataset: fiftyone dataset name
        field: field where to assign area values
        average_size: if True, calculate (width + height)/2 instead
        overwrite_metadata: whether to overwrite metadata
        overwrite: delete field if already exists
        **kwargs: dataset loading filters

    Returns:
        area bounds
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    if dataset.has_sample_field(field):
        if overwrite:
            delete_field(dataset.name, field)
        else:
            raise ValueError(f"{field=} already exists.")

    dataset.compute_metadata(overwrite=overwrite_metadata)
    for smp in dataset.select_fields("metadata"):
        val = (
            (smp.metadata.width + smp.metadata.height) / 2
            if average_size
            else smp.metadata.width * smp.metadata.height
        )
        smp[field] = val
        smp.save()

    return dataset.bounds(field)

eval_report(dataset, predictions, gt_field='ground_truth', cmat=False, eval_kwargs={}, **kwargs)

Print evaluation report: compare prediction field against ground_truth field.

Parameters:

dataset (str, required): fiftyone dataset name
predictions (str, required): a field with predictions
gt_field (str, default 'ground_truth'): a field with ground truth labels
cmat (bool, default False): if True, plot a confusion matrix
eval_kwargs (dict, default {}): if passed, these params will be passed to the evaluation function
**kwargs: dataset loading filters

Source code in finegrained/data/display.py
def eval_report(
    dataset: str,
    predictions: str,
    gt_field: str = "ground_truth",
    cmat: bool = False,
    eval_kwargs: dict = {},
    **kwargs,
):
    """Print evaluation report: compare prediction field against ground_truth field.

    Args:
        dataset: fiftyone dataset name
        predictions: a field with predictions
        gt_field: a field with ground truth labels
        cmat: if True, plot a confusion matrix
        eval_kwargs: if passed, these params will be passed to the evaluation function
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, gt_field)
    label_type = dataset.get_field(gt_field).document_type

    eval_fn = None
    match label_type:
        case fo.Classification:
            eval_fn = dataset.evaluate_classifications
        case fo.Detections | fo.Keypoints | fo.Polylines:
            eval_fn = dataset.evaluate_detections
        case fo.Segmentation:
            eval_fn = dataset.evaluate_segmentations
    if not eval_fn:
        raise NotImplementedError(f"evaluation for {label_type=} not implemented.")

    results = eval_fn(predictions, gt_field=gt_field, classes=labels, **eval_kwargs)
    results.print_report()
    if hasattr(results, "mAP") and (mAP := results.mAP()):
        iou = eval_kwargs.get("iou", 0.5)
        print(f"> {mAP=:.4f} @ {iou=}")
    if cmat:
        cm = results.plot_confusion_matrix(backend="matplotlib")
        cm.show()
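
For example, comparing a predictions field against ground truth and plotting a confusion matrix could look like the sketch below (dataset and field names are hypothetical); the iou value is forwarded to the evaluation function via eval_kwargs.

from finegrained.data.display import eval_report

# hypothetical detection dataset with both prediction and ground truth fields
eval_report(
    "my_dataset",
    predictions="predictions",
    gt_field="ground_truth",
    cmat=True,
    eval_kwargs={"iou": 0.5},
)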

label_diff(dataset, label_field, tags_left, tags_right)

Compute difference between two sets of labels.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field with labels
tags_left (types.LIST_STR_STR, required): list of tags for base list of labels
tags_right (types.LIST_STR_STR, required): list of tags for intersection comparison

Source code in finegrained/data/display.py
def label_diff(
    dataset: str,
    label_field: str,
    tags_left: types.LIST_STR_STR,
    tags_right: types.LIST_STR_STR,
):
    """Compute difference between two sets of labels.

    Args:
        dataset: fiftyone dataset name
        label_field: field with labels
        tags_left: list of tags for base list of labels
        tags_right: list of tags for intersection comparison
    """
    # TODO test this
    dataset = load_fiftyone_dataset(dataset)
    assert dataset.has_sample_field(label_field)
    assert len(tags_left) > 0
    assert len(tags_right) > 0

    left_labels = get_unique_labels(dataset.match_tags(tags_left), label_field)
    right_labels = get_unique_labels(dataset.match_tags(tags_right), label_field)

    diff = find_diff(left_labels, right_labels)
    return diff

print_labels(dataset, label_field, **kwargs)

Print all classes in the dataset.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
**kwargs: dataset loading filters

Source code in finegrained/data/display.py
def print_labels(dataset: str, label_field: str, **kwargs) -> None:
    """Print all classes in the dataset.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, label_field)
    print("\n".join(labels))

finegrained.data.export

Dataset conversion and export utilities.

to_csv(dataset, label_field, export_path, extra_fields=None, **kwargs)

Export a dataset into CSV format for uploading to external sources.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels (will be mapped to 'label')
export_path (str, required): where to write csv file
extra_fields (Optional[list[str]], default None): extra fields to be added to csv
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_csv(
    dataset: str,
    label_field: str,
    export_path: str,
    extra_fields: Optional[list[str]] = None,
    **kwargs,
):
    """Export a dataset into CSV format for uploading to external sources.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels (will be mapped to 'label')
        export_path: where to write csv file
        extra_fields: extra fields to be added to csv
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    label_field = f"{label_field}.label"
    fields = {"filepath": "image", label_field: "label"}
    if extra_fields:
        fields.update({k: k for k in extra_fields})
    dataset.export(
        dataset_type=fot.CSVDataset,
        abs_paths=True,
        export_media=False,
        labels_path=export_path,
        fields=fields,
    )

to_cvat(dataset, label_field, export_dir, **kwargs)

Export a dataset into CVAT format for annotation.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
export_dir (str, required): where to write data
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_cvat(dataset: str, label_field: str, export_dir: str, **kwargs):
    """Export a dataset into CVAT format for annotation.

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        export_dir: where to write data
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.export(
        export_dir=export_dir,
        dataset_type=fot.CVATImageDataset,
        label_field=label_field,
    )

to_yolov5(dataset, label_field, export_dir, splits, **kwargs)

Export a dataset into yolov5 format for training

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): field that contains labels
export_dir (str, required): where to write data
splits (List[str], required): which splits to export
**kwargs: dataset loading filters

Source code in finegrained/data/export.py
def to_yolov5(
    dataset: str,
    label_field: str,
    export_dir: str,
    splits: List[str],
    **kwargs,
):
    """Export a dataset into yolov5 format for training

    Args:
        dataset: fiftyone dataset name
        label_field: field that contains labels
        export_dir: where to write data
        splits: which splits to export
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    labels = get_unique_labels(dataset, label_field)
    for tag in splits:
        subset = dataset.match_tags(tag)
        assert len(subset) > 0, f"No samples in the subset with {tag=}"
        subset.export(
            export_dir=export_dir,
            split=tag,
            dataset_type=fot.YOLOv5Dataset,
            label_field=label_field,
            classes=labels,
        )
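
Assuming the dataset has already been tagged with split names, an export call might look like this sketch (dataset name, field, and path are hypothetical):

from finegrained.data.export import to_yolov5

# export 'train' and 'val' tagged samples into a YOLOv5 folder structure
to_yolov5(
    dataset="my_dataset",
    label_field="ground_truth",
    export_dir="/data/my_dataset_yolo",
    splits=["train", "val"],
)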

finegrained.data.tag

Tag or untag samples based on specific filters or conditions.

retag_missing_labels(dataset, label_field, from_tags, to_tags)

Remove from_tags and add to_tags for labels that are present in from_tags but absent in to_tags.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a label field
from_tags (types.LIST_STR_STR, required): tags with base list of class labels
to_tags (types.LIST_STR_STR, required): tags with intersection of class labels

Returns:

dict: a count of sample tags for a subset

Source code in finegrained/data/tag.py
def retag_missing_labels(
    dataset: str,
    label_field: str,
    from_tags: types.LIST_STR_STR,
    to_tags: types.LIST_STR_STR,
) -> dict:
    """Remove from_tags and add to_tags for labels that are present in
        from_tags but absent in to_tags.

    Args:
        dataset: fiftyone dataset name
        label_field: a label field
        from_tags: tags with base list of class labels
        to_tags: tags with intersection of class labels

    Returns:
        a count of sample tags for a subset
    """
    # TODO test this
    diff = label_diff(dataset, label_field, tags_left=from_tags, tags_right=to_tags)
    assert len(diff) > 0, "No samples to retag"

    dataset = load_fiftyone_dataset(dataset, include_labels={label_field: diff})
    dataset.untag_samples(from_tags)
    dataset.tag_samples(to_tags)

    return dataset.count_sample_tags()

split_classes(dataset, label_field, train_size=0.5, val_size=0.5, min_samples=3, split_names=('train', 'val'), overwrite=False)

Split classes in a dataset into train and val.

Used for meta-learning.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): which field to use for classes
train_size (float, default 0.5): fraction of classes to tag as train
val_size (float, default 0.5): fraction of classes to tag as val
min_samples (int, default 3): minimum number of samples per class to include a class into a split
split_names (tuple[str, str], default ('train', 'val')): splits will be tagged with these names
overwrite (bool, default False): if True, existing tags are removed

Returns:

types.DICT_STR_FLOAT: a dict of tag counts

Source code in finegrained/data/tag.py
def split_classes(
    dataset: str,
    label_field: str,
    train_size: float = 0.5,
    val_size: float = 0.5,
    min_samples: int = 3,
    split_names: tuple[str, str] = ("train", "val"),
    overwrite: bool = False,
) -> types.DICT_STR_FLOAT:
    """Split classes in a dataset into train and val.

    Used for meta-learning.

    Args:
        dataset: fiftyone dataset name
        label_field: which field to use for classes
        train_size: fraction of classes to tag as train
        val_size: fraction of classes to tag as val
        min_samples: minimum number of samples
            per class to include a class into a split
        split_names: splits will be tagged with these names
        overwrite: if True, existing tags are removed

    Returns:
        a dict of tag counts
    """
    dataset = load_fiftyone_dataset(dataset)
    label_counts = dataset.count_values(f"{label_field}.label")
    labels = list(filter(lambda x: label_counts[x] >= min_samples, label_counts))
    train_labels, val_labels = train_test_split(
        labels, test_size=val_size, train_size=train_size, shuffle=True
    )
    if overwrite:
        dataset.untag_samples(split_names)
    train_view = dataset.filter_labels(label_field, F("label").is_in(train_labels))
    train_view.tag_samples(split_names[0])
    val_view = dataset.filter_labels(label_field, F("label").is_in(val_labels))
    val_view.tag_samples(split_names[1])
    return dataset.count_sample_tags()

split_dataset(dataset, splits={'train': 0.8, 'val': 0.1, 'test': 0.1}, **kwargs)

Create data split tags for a dataset

Parameters:

dataset (str, required): fiftyone dataset
splits (types.DICT_STR_FLOAT, default {'train': 0.8, 'val': 0.1, 'test': 0.1}): a dict of split names and relative sizes
kwargs: dataset loading filters

Returns:

a dict of split counts

Source code in finegrained/data/tag.py
def split_dataset(
    dataset: str,
    splits: types.DICT_STR_FLOAT = {"train": 0.8, "val": 0.1, "test": 0.1},
    **kwargs,
):
    """Create data split tags for a dataset

    Args:
        dataset: fiftyone dataset
        splits: a dict of split names and relative sizes
        kwargs: dataset loading filters

    Returns:
        a dict of split counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    four.random_split(dataset, splits)
    return dataset.count_sample_tags()
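
A minimal sketch, assuming a dataset named my_dataset:

from finegrained.data.tag import split_dataset

# tag samples with train/val/test split tags and print the resulting counts
counts = split_dataset("my_dataset", splits={"train": 0.8, "val": 0.1, "test": 0.1})
print(counts)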

tag_alignment(dataset, vertical=True, tag=None, **kwargs)

Add a vertical/horizontal tag to each sample.

Parameters:

dataset (str, required): fiftyone dataset name
vertical (bool, default True): if True, vertical images are tagged. If False, horizontal images are tagged.
tag (Optional[str], default None): overwrite default 'vertical' or 'horizontal' tag.
**kwargs: dataset filter kwargs

Returns:

dict: a dict with sample tag counts

Source code in finegrained/data/tag.py
def tag_alignment(
    dataset: str, vertical: bool = True, tag: Optional[str] = None, **kwargs
) -> dict:
    """Add a vertical/horizontal tag each sample.

    Args:
        dataset: fiftyone dataset name
        vertical: if True, vertical images are tagged.
            If False, horizontal images are tagged.
        tag: overwrite default 'vertical' or 'horizontal' tag.
        **kwargs: dataset filter kwargs

    Returns:
        a dict with sample tag counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.compute_metadata()
    if vertical:
        tag = "vertical" if tag is None else tag
        tag_view = dataset.match(F("metadata.height") > F("metadata.width"))
    else:
        tag = "horizontal" if tag is None else tag
        tag_view = dataset.match(F("metadata.width") >= F("metadata.height"))
    tag_view.tag_samples(tag)
    return tag_view.count_sample_tags()

tag_labels(dataset, label_field, labels, tags)

Tag labels with given tags.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a label field
labels (types.LIST_STR_STR, required): labels to filter, can be a txt file with labels
tags (types.LIST_STR_STR, required): tags to apply

Returns:

dict: a count of label tags for a subset

Source code in finegrained/data/tag.py
def tag_labels(
    dataset: str,
    label_field: str,
    labels: types.LIST_STR_STR,
    tags: types.LIST_STR_STR,
) -> dict:
    """Tag labels with given tags.

    Args:
        dataset: fiftyone dataset name
        label_field: a label field
        labels: labels to filter, can be a txt file with labels
        tags: tags to apply

    Returns:
        a count of label tags for a subset
    """
    if (lab := Path(labels)).is_file():
        labels = lab.read_text().strip().split("\n")
    dataset = load_fiftyone_dataset(dataset, include_labels={label_field: labels})
    dataset.tag_labels(tags, label_fields=label_field)
    return dataset.count_label_tags(label_fields=label_field)

tag_samples(dataset, tags, **kwargs)

Tag each sample in dataset with given tags

Parameters:

dataset (str, required): fiftyone dataset name
tags (types.LIST_STR_STR, required): tags to apply
kwargs: dataset loading kwargs, i.e. filters

Returns:

dict: a dict of sample tag counts

Source code in finegrained/data/tag.py
def tag_samples(dataset: str, tags: types.LIST_STR_STR, **kwargs) -> dict:
    """Tag each sample in dataset with given tags

    Args:
        dataset: fiftyone dataset name
        tags: tags to apply
        kwargs: dataset loading kwargs, i.e. filters

    Returns:
        a dict of sample tag counts
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset.tag_samples(parse_list_str(tags))
    return dataset.count_sample_tags()
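
As a sketch, tagging whole samples and then tagging specific labels could be combined as follows (dataset, field, class, and tag names are hypothetical):

from finegrained.data.tag import tag_labels, tag_samples

# tag every loaded sample, then tag only the 'cat' and 'dog' labels
tag_samples("my_dataset", tags=["reviewed"])
tag_labels("my_dataset", label_field="ground_truth", labels=["cat", "dog"], tags=["keep"])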

finegrained.data.transforms

Data transforms on top of fiftyone datasets.

combine_datasets(dest_name, label_field, cfg, persistent=True, overwrite=False)

Create a new dataset by adding samples from multiple datasets.

List of datasets and filters are specified in a yaml config file. Source label fields will be renamed to a destination label field.

Parameters:

dest_name (str, required): a new dataset name
label_field (str, required): a new label field
cfg (str, required): path to yaml config
persistent (bool, default True): whether to persist destination dataset (False for testing)
overwrite (bool, default False): if dataset exists, overwrite it

Returns:

a dataset instance

Source code in finegrained/data/transforms.py
def combine_datasets(
    dest_name: str,
    label_field: str,
    cfg: str,
    persistent: bool = True,
    overwrite: bool = False,
):
    """Create a new dataset by adding samples from multiple datasets.

    List of datasets and filters are specified in a yaml config file.
    Source label fields will be renamed to a destination label field.

    Args:
        dest_name: a new dataset name
        label_field: a new label field
        cfg: path to yaml config
        persistent: whether to persist destination dataset (False for testing)
        overwrite: if dataset exists, overwrite it

    Returns:
        a dataset instance
    """
    cfg = read_yaml(cfg)
    assert "datasets" in cfg and isinstance(dataset_cfg := cfg["datasets"], list)
    assert len(dataset_cfg) > 0

    dataset = create_fiftyone_dataset(
        dest_name, src=None, persistent=persistent, overwrite=overwrite
    )
    for one in dataset_cfg:
        assert "name" in one and isinstance(one["name"], str)
        if "filters" in one:
            assert isinstance(one["filters"], dict)
        else:
            one["filters"] = {}
        assert "label_field" in one and isinstance(one["label_field"], str)

        temp_name = f"{dest_name}_{one['name']}"
        temp = load_fiftyone_dataset(one["name"], **one["filters"]).clone(
            name=temp_name, persistent=False
        )
        temp.clone_sample_field(one["label_field"], label_field)
        if "tags" in one:
            temp.tag_samples(one["tags"])

        dataset.add_samples(temp.select_fields([label_field, "tags"]))
        fo.delete_dataset(temp_name)

    return dataset
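
As the code above shows, the yaml config is expected to contain a datasets list where each entry defines a name and a label_field, plus optional filters (passed to the dataset loader) and tags. A hypothetical config and call might look like the sketch below; all dataset, field, and filter values are made up for illustration.

from pathlib import Path

from finegrained.data.transforms import combine_datasets

# hypothetical config: 'name' and 'label_field' are required per entry,
# 'filters' and 'tags' are optional
Path("combine.yaml").write_text(
    """
datasets:
  - name: dataset_a
    label_field: ground_truth
    tags: [train]
  - name: dataset_b
    label_field: detections
    filters:
      include_labels:
        detections: [cat, dog]
    tags: [val]
"""
)

combined = combine_datasets("combined_dataset", label_field="ground_truth", cfg="combine.yaml")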

delete_field(dataset, fields)

Delete one or more fields from a dataset

Parameters:

dataset (str, required): fiftyone dataset name
fields (types.LIST_STR_STR, required): fields to delete

Returns:

a fiftyone dataset

Source code in finegrained/data/transforms.py
def delete_field(dataset: str, fields: types.LIST_STR_STR):
    """Delete one or more fields from a dataset

    Args:
        dataset: fiftyone dataset name
        fields: fields to delete

    Returns:
        a fiftyone dataset
    """
    dataset = load_fiftyone_dataset(dataset)
    fields = parse_list_str(fields)
    for field in fields:
        dataset.delete_sample_field(field)
        print(f"{field=} deleted from {dataset.name=}")
    return dataset

delete_samples(dataset, **kwargs)

Delete samples and associated files from a dataset

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset filters to select samples for deletion (must be provided)

Returns:

None

Source code in finegrained/data/transforms.py
def delete_samples(dataset: str, **kwargs):
    """Delete samples and associated files from a dataset

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset filters to select samples for deletion
            (must be provided)

    Returns:
        None
    """
    assert bool(kwargs), "Danger: provide dataset filters to select a subset"
    subset = load_fiftyone_dataset(dataset, **kwargs)
    delete_ids = []
    for smp in subset.select_fields(["id", "filepath"]):
        Path(smp.filepath).unlink()
        delete_ids.append(smp.id)

    full_dataset = fo.load_dataset(dataset)
    full_dataset.delete_samples(delete_ids)
    print(f"{len(delete_ids)} files deleted and removed from {dataset=}")

exif_transpose(dataset, **kwargs)

Rotate images that have an EXIF orientation tag (applied via PIL).

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset loading filters

Returns:

None

Source code in finegrained/data/transforms.py
def exif_transpose(dataset: str, **kwargs):
    """Rotate images that have a PIL rotate tag

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset loading filters

    Returns:
        None
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    for smp in tqdm(dataset.select_fields("filepath"), desc="transposing"):
        try:
            orig = Image.open(smp.filepath)
            transposed = ImageOps.exif_transpose(orig)
            transposed.save(smp.filepath)
        except UnidentifiedImageError as e:
            print(e, "at", smp.filepath)

fix_filepath(src, from_dir, to_dir)

Replace the from_dir part of each sample's filepath with to_dir in the samples.json file.

The samples.json file is updated in place.

Parameters:

src (str, required): samples.json file export for fiftyone.types.FiftyOneDataset
from_dir (str, required): relative directory to replace
to_dir (str, required): new relative directory

Source code in finegrained/data/transforms.py
def fix_filepath(src: str, from_dir: str, to_dir: str) -> None:
    """Replace from_dir part to to_dir in each sample's filepath in samples.json file.

    Samples.json file is updated inplace.

    Args:
        src: sample.json file export for fiftyone.types.FiftyOneDataset
        from_dir: relative directory to replace
        to_dir: new relative directory
    """
    # TODO test this
    src = Path(src)
    assert src.exists() and src.suffix == ".json"
    samples = read_json(src)

    to_dir = Path(to_dir)

    def fix_path(path):
        return str(to_dir / Path(path).relative_to(from_dir))

    for smp in samples["samples"]:
        smp["filepath"] = fix_path(smp["filepath"])

    write_json(samples, src)

from_label_tag(dataset, label_field, label_tag, **kwargs)

Update a label_field label with its label_tag.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field that contains detections labels
label_tag (str, required): labels that contain this tag will be renamed to it
**kwargs: dataset loading filters

Returns:

dict: updated label values

Source code in finegrained/data/transforms.py
def from_label_tag(dataset: str, label_field: str, label_tag: str, **kwargs) -> dict:
    """Update a label_field label with its label_tag.

    Args:
        dataset: fiftyone dataset name
        label_field: a field that contains detections labels.
        label_tag: labels that contain this tag, will be renamed to it.
        **kwargs: dataset loading filters

    Returns:
        updated label values
    """
    kwargs = kwargs | {"label_tags": label_tag}
    dataset = load_fiftyone_dataset(dataset, **kwargs)

    for smp in tqdm(dataset.select_fields(label_field), desc="updating samples"):
        for det in smp[label_field].detections:
            if label_tag in det.tags:
                det.label = label_tag
                smp.save()

    return dataset.count_values(f"{label_field}.detections.label")

from_labels(dataset, label_field, from_field, **kwargs)

Re-assign classification label to detection labels.

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field with detections to be updated
from_field (str, required): a field with classification to get labels from
**kwargs: dataset loading filters

Source code in finegrained/data/transforms.py
def from_labels(dataset: str, label_field: str, from_field: str, **kwargs):
    """Re-assign classification label to detection labels.

    Args:
        dataset: fiftyone dataset name
        label_field: a field with detections to be updated
        from_field: a field with classification to get labels from
        **kwargs: dataset loading filters
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)
    dataset = dataset.exists(label_field)

    assert dataset.has_sample_field(
        label_field
    ), f"Dataset does not contain {label_field=}."
    assert (
        doc_type := dataset.get_field(label_field).document_type
    ) == fo.Detections, f"{label_field=} has to be of type Detections, got {doc_type=}."
    assert dataset.has_sample_field(
        from_field
    ), f"Dataset does not contain {from_field=}."
    assert (
        doc_type := dataset.get_field(from_field).document_type
    ) == fo.Classification, (
        f"{from_field=} has to be of type Detections, got {doc_type=}."
    )

    for smp in tqdm(dataset.select_fields([label_field, from_field])):
        _update_labels(smp[label_field], smp[from_field].label)
        smp.save()

map_labels(dataset, from_field, to_field, label_mapping=None, overwrite=False, **kwargs)

Create a new dataset field with mapped labels.

Parameters:

dataset (str, required): fiftyone dataset name
from_field (str, required): source label field
to_field (str, required): a new label field
label_mapping (Optional[dict], default None): label mapping (use {}/None for creating a field copy)
overwrite (bool, default False): if to_field already exists, then overwrite it
**kwargs: dataset loading kwargs

Returns:

fo.DatasetView: dataset view

Source code in finegrained/data/transforms.py
def map_labels(
    dataset: str,
    from_field: str,
    to_field: str,
    label_mapping: Optional[dict] = None,
    overwrite: bool = False,
    **kwargs,
) -> fo.DatasetView:
    """Create a new dataset field with mapped labels.

    Args:
        dataset: fiftyone dataset name
        from_field: source label field
        to_field: a new label field
        label_mapping: label mapping (use {}/None for creating a field copy)
        overwrite: if to_field already exists, then overwrite it
        **kwargs: dataset loading kwargs

    Returns:
        dataset view
    """
    dataset = load_fiftyone_dataset(dataset, **kwargs)

    if overwrite and dataset.has_sample_field(to_field):
        delete_field(dataset.dataset_name, to_field)
    elif not overwrite and dataset.has_sample_field(to_field):
        raise ValueError(f"{to_field=} already exists")

    dataset.clone_sample_field(from_field, to_field)
    if bool(label_mapping):
        dataset = dataset.map_labels(to_field, label_mapping)
        dataset.save(to_field)
    return dataset
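
As a sketch (dataset, field, and label names are hypothetical), mapping fine-grained labels onto coarser classes:

from finegrained.data.transforms import map_labels

# clone ground_truth into a new field and collapse two labels into 'car'
map_labels(
    "my_dataset",
    from_field="ground_truth",
    to_field="ground_truth_coarse",
    label_mapping={"sedan": "car", "suv": "car"},
)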

merge_diff(dataset, image_dir, tags=None, recursive=True)

Merge new files into an existing dataset.

Existing files will be skipped, and no labels are expected for new files. Merging is based on the absolute filepath.

Parameters:

dataset (str, required): existing fiftyone dataset
image_dir (str, required): a folder with new files
tags (types.LIST_STR_STR, default None): tag new samples
recursive (bool, default True): search for files in subfolders as well

Returns:

an updated fiftyone dataset

Source code in finegrained/data/transforms.py
def merge_diff(
    dataset: str,
    image_dir: str,
    tags: types.LIST_STR_STR = None,
    recursive: bool = True,
):
    """Merge new files into an existing dataset.

    Existing files will be skipped.
    No labels for new files are expected.
    Merger happens based on an absolute filepath.

    Args:
        dataset: existing fiftyone dataset
        image_dir: a folder with new files
        tags: tag new samples
        recursive: search for files in subfolders as well

    Returns:
        an updated fiftyone dataset
    """
    dataset = load_fiftyone_dataset(dataset)
    second = fo.Dataset.from_images_dir(image_dir, tags=tags, recursive=recursive)
    dataset.merge_samples(second, skip_existing=True)
    return dataset

prefix_label(dataset, label_field, dest_field, prefix)

Prepend each label with given prefix

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): a field with class labels
dest_field (str, required): a new field to create with '<prefix>_<label>' values
prefix (str, required): a prefix value

Returns:

fiftyone dataset object

Source code in finegrained/data/transforms.py
def prefix_label(dataset: str, label_field: str, dest_field: str, prefix: str):
    """Prepend each label with given prefix

    Args:
        dataset: fiftyone dataset name
        label_field: a field with class labels
        dest_field: a new field to create with '<prefix>_<label>' values
        prefix: a prefix value

    Returns:
        fiftyone dataset object
    """
    dataset = load_fiftyone_dataset(dataset)
    values = [
        fo.Classification(label=f"{prefix}_{smp[label_field].label}")
        for smp in dataset.select_fields(label_field)
    ]
    dataset.set_values(dest_field, values)
    return dataset

to_patches(dataset, label_field, to_name, export_dir, overwrite=False, splits=None, **kwargs)

Crop out patches from a dataset and create a new one.

Parameters:

dataset (str, required): a fiftyone dataset with detections
label_field (str | list[str], required): label field(s) with detection, classification or polylines
to_name (str, required): a new dataset name for patches
export_dir (str, required): where to save crops
overwrite (bool, default False): if True and that name already exists, delete it
splits (Optional[list[str]], default None): if provided, these tags will be used to split patches into subsets
**kwargs: dataset filters

Returns:

fo.Dataset: fiftyone dataset object

Source code in finegrained/data/transforms.py
def to_patches(
    dataset: str,
    label_field: str | list[str],
    to_name: str,
    export_dir: str,
    overwrite: bool = False,
    splits: Optional[list[str]] = None,
    **kwargs,
) -> fo.Dataset:
    """Crop out patches from a dataset and create a new one.

    Args:
        dataset: a fiftyone dataset with detections
        label_field: label field(s) with detection, classification or polylines
        to_name: a new dataset name for patches
        export_dir: where to save crops
        overwrite: if True and that name already exists, delete it
        splits: if provided, these tags will be used to split patches into subsets
        **kwargs: dataset filters

    Returns:
        fiftyone dataset object
    """
    export_dir = Path(export_dir)

    # prompt overwriting if dataset or folder exist
    if not overwrite:
        if fo.dataset_exists(to_name):
            raise ValueError(
                f"{to_name=} dataset already exists. Use --overwrite or delete it."
            )
        if export_dir.exists():
            raise ValueError(
                f"{str(export_dir)=} already exists. "
                "User --overwrite or delete it manually"
            )
    else:
        if export_dir.exists():
            shutil.rmtree(export_dir)

    dataset = load_fiftyone_dataset(dataset, **kwargs)
    label_field = parse_list_str(label_field)

    # make sure splits are present if given
    if splits:
        splits = parse_list_str(splits)
        tag_counts = dataset.count_sample_tags()
        assert all(
            [s in tag_counts for s in splits]
        ), f"{dataset.name=} does not contain all {splits=}"

    # export each label field
    for field in label_field:
        assert dataset.has_sample_field(
            field
        ), f"{dataset.name=} does not contain {field=}"
        _export_patches(dataset, field, export_dir, splits)

    # import all together, tag if needed
    new = create_fiftyone_dataset(
        name=to_name,
        src=export_dir if splits is None else None,
        dataset_type=ImageClassificationDirectoryTree,
        overwrite=overwrite,
    )
    if splits:
        for tag in splits:
            new.add_dir(
                dataset_dir=str(export_dir / tag),
                dataset_type=ImageClassificationDirectoryTree,
                tags=tag,
            )
    return new
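
A possible invocation, assuming the dataset already carries train/val tags (all names and paths are hypothetical):

from finegrained.data.transforms import to_patches

# crop labeled regions into an image-classification dataset of patches
patches = to_patches(
    dataset="my_dataset",
    label_field="ground_truth",
    to_name="my_dataset_patches",
    export_dir="/data/my_dataset_patches",
    splits=["train", "val"],
)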

transpose_images(dataset, **kwargs)

Rotate images 90 degrees.

Parameters:

dataset (str, required): fiftyone dataset name
**kwargs: dataset loading filters

Returns:

fo.DatasetView: a dataset view instance

Source code in finegrained/data/transforms.py
def transpose_images(dataset: str, **kwargs) -> fo.DatasetView:
    """Rotate images 90 degrees.

    Args:
        dataset: fiftyone dataset name
        **kwargs: dataset loading filters

    Returns:
        a dataset view instance
    """
    assert len(kwargs) > 0, "Danger: provide dataset filters"

    dataset = load_fiftyone_dataset(dataset, **kwargs)

    for smp in tqdm(dataset.select_fields("filepath"), desc="transposing"):
        Image.open(smp.filepath).transpose(Image.ROTATE_90).save(smp.filepath)

    return dataset

finegrained.data.zoo

Base constructs to use torchvision models.

object_detection(dataset, label_field, conf=0.25, image_size=None, device=None, **kwargs)

Detect COCO objects with mask-rcnn-v2 from torchvision

Parameters:

dataset (str, required): fiftyone dataset name
label_field (str, required): which field to write predictions to
conf (float, default 0.25): box confidence threshold
image_size (default None): if specified, this will be a max image size (to save memory)
**kwargs: dataset loading filters

Returns:

None

Source code in finegrained/data/zoo.py
def object_detection(
    dataset: str,
    label_field: str,
    conf: float = 0.25,
    image_size=None,
    device=None,
    **kwargs
):
    """Detect COCO objects with mask-rcnn-v2 from torchvision

    Args:
        dataset: fiftyone dataset name
        label_field: which field to write predictions to
        conf: box confidence threshold
        image_size: if specified, this will be a max image size
            (to save memory)
        **kwargs: dataset loading filters

    Returns:
        None
    """
    # prepare the model
    weights = MaskRCNN_ResNet50_FPN_V2_Weights.DEFAULT
    model = maskrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=conf)
    device = get_device()[0] if device is None else torch.device(device)
    model.eval().to(device)
    preprocess = weights.transforms()

    dataset = load_fiftyone_dataset(dataset, **kwargs)

    with torch.no_grad():
        for smp in tqdm(dataset.select_fields("filepath"), desc="detecting"):
            # prepare image input
            img = read_image(smp.filepath)
            if image_size:
                img = _resize_image(img, target_size=image_size)
            # predict
            batch = [preprocess(img).to(device)]
            prediction, *_ = model(batch)
            # parse detection and save results
            img_h, img_w = img.size(1), img.size(2)
            detections = _parse_torchvision_detections(
                prediction, weights.meta["categories"], (img_h, img_w)
            )
            smp[label_field] = detections
            smp.save()
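
A minimal sketch (dataset and field names are hypothetical); image_size sets a maximum image size to save memory, as described above:

from finegrained.data.zoo import object_detection

# run torchvision Mask R-CNN v2 and write detections to the 'predictions' field
object_detection(
    "my_dataset",
    label_field="predictions",
    conf=0.4,
    image_size=1024,
)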