Add evaluation scripts for segmentation approaches

karstenBehrendt committed Jul 4, 2019
1 parent 75e856a commit 29cbeb8
Showing 7 changed files with 373 additions and 6 deletions.
Empty file added evaluation/__init__.py
Empty file.
168 changes: 168 additions & 0 deletions evaluation/evaluate_segmentation.py
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
""" The evaluation script for the segmentation part of the unsupervised
llamas dataset.
It calculates AUC and the best precision-recall combination for each class.
The script expects all images to be named according to the label files, i.e.,
recording_folder/label_file.json + '_' + {class integer} + '.png'
The class integers / enums are:
0: background
1: l1
2: l0
3: r0
4: r1
In the binary case, class 1 is enough for the evaluation.
An example image path for r0 (first marker to the right) is:
/PATH_TO_FOLDER/llamas/trained_nets/2019_03_03__17_53_39_multi_marker_net_gradients/
markers-1456725_test/images-2014-12-22-13-22-35_mapping_280S_2nd_lane/
1419283521_0744236000.json_3.png
Use png files for lossless compression.
Files are stored per channel because it is simple. Four-channel images
would not be an issue either, but beyond four channels it may not be too
straightforward.
Make sure to scale predictions to the range 0 to 255 when storing them as
images. For a float array with values between 0 and 1, cv2.imwrite may write
only zeros and ones, even though cv2.imshow visualizes it correctly.
Usage:
python3 evaluate_segmentation.py \
--inference_folder folder_with_stored_inference_images
    --multi_class (optional, set for multi-class instead of binary evaluation)
"""
# TODO Needs to be tested and needs docstrings
# TODO The binary and multi_class evaluation can probably be combined
# by just checking which files exist
# TODO The multithreading call can be implemented in a cleaner way

import argparse
import concurrent.futures
import os

import cv2
import tqdm

from unsupervised_llamas.evaluation import segmentation_metrics
from unsupervised_llamas.label_scripts import dataset_constants
from unsupervised_llamas.label_scripts import helper_scripts
from unsupervised_llamas.label_scripts import segmentation_labels


def binary_eval_single_image(inputs):
# Single argument call for the threaded function.
# This can probably be implemented in a cleaner way.
return single_threaded_binary_eval_single_image(inputs[0], inputs[1])


def multi_eval_single_image(inputs):
# Single argument call for the threaded function.
# This can probably be implemented in a cleaner way.
return single_threaded_multi_eval_single_image(inputs[0], inputs[1])
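# NOTE A possibly cleaner alternative (a sketch, not part of this commit):
# bind the constant argument with functools.partial and map over the label
# paths directly, e.g.
#     eval_function = functools.partial(
#         single_threaded_binary_eval_single_image,
#         segmentation_folder=segmentation_folder)
#     executor.map(eval_function, label_paths)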


def single_threaded_multi_eval_single_image(label_path, segmentation_folder):
target = segmentation_labels.create_multi_class_segmentation_label(label_path)

results = {}
for i in range(5):
        # TODO Needs to be adapted for additional, farther-out lanes
# Currently (in order) background, l1, l0, r0, r1
segmentation_path = os.path.join(
segmentation_folder,
helper_scripts.get_label_base(label_path)) + '_{}.png'.format(i)

segmentation = cv2.imread(segmentation_path, cv2.IMREAD_GRAYSCALE).astype(float) / 255
results[i] = segmentation_metrics.binary_approx_auc(segmentation, target[:, :, i])
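        # each results[i] is a dict {'recall': ..., 'precision': ..., 'auc': ...}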

return results


def single_threaded_binary_eval_single_image(label_path, segmentation_folder):
target = segmentation_labels.create_binary_segmentation_label(label_path)

segmentation_path = os.path.join(
segmentation_folder,
helper_scripts.get_label_base(label_path)) + '_1.png'
segmentation = cv2.imread(segmentation_path, cv2.IMREAD_GRAYSCALE).astype(float) / 255

results = segmentation_metrics.binary_approx_auc(segmentation, target)
return results


def evaluate_set(segmentation_folder, eval_function, dataset_split='test', max_workers=8):
""" Runs evaluation for a given image folder
Parameters
----------
segmentation_folder : str
folder with predictions / inference images according to docstring
eval_function : function
Currently the binary or multi-class evaluation function
dataset_split : str
'train', 'valid', or 'test'. Calculates metrics for that split.
max_workers : int
Number of threads to use
Returns
-------
    Dictionary with approximate AUC and best precision-recall combination for each class
Raises
------
    IOError if the label folder or the segmentation folder does not exist
Notes
-----
Use max_workers=1 for single threaded call. This makes debugging a lot easier.
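    Example (illustrative folder name, single threaded):
        evaluate_set('/tmp/my_net/inference_images', binary_eval_single_image,
                     dataset_split='valid', max_workers=1)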
"""
label_folder = os.path.join(dataset_constants.LABELS, dataset_split)
if not os.path.isdir(label_folder):
raise IOError('Could not find labels for split {} at {}'.format(
dataset_split, label_folder))
label_paths = helper_scripts.get_labels(dataset_split)

if not os.path.isdir(segmentation_folder):
        raise IOError(
            'Could not find segmentation folder at {}'.format(segmentation_folder))

# This still takes a couple of hours.
eval_dicts = {}
if max_workers > 1:
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
for label_path, single_eval in tqdm.tqdm(
zip(label_paths, executor.map(
eval_function, zip(label_paths, [segmentation_folder] * len(label_paths)))),
desc='Scoring test samples', total=len(label_paths)):
eval_dicts[label_path] = single_eval
else: # mainly for debugging
for label_path in tqdm.tqdm(
label_paths, desc='Scoring test samples', total=len(label_paths)):
eval_dicts[label_path] = eval_function((label_path, segmentation_folder))

# The reduce step. Calculates averages
eval_keys = list(list(eval_dicts.values())[0].keys())
averaged_results = {key: 0 for key in eval_keys}
for eval_dict in eval_dicts.values():
for key, value in eval_dict.items():
averaged_results[key] += value
for key in eval_keys:
averaged_results[key] /= len(label_paths)

print(segmentation_folder, '\n', averaged_results)
return averaged_results


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--inference_folder', type=str, required=True,
help='Folder of inference images, see docstring')
parser.add_argument('--multi_class', action='store_true')
return parser.parse_args()


if __name__ == '__main__':
args = parse_args()
eval_function = multi_eval_single_image if args.multi_class else binary_eval_single_image
evaluate_set(args.inference_folder, eval_function)
47 changes: 47 additions & 0 deletions evaluation/fix_inference_output_names.py
@@ -0,0 +1,47 @@
#!/usr/bin/env python
"""
A quick script to adapt inference outputs to the naming schema of the
evaluation scripts. Not needed if files are already named accordingly.
"""
import argparse
import os

import tqdm

from unsupervised_llamas.label_scripts import helper_scripts


def fix_names(input_folder, input_string, output_string):
""" Changes all names within folder according to parameters
Parameters
----------
input_folder : str
folder containing inference images
input_string : str
        substring to be replaced within each image path
    output_string : str
        replacement for the input_string
Notes
-----
    This function is only needed if the stored inference images don't follow
    the expected naming convention in the first place.
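
    Example (illustrative strings):
        fix_names('/tmp/inference_images', '_prediction.png', '.json_1.png')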
"""
segmentation_images = helper_scripts.get_files_from_folder(input_folder, '.png')
for segmentation_image in tqdm.tqdm(segmentation_images, desc='renaming images'):
output_path = segmentation_image.replace(input_string, output_string)
os.rename(segmentation_image, output_path)


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--input_folder', type=str, required=True)
parser.add_argument('--input_string', type=str, required=True)
parser.add_argument('--output_string', type=str, required=True)
return parser.parse_args()


if __name__ == '__main__':
args = parse_args()
fix_names(args.input_folder, args.input_string, args.output_string)
140 changes: 140 additions & 0 deletions evaluation/segmentation_metrics.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Calculates
true positives (tp)
false positives (fp)
true negatives (tn)
false negatives (fn)
precision
recall
average precision / AUC / PR curves
Additional metrics are welcome
One problem with lane marker segmentation is that the absolute number of correctly
classified pixels often is not helpful, because background pixels far outweigh
the lane marker pixels. In absolute terms, marking all pixels as background would
likely score best, which is not helpful for the problem at hand.
Notes
-----
Don't use Python2. There may be integer divisions that I missed.
Options for calculating AUC / Precision Recall curve
1)
It may be faster to sort (prediction, label) pixels by probability and
go through those. O(n log n) in the number of pixels per image.
Sorting takes about .36 seconds on my current system.
Expected speedup should be about 50%
2)
Bucket sort is possible as well. O(n) to put probabilities into k buckets,
O(n) to calculate the PR curve / AUC. May be faster than using sort().
sort(), however, may be implemented in C. Still an approximation, like 3).
3) * current implementation. It was easy and can be replaced any time.
O(k * n), k being the number of threshold steps,
which is not as accurate but may leverage the C/C++ numpy backend.
tp/tn/fp/fn take about one second to calculate
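A sketch of option 1 (illustrative only; assumes binary labels and at least
one foreground pixel, and mirrors sklearn.metrics.precision_recall_curve):
    order = numpy.argsort(-prediction.ravel())
    hits = label.ravel()[order]
    tps = numpy.cumsum(hits)
    precisions = tps / numpy.arange(1, hits.size + 1)
    recalls = tps / hits.sum()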
"""
# NOTE There should be tests

import numpy


def _debug_view(prediction, label):
""" Shows prediction and label for visual debugging """
prediction = (prediction * 255).astype(numpy.uint8)
label = (label * 255).astype(numpy.uint8)
    c = numpy.zeros_like(prediction)  # empty third channel, matches image size

debug_image = numpy.stack((prediction, label, c), axis=-1)
import cv2 # Not forcing cv2 dependency for metrics
cv2.imshow('debug_image', debug_image)
cv2.waitKey(1000)


def thresholded_binary(prediction, threshold):
""" Thresholds prediction to 0 and 1 according to threshold """
return (prediction >= threshold).astype(int)


def true_positive(prediction, label):
""" Calculates number of correctly classified foreground pixels """
num_tp = numpy.sum(numpy.logical_and(label != 0, prediction == label))
return num_tp


def false_positive(prediction, label):
""" Calculates number of incorrectly predicted foreground pixels """
num_fp = numpy.sum(numpy.logical_and(label == 0, prediction != 0))
return num_fp


def true_negative(prediction, label):
""" Calculates number of correctly identified background pixels """
num_tn = numpy.sum(numpy.logical_and(label == 0, prediction == label))
return num_tn


def false_negative(prediction, label):
""" Calculates number of missed foreground pixels """
num_fn = numpy.sum(numpy.logical_and(label != 0, prediction == 0))
return num_fn


def binary_approx_auc(prediction, label):
""" Calculates approximated auc and best precision-recall combination
Parameters
----------
prediction : numpy.ndarray
raw prediction output in [0, 1]
label : numpy.ndarray
target / label, values are either 0 or 1
Returns
-------
Dict of approximate AUC, "corner" precision, "corner" recall
{'precision', 'recall', 'auc'}
Notes
-----
See docstring for alternative implementation options
Approximated by 100 uniform thresholds between 0 and 1
"""
# NOTE May achieve speedup by checking if label is all zeros
num_steps = 100
auc_value = 0

    # Upper-right-most precision/recall point, i.e., maximum precision * recall
corner_precision = 0
corner_recall = 0
corner_auc = 0

precisions = [1]
recalls = [0]

# Individual precision recall evaluation for those steps
for i in range(num_steps + 1):
threshold = (num_steps - i) / num_steps
thresholded_prediction = thresholded_binary(prediction, threshold)

# tn = true_negative(thresholded_prediction, label)
tp = true_positive(thresholded_prediction, label)
fn = false_negative(thresholded_prediction, label)
fp = false_positive(thresholded_prediction, label)

precision = 0 if (tp + fp) == 0 else tp / (tp + fp)
recall = 0 if (tp + fn) == 0 else tp / (tp + fn)

if (precision * recall) > corner_auc:
corner_auc = precision * recall
corner_precision = precision
corner_recall = recall

precisions.append(precision)
recalls.append(recall)

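        # Left Riemann sum over the PR curve: the recall increment is the step
        # width, the previous precision the step height.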
auc_value += (recalls[-1] - recalls[-2]) * precisions[-2]

return {'recall': corner_recall, 'precision': corner_precision, 'auc': auc_value}
4 changes: 2 additions & 2 deletions label_scripts/dataset_constants.py
@@ -25,11 +25,11 @@
# Multi-class segmentation colors for the individual lanes
# The names are based on the camera location, e.g. the markers
# from r2 divide the first lane to the right from the second to the right
-DCOLORS = [(75, 25, 230), (75, 180, 60), (200, 130, 0), (48, 130, 245), (180, 30, 145),
+DCOLORS = [(110, 30, 30), (75, 25, 230), (75, 180, 60), (200, 130, 0), (48, 130, 245), (180, 30, 145),
(0, 0, 255), (24, 140, 34), (255, 0, 0), (0, 255, 255), # the main ones
(40, 110, 170), (200, 250, 255), (255, 190, 230), (0, 0, 128), (195, 255, 170),
(0, 128, 128), (195, 255, 170), (75, 25, 230)]
-LANE_NAMES = ['l6', 'l5', 'l4', 'l3', 'l2',
+LANE_NAMES = ['l7', 'l6', 'l5', 'l4', 'l3', 'l2',
'l1', 'l0', 'r0', 'r1',
'r2', 'r3', 'r4', 'r5',
'r6', 'r7', 'r8']
13 changes: 13 additions & 0 deletions label_scripts/helper_scripts.py
@@ -3,6 +3,14 @@
"""

import os
from unsupervised_llamas.label_scripts import dataset_constants


def get_labels(split='test'):
""" Gets label files of specified dataset split """
label_paths = get_files_from_folder(
os.path.join(dataset_constants.LABELS, split), '.json')
return label_paths


def get_files_from_folder(directory, extension=None):
@@ -15,3 +15,8 @@ def get_files_from_folder(directory, extension=None):
if extension is not None:
label_files = list(filter(lambda x: x.endswith(extension), label_files))
return label_files


def get_label_base(label_path):
""" Gets directory independent label path """
return '/'.join(label_path.split('/')[-2:])
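
# e.g. get_label_base('/somewhere/recording_folder/label_file.json')
# returns 'recording_folder/label_file.json'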
