Add evaluation scripts for segmentation approaches

karstenBehrendt committed Jul 4, 2019
1 parent 75e856a commit 29cbeb8
Showing 7 changed files with 373 additions and 6 deletions.
Empty file added evaluation/__init__.py
Empty file.
168 changes: 168 additions & 0 deletions evaluation/evaluate_segmentation.py
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
""" The evaluation script for the segmentation part of the unsupervised
llamas dataset.
It calculates AUC and the best precision-recall combination for each class.
The script expects all images to be named according to the label files, i.e.,
recording_folder/label_file.json + '_' + {class integer} + '.png'
The class integers / enums are:
0: background
1: l1
2: l0
3: r0
4: r1
In the binary case, class 1 is enough for the evaluation.
An example image path for r0 (first marker to the right) is:
/PATH_TO_FOLDER/llamas/trained_nets/2019_03_03__17_53_39_multi_marker_net_gradients/
markers-1456725_test/images-2014-12-22-13-22-35_mapping_280S_2nd_lane/
1419283521_0744236000.json_3.png
Use png files for lossless compression.
Files are stored per channel because it is simple. Four-channel images
would not be an issue either, but beyond four channels it may not be too
straightforward.
Make sure to scale predictions to the range 0 to 255 when storing them as
images. For a float array with values between 0 and 1, cv2.imwrite may write
only zeros and ones, even though cv2.imshow visualizes it correctly.
Usage:
python3 evaluate_segmentation.py \
--inference_folder folder_with_stored_inference_images
    --multi_class (optional, set for multi-class instead of binary evaluation)
"""
# TODO Needs to be tested and needs docstrings
# TODO The binary and multi_class evaluation can probably be combined
# by just checking which files exist
# TODO The multithreading call can be implemented in a cleaner way

import argparse
import concurrent.futures
import os

import cv2
import tqdm

from unsupervised_llamas.evaluation import segmentation_metrics
from unsupervised_llamas.label_scripts import dataset_constants
from unsupervised_llamas.label_scripts import helper_scripts
from unsupervised_llamas.label_scripts import segmentation_labels


def binary_eval_single_image(inputs):
# Single argument call for the threaded function.
# This can probably be implemented in a cleaner way.
return single_threaded_binary_eval_single_image(inputs[0], inputs[1])


def multi_eval_single_image(inputs):
# Single argument call for the threaded function.
# This can probably be implemented in a cleaner way.
return single_threaded_multi_eval_single_image(inputs[0], inputs[1])
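# NOTE A possibly cleaner alternative (a sketch, not part of this commit):
# bind the constant argument with functools.partial and map over the label
# paths directly, e.g.
#     eval_function = functools.partial(
#         single_threaded_binary_eval_single_image,
#         segmentation_folder=segmentation_folder)
#     executor.map(eval_function, label_paths)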


def single_threaded_multi_eval_single_image(label_path, segmentation_folder):
target = segmentation_labels.create_multi_class_segmentation_label(label_path)

results = {}
for i in range(5):
        # TODO Needs to be adapted for additional, farther-out lanes
# Currently (in order) background, l1, l0, r0, r1
segmentation_path = os.path.join(
segmentation_folder,
helper_scripts.get_label_base(label_path)) + '_{}.png'.format(i)

segmentation = cv2.imread(segmentation_path, cv2.IMREAD_GRAYSCALE).astype(float) / 255
results[i] = segmentation_metrics.binary_approx_auc(segmentation, target[:, :, i])
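        # each results[i] is a dict {'recall': ..., 'precision': ..., 'auc': ...}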

return results


def single_threaded_binary_eval_single_image(label_path, segmentation_folder):
target = segmentation_labels.create_binary_segmentation_label(label_path)

segmentation_path = os.path.join(
segmentation_folder,
helper_scripts.get_label_base(label_path)) + '_1.png'
segmentation = cv2.imread(segmentation_path, cv2.IMREAD_GRAYSCALE).astype(float) / 255

results = segmentation_metrics.binary_approx_auc(segmentation, target)
return results


def evaluate_set(segmentation_folder, eval_function, dataset_split='test', max_workers=8):
""" Runs evaluation for a given image folder
Parameters
----------
segmentation_folder : str
folder with predictions / inference images according to docstring
eval_function : function
Currently the binary or multi-class evaluation function
dataset_split : str
'train', 'valid', or 'test'. Calculates metrics for that split.
max_workers : int
Number of threads to use
Returns
-------
    Dictionary with approximate AUC and best precision-recall combination for each class
Raises
------
    IOError if the label folder or the segmentation folder does not exist
Notes
-----
Use max_workers=1 for single threaded call. This makes debugging a lot easier.
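    Example (illustrative folder name, single threaded):
        evaluate_set('/tmp/my_net/inference_images', binary_eval_single_image,
                     dataset_split='valid', max_workers=1)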
"""
label_folder = os.path.join(dataset_constants.LABELS, dataset_split)
if not os.path.isdir(label_folder):
raise IOError('Could not find labels for split {} at {}'.format(
dataset_split, label_folder))
label_paths = helper_scripts.get_labels(dataset_split)

if not os.path.isdir(segmentation_folder):
        raise IOError(
            'Could not find segmentation folder at {}'.format(segmentation_folder))

# This still takes a couple of hours.
eval_dicts = {}
if max_workers > 1:
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
for label_path, single_eval in tqdm.tqdm(
zip(label_paths, executor.map(
eval_function, zip(label_paths, [segmentation_folder] * len(label_paths)))),
desc='Scoring test samples', total=len(label_paths)):
eval_dicts[label_path] = single_eval
else: # mainly for debugging
for label_path in tqdm.tqdm(
label_paths, desc='Scoring test samples', total=len(label_paths)):
eval_dicts[label_path] = eval_function((label_path, segmentation_folder))

# The reduce step. Calculates averages
eval_keys = list(list(eval_dicts.values())[0].keys())
averaged_results = {key: 0 for key in eval_keys}
for eval_dict in eval_dicts.values():
for key, value in eval_dict.items():
averaged_results[key] += value
for key in eval_keys:
averaged_results[key] /= len(label_paths)

print(segmentation_folder, '\n', averaged_results)
return averaged_results


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--inference_folder', type=str, required=True,
help='Folder of inference images, see docstring')
parser.add_argument('--multi_class', action='store_true')
return parser.parse_args()


if __name__ == '__main__':
args = parse_args()
eval_function = multi_eval_single_image if args.multi_class else binary_eval_single_image
evaluate_set(args.inference_folder, eval_function)
47 changes: 47 additions & 0 deletions evaluation/fix_inference_output_names.py
@@ -0,0 +1,47 @@
#!/usr/bin/env python
"""
A quick script to adapt inference outputs to the naming schema of the
evaluation scripts. Not needed if files are already named accordingly.
"""
import argparse
import os

import tqdm

from unsupervised_llamas.label_scripts import helper_scripts


def fix_names(input_folder, input_string, output_string):
""" Changes all names within folder according to parameters
Parameters
----------
input_folder : str
folder containing inference images
input_string : str
        substring to be replaced within each image path
    output_string : str
        replacement for the input_string
Notes
-----
    This function is only needed if the stored inference images don't follow
    the expected naming convention in the first place.
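
    Example (illustrative strings):
        fix_names('/tmp/inference_images', '_prediction.png', '.json_1.png')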
"""
segmentation_images = helper_scripts.get_files_from_folder(input_folder, '.png')
for segmentation_image in tqdm.tqdm(segmentation_images, desc='renaming images'):
output_path = segmentation_image.replace(input_string, output_string)
os.rename(segmentation_image, output_path)


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--input_folder', type=str, required=True)
parser.add_argument('--input_string', type=str, required=True)
parser.add_argument('--output_string', type=str, required=True)
return parser.parse_args()


if __name__ == '__main__':
args = parse_args()
fix_names(args.input_folder, args.input_string, args.output_string)
140 changes: 140 additions & 0 deletions evaluation/segmentation_metrics.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Calculates
true positives (tp)
false positives (fp)
true negatives (tn)
false negatives (fn)
precision
recall
average precision / AUC / PR curves
Additional metrics are welcome
One problem with lane marker segmentation is that the absolute number of correctly
classified pixels often is not helpful, because background pixels far outweigh
the lane marker pixels. In absolute terms, marking all pixels as background would
likely score best, which is not helpful for the problem at hand.
Notes
-----
Don't use Python2. There may be integer divisions that I missed.
Options for calculating AUC / Precision Recall curve
1)
It may be faster to sort (prediction, label) pixels by probability and
go through those. O(n log n) in the number of pixels per image.
Sorting takes about .36 seconds on my current system.
Expected speedup should be about 50%
2)
Bucket sort is possible as well. O(n) to put probabilities into k buckets,
O(n) to calculate the PR curve / AUC. May be faster than using sort().
sort(), however, may be implemented in C. Still an approximation, like 3).
3) * current implementation. It was easy and can be replaced any time.
O(k * n), k being the number of threshold steps,
which is not as accurate but may leverage the C/C++ numpy backend.
tp/tn/fp/fn take about one second to calculate
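A sketch of option 1 (illustrative only; assumes binary labels and at least
one foreground pixel, and mirrors sklearn.metrics.precision_recall_curve):
    order = numpy.argsort(-prediction.ravel())
    hits = label.ravel()[order]
    tps = numpy.cumsum(hits)
    precisions = tps / numpy.arange(1, hits.size + 1)
    recalls = tps / hits.sum()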
"""
# NOTE There should be tests

import numpy


def _debug_view(prediction, label):
""" Shows prediction and label for visual debugging """
prediction = (prediction * 255).astype(numpy.uint8)
label = (label * 255).astype(numpy.uint8)
    c = numpy.zeros_like(prediction)  # empty third channel, matches image size

debug_image = numpy.stack((prediction, label, c), axis=-1)
import cv2 # Not forcing cv2 dependency for metrics
cv2.imshow('debug_image', debug_image)
cv2.waitKey(1000)


def thresholded_binary(prediction, threshold):
""" Thresholds prediction to 0 and 1 according to threshold """
return (prediction >= threshold).astype(int)


def true_positive(prediction, label):
""" Calculates number of correctly classified foreground pixels """
num_tp = numpy.sum(numpy.logical_and(label != 0, prediction == label))
return num_tp


def false_positive(prediction, label):
""" Calculates number of incorrectly predicted foreground pixels """
num_fp = numpy.sum(numpy.logical_and(label == 0, prediction != 0))
return num_fp


def true_negative(prediction, label):
""" Calculates number of correctly identified background pixels """
num_tn = numpy.sum(numpy.logical_and(label == 0, prediction == label))
return num_tn


def false_negative(prediction, label):
""" Calculates number of missed foreground pixels """
num_fn = numpy.sum(numpy.logical_and(label != 0, prediction == 0))
return num_fn


def binary_approx_auc(prediction, label):
""" Calculates approximated auc and best precision-recall combination
Parameters
----------
prediction : numpy.ndarray
raw prediction output in [0, 1]
label : numpy.ndarray
target / label, values are either 0 or 1
Returns
-------
Dict of approximate AUC, "corner" precision, "corner" recall
{'precision', 'recall', 'auc'}
Notes
-----
See docstring for alternative implementation options
Approximated by 100 uniform thresholds between 0 and 1
"""
# NOTE May achieve speedup by checking if label is all zeros
num_steps = 100
auc_value = 0

    # Upper-right-most precision/recall point, i.e., maximum precision * recall
corner_precision = 0
corner_recall = 0
corner_auc = 0

precisions = [1]
recalls = [0]

# Individual precision recall evaluation for those steps
for i in range(num_steps + 1):
threshold = (num_steps - i) / num_steps
thresholded_prediction = thresholded_binary(prediction, threshold)

# tn = true_negative(thresholded_prediction, label)
tp = true_positive(thresholded_prediction, label)
fn = false_negative(thresholded_prediction, label)
fp = false_positive(thresholded_prediction, label)

precision = 0 if (tp + fp) == 0 else tp / (tp + fp)
recall = 0 if (tp + fn) == 0 else tp / (tp + fn)

if (precision * recall) > corner_auc:
corner_auc = precision * recall
corner_precision = precision
corner_recall = recall

precisions.append(precision)
recalls.append(recall)

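        # Left Riemann sum over the PR curve: the recall increment is the step
        # width, the previous precision the step height.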
auc_value += (recalls[-1] - recalls[-2]) * precisions[-2]

return {'recall': corner_recall, 'precision': corner_precision, 'auc': auc_value}
4 changes: 2 additions & 2 deletions label_scripts/dataset_constants.py
@@ -25,11 +25,11 @@
# Multi-class segmentation colors for the individual lanes
# The names are based on the camera location, e.g. the markers
# from r2 divide the first lane to the right from the second to the right
-DCOLORS = [(75, 25, 230), (75, 180, 60), (200, 130, 0), (48, 130, 245), (180, 30, 145),
+DCOLORS = [(110, 30, 30), (75, 25, 230), (75, 180, 60), (200, 130, 0), (48, 130, 245), (180, 30, 145),
(0, 0, 255), (24, 140, 34), (255, 0, 0), (0, 255, 255), # the main ones
(40, 110, 170), (200, 250, 255), (255, 190, 230), (0, 0, 128), (195, 255, 170),
(0, 128, 128), (195, 255, 170), (75, 25, 230)]
-LANE_NAMES = ['l6', 'l5', 'l4', 'l3', 'l2',
+LANE_NAMES = ['l7', 'l6', 'l5', 'l4', 'l3', 'l2',
'l1', 'l0', 'r0', 'r1',
'r2', 'r3', 'r4', 'r5',
'r6', 'r7', 'r8']
13 changes: 13 additions & 0 deletions label_scripts/helper_scripts.py
@@ -3,6 +3,14 @@
"""

import os
from unsupervised_llamas.label_scripts import dataset_constants


def get_labels(split='test'):
""" Gets label files of specified dataset split """
label_paths = get_files_from_folder(
os.path.join(dataset_constants.LABELS, split), '.json')
return label_paths


def get_files_from_folder(directory, extension=None):
@@ -15,3 +15,8 @@ def get_files_from_folder(directory, extension=None):
if extension is not None:
label_files = list(filter(lambda x: x.endswith(extension), label_files))
return label_files


def get_label_base(label_path):
""" Gets directory independent label path """
return '/'.join(label_path.split('/')[-2:])
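
# e.g. get_label_base('/somewhere/recording_folder/label_file.json')
# returns 'recording_folder/label_file.json'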
