diff --git a/docs/model_zoo/segmentation.rst b/docs/model_zoo/segmentation.rst
index d69a98cea8..734a63c3f3 100644
--- a/docs/model_zoo/segmentation.rst
+++ b/docs/model_zoo/segmentation.rst
@@ -31,7 +31,7 @@ Table of pre-trained models for semantic segmentation and their performance.
   The test script :download:`Download test.py<../../scripts/segmentation/test.py>` can be used for
   evaluating the models (VOC results are evaluated using the official server). For example ``fcn_resnet50_ade``::
 
-    python test.py --dataset ade20k --model-zoo fcn_resnet50_ade --eval
+    python test.py --dataset ade20k --pretrained --model fcn --backbone resnet50 --eval
 
   The training commands work with the script: :download:`Download train.py<../../scripts/segmentation/train.py>`
 
diff --git a/gluoncv/utils/metrics/segmentation.py b/gluoncv/utils/metrics/segmentation.py
index 310379942e..a0d3f70579 100644
--- a/gluoncv/utils/metrics/segmentation.py
+++ b/gluoncv/utils/metrics/segmentation.py
@@ -7,15 +7,19 @@
 __all__ = ['SegmentationMetric', 'batch_pix_accuracy', 'batch_intersection_union',
            'pixelAccuracy', 'intersectionAndUnion']
 
+
 class SegmentationMetric(EvalMetric):
     """Computes pixAcc and mIoU metric scores
     """
+
+
     def __init__(self, nclass):
         super(SegmentationMetric, self).__init__('pixAcc & mIoU')
         self.nclass = nclass
         self.lock = threading.Lock()
         self.reset()
 
+
     def update(self, labels, preds):
         """Updates the internal evaluation result.
 
@@ -27,6 +31,8 @@ def update(self, labels, preds):
         preds : 'NDArray' or list of `NDArray`
             Predicted values.
         """
+
+
         def evaluate_worker(self, label, pred):
             correct, labeled = batch_pix_accuracy(
                 pred, label)
@@ -38,18 +44,20 @@ def evaluate_worker(self, label, pred):
                 self.total_inter += inter
                 self.total_union += union
 
+
         if isinstance(preds, mx.nd.NDArray):
             evaluate_worker(self, labels, preds)
         elif isinstance(preds, (list, tuple)):
             threads = [threading.Thread(target=evaluate_worker,
                                         args=(self, label, pred),
-                                       )
+                                        )
                        for (label, pred) in zip(labels, preds)]
             for thread in threads:
                 thread.start()
             for thread in threads:
                 thread.join()
 
+
     def get(self):
         """Gets the current evaluation result.
 
@@ -63,6 +71,7 @@ def get(self):
         mIoU = IoU.mean()
         return pixAcc, mIoU
 
+
     def reset(self):
         """Resets the internal evaluation result to initial state."""
         self.total_inter = 0
@@ -70,6 +79,7 @@ def reset(self):
         self.total_correct = 0
         self.total_label = 0
 
+
 def batch_pix_accuracy(output, target):
     """PixAcc"""
     # inputs are NDarray, output 4D, target 3D
@@ -79,7 +89,7 @@ def batch_pix_accuracy(output, target):
     target = target.asnumpy().astype('int64') + 1
 
     pixel_labeled = np.sum(target > 0)
-    pixel_correct = np.sum((predict == target)*(target > 0))
+    pixel_correct = np.sum((predict == target) * (target > 0))
 
     assert pixel_correct <= pixel_labeled, "Correct area should be smaller than Labeled"
     return pixel_correct, pixel_labeled
@@ -119,7 +129,7 @@ def pixelAccuracy(imPred, imLab):
     # Remove classes from unlabeled pixels in gt image.
     # We should not penalize detections in unlabeled portions of the image.
     pixel_labeled = np.sum(imLab > 0)
-    pixel_correct = np.sum((imPred == imLab)*(imLab > 0))
+    pixel_correct = np.sum((imPred == imLab) * (imLab > 0))
     pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
     return (pixel_accuracy, pixel_correct, pixel_labeled)
 
diff --git a/scripts/segmentation/test.py b/scripts/segmentation/test.py
index c271c0b152..14081440bd 100644
--- a/scripts/segmentation/test.py
+++ b/scripts/segmentation/test.py
@@ -120,7 +120,7 @@ def test(model, args, input_transform):
     tbar = tqdm(test_data)
     for i, (data, dsts) in enumerate(tbar):
         if args.eval:
-            predicts = [pred[0] for pred in evaluator.parallel_forward(data)]
+            predicts = [pred for pred in evaluator.parallel_forward(data)]
             targets = [target.as_in_context(predicts[0].context) \
                        for target in dsts]
             metric.update(targets, predicts)