Skip to main content

Command Palette

Search for a command to run...

How to Run YOLOv8 Inference Directly in Golang (with ONNX)

And mark the detected objects in the output image

Updated
7 min read
How to Run YOLOv8 Inference Directly in Golang (with ONNX)
M

Software Engineer x Data Engineer - I make the world a better place to live with software that enables data-driven decision-making

This is a focused how-to article; I assume you already know what YOLO is and have basic Golang knowledge.

Notes:

  • the inference example is based on the yalue examples

  • the code used in this article is adapted to macOS with the ARM architecture; for other OSes and architectures, you will need the libs available here, and an example of how to use them is located here

Step 1: Convert YOLO to ONNX

First we need to convert the YOLOv8 model to the ONNX format. To do this we’ll install the ultralytics package with pip and use the yolo export command.

The image size here - 640 - is important, it has to be the same size that we’ll use later in our code.

mkdir yolov8 && cd yolov8

# Create and enable virtual env.
python3.10 -m venv env
source env/bin/activate

pip install ultralytics

yolo export model=yolov8n.pt format=onnx imgsz=640

# Quit virtual env.
deactivate

The output should be similar to:

(...)
ONNX: starting export with onnx 1.17.0 opset 17...
ONNX: slimming with onnxslim 0.1.61...
ONNX: export success ✅ 19.8s, saved as 'yolov8n.onnx' (12.2 MB)

Export complete (20.5s)
Results saved to /(...)/yolov8
Predict:         yolo predict task=detect model=yolov8n.onnx imgsz=640  
Validate:        yolo val task=detect model=yolov8n.onnx imgsz=640 data=coco.yaml  
Visualize:       https://netron.app
💡 Learn more at https://docs.ultralytics.com/modes/export

This will create a yolov8n.onnx file with the onnx YOLOv8 model. Please be sure to copy the model to your directory.

I’ve already included a converted model file in the article full code example.

Step 2: Load Image File

pic, e := loadImageFile(imagePath)
if e != nil {
    fmt.Printf("error loading input image: %s\n", e)

    return 1
}

(...)

// loadImageFile opens the file at filePath and decodes it into an
// image.Image using whichever image decoders are registered with the
// image package. The file handle is always closed before returning;
// a close failure is only logged, since the decoded image is already
// safely in memory at that point.
func loadImageFile(filePath string) (image.Image, error) {
    f, err := os.Open(filePath)
    if err != nil {
        return nil, fmt.Errorf("error opening %s: %w", filePath, err)
    }
    defer func() {
        if cerr := f.Close(); cerr != nil {
            fmt.Printf("error closing %s: %v\n", filePath, cerr)
        }
    }()

    img, _, err := image.Decode(f)
    if err != nil {
        return nil, fmt.Errorf("error decoding %s: %w", filePath, err)
    }

    return img, nil
}

Step 3: Init ONNX Session

modelSession, e := initSession()
if e != nil {
    fmt.Printf("Error creating session and tensors: %s\n", e)

    return 1
}
defer modelSession.Destroy()

(...)

// initSession initializes the ONNX Runtime environment and builds the
// fixed-shape input/output tensors plus an advanced session for the
// YOLOv8 model at modelPath.
//
// Fix: on every error path after InitializeEnvironment succeeds, the
// original leaked the ORT environment. Each failure branch now releases
// everything created so far (tensors and environment), so the caller
// never has to clean up a partially constructed session.
func initSession() (*ModelSession, error) {
    ort.SetSharedLibraryPath(sharedLibPath)

    err := ort.InitializeEnvironment()
    if err != nil {
        return nil, fmt.Errorf("error initializing ORT environment: %w", err)
    }

    // YOLOv8 expects a 1x3x640x640 float32 input named "images".
    inputShape := ort.NewShape(1, 3, 640, 640)

    inputTensor, err := ort.NewEmptyTensor[float32](inputShape)
    if err != nil {
        ort.DestroyEnvironment()
        return nil, fmt.Errorf("error creating input tensor: %w", err)
    }

    // YOLOv8 produces a 1x84x8400 float32 output named "output0":
    // 4 box coordinates + 80 class scores for each of 8400 candidates.
    outputShape := ort.NewShape(1, 84, 8400)

    outputTensor, err := ort.NewEmptyTensor[float32](outputShape)
    if err != nil {
        inputTensor.Destroy()
        ort.DestroyEnvironment()
        return nil, fmt.Errorf("error creating output tensor: %w", err)
    }

    options, err := ort.NewSessionOptions()
    if err != nil {
        inputTensor.Destroy()
        outputTensor.Destroy()
        ort.DestroyEnvironment()
        return nil, fmt.Errorf("error creating ORT session options: %w", err)
    }
    // Options are only needed during session construction.
    defer options.Destroy()

    session, err := ort.NewAdvancedSession(modelPath,
        []string{"images"}, []string{"output0"},
        []ort.ArbitraryTensor{inputTensor},
        []ort.ArbitraryTensor{outputTensor},
        options)
    if err != nil {
        inputTensor.Destroy()
        outputTensor.Destroy()
        ort.DestroyEnvironment()
        return nil, fmt.Errorf("error creating ORT session: %w", err)
    }

    return &ModelSession{
        Session: session,
        Input:   inputTensor,
        Output:  outputTensor,
    }, nil
}

Step 4: Prepare Input

This is where we need to use the data from the image and fill the YOLO input tensor with it:

e = prepareInput(pic, modelSession.Input)
if e != nil {
    fmt.Printf("Error converting image to network input: %s\n", e)

    return 1
}

(...)

// Populates a YOLOv8n input tensor with the contents of the given image.
func prepareInput(pic image.Image, dst *ort.Tensor[float32]) error {
    data := dst.GetData()
    channelSize := 640 * 640
    if len(data) < (channelSize * 3) {
        return fmt.Errorf("destination tensor only holds %d floats, needs %d (make sure it's the right shape!)", len(data), channelSize*3)
    }
    redChannel := data[0:channelSize]
    greenChannel := data[channelSize : channelSize*2]
    blueChannel := data[channelSize*2 : channelSize*3]

    // Resize the image to 640x640 using Lanczos3 algorithm
    pic = resize.Resize(640, 640, pic, resize.Lanczos3)
    i := 0
    for y := 0; y < 640; y++ {
        for x := 0; x < 640; x++ {
            r, g, b, _ := pic.At(x, y).RGBA()
            redChannel[i] = float32(r>>8) / 255.0
            greenChannel[i] = float32(g>>8) / 255.0
            blueChannel[i] = float32(b>>8) / 255.0
            i++
        }
    }

    return nil
}

Step 5: Run Session

Run the inference:

e = modelSession.Session.Run()
if e != nil {
    fmt.Printf("Error running ORT session: %s\n", e)

    return 1
}

Step 6: Process Output

Now we need to process the inference results and prepare an output in a human readable format:

boxes := processOutput(modelSession.Output.GetData(), originalWidth,
        originalHeight)
for i, box := range boxes {
    fmt.Printf("Box %d: %s\n", i, &box)
}

(...)

// processOutput converts the raw YOLOv8 output tensor (84x8400 floats,
// flattened row-major) into bounding boxes scaled to the original image
// dimensions.
//
// For each of the 8400 candidate detections the 84 rows hold: center x,
// center y, width, height, then 80 class scores. Candidates whose best
// class score is below 0.5 are dropped, then non-maximum suppression
// removes boxes overlapping an already-kept box with IoU > 0.7.
//
// Fix: the pre-suppression sort was ASCENDING by confidence, which made
// NMS keep the *lowest*-confidence box of each overlapping cluster and
// discard better detections. Standard NMS requires a descending sort.
func processOutput(output []float32, originalWidth,
    originalHeight int) []boundingBox {
    boundingBoxes := make([]boundingBox, 0, 8400)

    var classID int
    var probability float32

    // Iterate through the output array, considering 8400 indices
    for idx := 0; idx < 8400; idx++ {
        // Iterate through 80 classes and find the class with the highest probability
        probability = -1e9
        for col := 0; col < 80; col++ {
            currentProb := output[8400*(col+4)+idx]
            if currentProb > probability {
                probability = currentProb
                classID = col
            }
        }

        // If the probability is less than 0.5, continue to the next index
        if probability < 0.5 {
            continue
        }

        // Extract the center/size box form, convert to corner form, and
        // rescale from the 640x640 network input to the original image.
        xc, yc := output[idx], output[8400+idx]
        w, h := output[2*8400+idx], output[3*8400+idx]
        x1 := (xc - w/2) / 640 * float32(originalWidth)
        y1 := (yc - h/2) / 640 * float32(originalHeight)
        x2 := (xc + w/2) / 640 * float32(originalWidth)
        y2 := (yc + h/2) / 640 * float32(originalHeight)

        // Append the bounding box to the result
        boundingBoxes = append(boundingBoxes, boundingBox{
            label:      yoloClasses[classID],
            confidence: probability,
            x1:         x1,
            y1:         y1,
            x2:         x2,
            y2:         y2,
        })
    }

    // Sort by confidence in descending order so that the suppression
    // loop below keeps the highest-confidence box of each overlapping
    // cluster.
    sort.Slice(boundingBoxes, func(i, j int) bool {
        return boundingBoxes[i].confidence > boundingBoxes[j].confidence
    })

    // Define a slice to hold the final result
    mergedResults := make([]boundingBox, 0, len(boundingBoxes))

    // Iterate through sorted bounding boxes, removing overlaps
    for _, candidateBox := range boundingBoxes {
        overlapsExistingBox := false
        for _, existingBox := range mergedResults {
            if (&candidateBox).iou(&existingBox) > 0.7 {
                overlapsExistingBox = true
                break
            }
        }
        if !overlapsExistingBox {
            mergedResults = append(mergedResults, candidateBox)
        }
    }

    // This will still be in sorted order by confidence
    return mergedResults
}

It will produce output similar to this:

go run main.go
Box 0: Object laptop (confidence 0.524439): (213.599579, 243.196198), (419.911469, 350.581512)
Box 1: Object cup (confidence 0.563491): (433.477356, 257.403839), (571.929077, 355.463074)
Box 2: Object parking meter (confidence 0.578624): (406.172058, 50.842918), (565.424744, 231.428116)

The inference on the Apple M1 Pro (2020) takes about 10 seconds.

Step 7: Draw Output Image with Boxes

In this step we create an output image based on the input image, but with the boxes marking detected objects.

Note:

  • for labels you need to provide the correct path to the font you want to use (see fontPath const at the top of the program)
const (
    outputImagePath = "./output.jpg"
(...)
    fontPath        = "/Library/Fonts/Arial Unicode.ttf"
)

(...)

err := drawBoxes(imagePath, outputImagePath, boxes)
if err != nil {
    fmt.Printf("error drawing boxes: %s\n", err)

    return 1
}

(...)

// Draws bounding boxes with labels onto the image and saves the result
func drawBoxes(inputPath string, outputPath string, boxes []boundingBox) error {
    // Open and decode the image
    f, err := os.Open(inputPath)
    if err != nil {
        return fmt.Errorf("error opening input image: %w", err)
    }
    defer f.Close()

    img, _, err := image.Decode(f)
    if err != nil {
        return fmt.Errorf("error decoding image: %w", err)
    }

    dc := gg.NewContextForImage(img)
    dc.SetLineWidth(1)
    fontLoaded := false
    if err := dc.LoadFontFace(fontPath, 14); err == nil {
        fontLoaded = true
    }

    for _, box := range boxes {
        // Draw rectangle
        dc.SetRGB(1, 0, 0) // red
        dc.DrawRectangle(float64(box.x1), float64(box.y1), float64(box.x2-box.x1), float64(box.y2-box.y1))
        dc.Stroke()

        // Draw label
        if fontLoaded {
            label := fmt.Sprintf("%s (%.2f)", box.label, box.confidence)
            dc.SetRGB(0, 0, 1)
            dc.DrawStringAnchored(label, float64(box.x1)+4, float64(box.y1)-4, 0, 1)
        }
    }

    // Save the result
    out, err := os.Create(outputPath)
    if err != nil {
        return fmt.Errorf("error creating output file: %w", err)
    }
    defer out.Close()

    return jpeg.Encode(out, dc.Image(), &jpeg.Options{Quality: 90})
}

The output should be like this:

Hope you like it! Nice hacking!

Sources

Machine Learning

Part 2 of 5

In this series, I will discuss machine learning concepts and their implementation in the modern world of software architecture. [Series cover photo by JJ Ying on Unsplash]

Up next

How to Run YOLOv5 Inference From Golang with Python API

Connect Golang to a FastAPI-Powered YOLOv5 Inference Server