Gemma 3 for bounding-box segmentation#
In this notebook we use the vision-language model Gemma 3 to test whether it supports drawing bounding boxes around objects.
import openai
from skimage.io import imread
import stackview
from image_utilities import extract_json
from prompt_utilities import prompt_openai
import json
import os
import pandas as pd
from skimage.io import imsave
Bounding box segmentation#
We first load an example dataset, a crop of the human_mitosis image from scikit-image.
import stackview
from skimage import data
import numpy as np
# Load a 100x100-pixel crop of scikit-image's "human mitosis" example
# (bright nuclei on a dark background) and display it interactively.
image = data.human_mitosis()[:100, :100]
stackview.insight(image)
|
|
# Model served locally through Ollama's OpenAI-compatible endpoint.
model = "gemma3:12b"

# Ask the vision model for bounding boxes around all bright blobs,
# using normalized coordinates (image width and height are both 1,
# origin at the bottom-left). Fixed: "accuratly" -> "accurately".
reply = prompt_openai("""
Give me a json object of bounding boxes around ALL bright blobs in this image. Assume the image width and height are 1.
The bottom left is position (0,0), top left is (0,1), top right is (1,1) and bottom right is (1,0).
The format should be like this:
```json
[
{"x":float,"y":float, "width": float, "height": float},
{"x":float,"y":float, "width": float, "height": float},
...
]
```
If you think you can't do this accurately, please try anyway.
""", image, model=model, base_url="http://localhost:11434/v1", api_key=None)
print(reply)
```json
[
{"x":0.11, "y":0.16, "width": 0.08, "height": 0.06},
{"x":0.31, "y":0.26, "width": 0.07, "height": 0.06},
{"x":0.48, "y":0.10, "width": 0.08, "height": 0.07},
{"x":0.18, "y":0.64, "width": 0.07, "height": 0.07},
{"x":0.67, "y":0.29, "width": 0.05, "height": 0.06},
{"x":0.80, "y":0.47, "width": 0.07, "height": 0.05},
{"x":0.37, "y":0.85, "width": 0.09, "height": 0.08},
{"x":0.60, "y":0.72, "width": 0.05, "height": 0.07},
{"x":0.02, "y":0.36, "width": 0.06, "height": 0.06},
{"x":0.77, "y":0.85, "width": 0.07, "height": 0.08},
{"x":0.42, "y":0.48, "width": 0.07, "height": 0.07},
{"x":0.85, "y":0.14, "width": 0.07, "height": 0.06},
{"x":0.24, "y":0.08, "width": 0.06, "height": 0.05},
{"x":0.58, "y":0.55, "width": 0.06, "height": 0.07}
]
```
# Extract the JSON payload from the model's reply (the reply wraps it in
# a ```json ... ``` fence) and parse it into a list of dicts, one per box.
bb = json.loads(extract_json(reply))
bb
[{'x': 0.11, 'y': 0.16, 'width': 0.08, 'height': 0.06},
{'x': 0.31, 'y': 0.26, 'width': 0.07, 'height': 0.06},
{'x': 0.48, 'y': 0.1, 'width': 0.08, 'height': 0.07},
{'x': 0.18, 'y': 0.64, 'width': 0.07, 'height': 0.07},
{'x': 0.67, 'y': 0.29, 'width': 0.05, 'height': 0.06},
{'x': 0.8, 'y': 0.47, 'width': 0.07, 'height': 0.05},
{'x': 0.37, 'y': 0.85, 'width': 0.09, 'height': 0.08},
{'x': 0.6, 'y': 0.72, 'width': 0.05, 'height': 0.07},
{'x': 0.02, 'y': 0.36, 'width': 0.06, 'height': 0.06},
{'x': 0.77, 'y': 0.85, 'width': 0.07, 'height': 0.08},
{'x': 0.42, 'y': 0.48, 'width': 0.07, 'height': 0.07},
{'x': 0.85, 'y': 0.14, 'width': 0.07, 'height': 0.06},
{'x': 0.24, 'y': 0.08, 'width': 0.06, 'height': 0.05},
{'x': 0.58, 'y': 0.55, 'width': 0.06, 'height': 0.07}]
This correction step swaps the x and y coordinates of each box; it is necessary because the model answers in a coordinate system that is transposed relative to the one stackview expects.
# Swap the x and y coordinates of every bounding box: the model's answer
# is transposed relative to the coordinate system stackview expects.
# A tuple assignment swaps in place; the previous version used a scratch
# key b['t'] and never deleted it, leaving a stray 't' entry in every dict.
for b in bb:
    b['x'], b['y'] = b['y'], b['x']
bb
[{'x': 0.16, 'y': 0.11, 'width': 0.08, 'height': 0.06, 't': 0.11},
{'x': 0.26, 'y': 0.31, 'width': 0.07, 'height': 0.06, 't': 0.31},
{'x': 0.1, 'y': 0.48, 'width': 0.08, 'height': 0.07, 't': 0.48},
{'x': 0.64, 'y': 0.18, 'width': 0.07, 'height': 0.07, 't': 0.18},
{'x': 0.29, 'y': 0.67, 'width': 0.05, 'height': 0.06, 't': 0.67},
{'x': 0.47, 'y': 0.8, 'width': 0.07, 'height': 0.05, 't': 0.8},
{'x': 0.85, 'y': 0.37, 'width': 0.09, 'height': 0.08, 't': 0.37},
{'x': 0.72, 'y': 0.6, 'width': 0.05, 'height': 0.07, 't': 0.6},
{'x': 0.36, 'y': 0.02, 'width': 0.06, 'height': 0.06, 't': 0.02},
{'x': 0.85, 'y': 0.77, 'width': 0.07, 'height': 0.08, 't': 0.77},
{'x': 0.48, 'y': 0.42, 'width': 0.07, 'height': 0.07, 't': 0.42},
{'x': 0.14, 'y': 0.85, 'width': 0.07, 'height': 0.06, 't': 0.85},
{'x': 0.08, 'y': 0.24, 'width': 0.06, 'height': 0.05, 't': 0.24},
{'x': 0.55, 'y': 0.58, 'width': 0.06, 'height': 0.07, 't': 0.58}]
# Overlay the bounding boxes on the image with stackview and display the
# result for visual inspection.
new_image = stackview.add_bounding_boxes(image, bb)
new_image
|
|