{
"cells": [
{
"cell_type": "markdown",
"id": "1c7a3958-a52b-4446-8506-94f0c98863f6",
"metadata": {},
"source": [
"# Moondream for bounding-box segmentation\n",
"\n",
"In this notebook we will use the vision language model [moondream](https://huggingface.co/vikhyatk/moondream2) to determine bounding-boxes around objects."
]
},
{
"cell_type": "markdown",
"id": "38b84c3f-7a1d-4324-af3b-bb3be13f119f",
"metadata": {},
"source": [
"Installation (Windows):\n",
"* Download vips-dev-w64-all-8.16.1.zip from [here](https://github.com/libvips/build-win64-mxe/releases/tag/v8.16.1), unzip it, and add its subfolder `bin` to the PATH environment variable.\n",
"* `pip install einops pyvips`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ab4408ba-b33f-46d0-89cb-59133378f1ae",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from PIL import Image\n",
"from image_utilities import numpy_to_bytestream, extract_json, generate_spots\n",
"from tqdm import tqdm\n",
"import stackview\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" \"vikhyatk/moondream2\",\n",
" revision=\"2025-04-14\",\n",
" trust_remote_code=True,\n",
" # Comment to run on CPU. To use the GPU, you need about 5 GB of GPU Memory.\n",
" device_map={\"\": \"cuda\"}\n",
")"
]
},
{
"cell_type": "markdown",
"id": "18c7adae-d56b-452b-bba9-aed363bc831d",
"metadata": {},
"source": [
"## Human mitosis\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "22263f34-6be6-4a8f-9bbb-a5e138dacee4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"\n",
" \n",
" | \n",
"\n",
"\n",
"\n",
"shape | (100, 100) | \n",
"dtype | uint8 | \n",
"size | 9.8 kB | \n",
"min | 7 | max | 88 | \n",
" \n",
" \n",
" | \n",
"
\n",
"
"
],
"text/plain": [
"[[ 8 8 8 ... 10 9 9]\n",
" [ 8 8 7 ... 10 11 10]\n",
" [ 9 8 8 ... 9 10 9]\n",
" ...\n",
" [ 9 8 9 ... 9 9 8]\n",
" [ 9 8 8 ... 9 9 9]\n",
" [ 8 8 9 ... 10 9 9]]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import stackview\n",
"from skimage import data\n",
"import numpy as np\n",
"\n",
"# Load the human mitosis dataset\n",
"image = data.human_mitosis()[:100, :100]\n",
"\n",
"# Display the image\n",
"stackview.insight(image)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2479e515-c803-4577-a9b0-ef2a2b54a8d2",
"metadata": {},
"outputs": [],
"source": [
"pil_image = Image.fromarray(image)\n",
"\n",
"encoded_image = model.encode_image(pil_image)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dee1257a-b0f0-41b9-933f-cc40af031b47",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 11 bright spot(s)\n"
]
}
],
"source": [
"bb = model.detect(encoded_image, \"Mark all the bright blobs individually\")[\"objects\"]\n",
"print(f\"Found {len(bb)} bright spot(s)\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "46a9196b-3f4a-4863-9a3f-34432b57335f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'x_min': 0.2584344260394573,\n",
" 'y_min': 0.0017580389976501465,\n",
" 'x_max': 0.3743780739605427,\n",
" 'y_max': 0.09199196100234985,\n",
" 'x': 0.2584344260394573,\n",
" 'y': 0.0017580389976501465,\n",
" 'width': 0.11594364792108536,\n",
" 'height': 0.09023392200469971},\n",
" {'x_min': 0.42661894857883453,\n",
" 'y_min': 0.0029355064034461975,\n",
" 'x_max': 0.5265060514211655,\n",
" 'y_max': 0.1220644935965538,\n",
" 'x': 0.42661894857883453,\n",
" 'y': 0.0029355064034461975,\n",
" 'width': 0.09988710284233093,\n",
" 'height': 0.1191289871931076}]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for b in bb:\n",
" b[\"x\"] = b[\"x_min\"]\n",
" b[\"y\"] = b[\"y_min\"]\n",
" b[\"width\"] = b[\"x_max\"]-b[\"x_min\"]\n",
" b[\"height\"] = b[\"y_max\"]-b[\"y_min\"]\n",
"bb[:2]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "602ecc12-4c6f-4b36-8542-a04ba10df765",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
" \n",
" | \n",
"\n",
"\n",
"\n",
"shape | (100, 100, 3) | \n",
"dtype | uint8 | \n",
"size | 29.3 kB | \n",
"min | 0 | max | 255 | \n",
" \n",
" \n",
" | \n",
"
\n",
"
"
],
"text/plain": [
"[[[ 3 3 3]\n",
" [ 3 3 3]\n",
" [ 3 3 3]\n",
" ...\n",
" [ 9 9 9]\n",
" [ 6 6 6]\n",
" [ 6 6 6]]\n",
"\n",
" [[ 3 3 3]\n",
" [ 3 3 3]\n",
" [ 0 0 0]\n",
" ...\n",
" [ 9 9 9]\n",
" [12 12 12]\n",
" [ 9 9 9]]\n",
"\n",
" [[ 6 6 6]\n",
" [ 3 3 3]\n",
" [ 3 3 3]\n",
" ...\n",
" [ 6 6 6]\n",
" [ 9 9 9]\n",
" [ 6 6 6]]\n",
"\n",
" ...\n",
"\n",
" [[ 6 6 6]\n",
" [ 3 3 3]\n",
" [ 6 6 6]\n",
" ...\n",
" [ 6 6 6]\n",
" [ 6 6 6]\n",
" [ 3 3 3]]\n",
"\n",
" [[ 6 6 6]\n",
" [ 3 3 3]\n",
" [ 3 3 3]\n",
" ...\n",
" [ 6 6 6]\n",
" [ 6 6 6]\n",
" [ 6 6 6]]\n",
"\n",
" [[ 3 3 3]\n",
" [ 3 3 3]\n",
" [ 6 6 6]\n",
" ...\n",
" [ 9 9 9]\n",
" [ 6 6 6]\n",
" [ 6 6 6]]]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stackview.add_bounding_boxes(image, bb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ab631d8-e2b2-47fd-96a9-22a32b3cfa7d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}