{ "cells": [ { "cell_type": "markdown", "id": "1c7a3958-a52b-4446-8506-94f0c98863f6", "metadata": {}, "source": [ "# Moondream for bounding-box segmentation\n", "\n", "In this notebook we will use the vision-language model [moondream](https://huggingface.co/vikhyatk/moondream2) to determine bounding boxes around objects." ] }, { "cell_type": "markdown", "id": "38b84c3f-7a1d-4324-af3b-bb3be13f119f", "metadata": {}, "source": [ "Installation (Windows):\n", "* Download vips-dev-w64-all-8.16.1.zip from [here](https://github.com/libvips/build-win64-mxe/releases/tag/v8.16.1), unzip it, and add its subfolder `bin` to the PATH environment variable.\n", "* Run `pip install einops pyvips`." ] }, { "cell_type": "code", "execution_count": 1, "id": "ab4408ba-b33f-46d0-89cb-59133378f1ae", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "from PIL import Image\n", "from image_utilities import numpy_to_bytestream, extract_json, generate_spots\n", "from tqdm import tqdm\n", "import stackview\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\n", " \"vikhyatk/moondream2\",\n", " revision=\"2025-04-14\",\n", " trust_remote_code=True,\n", " # Comment out the next line to run on CPU. Running on the GPU requires about 5 GB of GPU memory.\n", " device_map={\"\": \"cuda\"}\n", ")" ] }, { "cell_type": "markdown", "id": "18c7adae-d56b-452b-bba9-aed363bc831d", "metadata": {}, "source": [ "## Human mitosis\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "22263f34-6be6-4a8f-9bbb-a5e138dacee4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
shape(100, 100)
dtypeuint8
size9.8 kB
min7
max88
\n", "\n", "
" ], "text/plain": [ "[[ 8 8 8 ... 10 9 9]\n", " [ 8 8 7 ... 10 11 10]\n", " [ 9 8 8 ... 9 10 9]\n", " ...\n", " [ 9 8 9 ... 9 9 8]\n", " [ 9 8 8 ... 9 9 9]\n", " [ 8 8 9 ... 10 9 9]]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import stackview\n", "from skimage import data\n", "import numpy as np\n", "\n", "# Load the human mitosis dataset\n", "image = data.human_mitosis()[:100, :100]\n", "\n", "# Display the image\n", "stackview.insight(image)" ] }, { "cell_type": "code", "execution_count": 3, "id": "2479e515-c803-4577-a9b0-ef2a2b54a8d2", "metadata": {}, "outputs": [], "source": [ "pil_image = Image.fromarray(image)\n", "\n", "encoded_image = model.encode_image(pil_image)" ] }, { "cell_type": "code", "execution_count": 4, "id": "dee1257a-b0f0-41b9-933f-cc40af031b47", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 11 bright spot(s)\n" ] } ], "source": [ "bb = model.detect(encoded_image, \"Mark all the bright blobs individually\")[\"objects\"]\n", "print(f\"Found {len(bb)} bright spot(s)\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "46a9196b-3f4a-4863-9a3f-34432b57335f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'x_min': 0.2584344260394573,\n", " 'y_min': 0.0017580389976501465,\n", " 'x_max': 0.3743780739605427,\n", " 'y_max': 0.09199196100234985,\n", " 'x': 0.2584344260394573,\n", " 'y': 0.0017580389976501465,\n", " 'width': 0.11594364792108536,\n", " 'height': 0.09023392200469971},\n", " {'x_min': 0.42661894857883453,\n", " 'y_min': 0.0029355064034461975,\n", " 'x_max': 0.5265060514211655,\n", " 'y_max': 0.1220644935965538,\n", " 'x': 0.42661894857883453,\n", " 'y': 0.0029355064034461975,\n", " 'width': 0.09988710284233093,\n", " 'height': 0.1191289871931076}]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add x/y/width/height keys derived from each box's corner coordinates\n", "# (x_min, y_min, x_max, y_max); the values stay in the same units.\n", "for box in bb:\n", "    box.update(\n", "        x=box[\"x_min\"],\n", "        y=box[\"y_min\"],\n", "        width=box[\"x_max\"] - box[\"x_min\"],\n", "        height=box[\"y_max\"] - box[\"y_min\"],\n", "    )\n", "bb[:2]" ] }, { "cell_type": "code", "execution_count": 6, "id": "602ecc12-4c6f-4b36-8542-a04ba10df765", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
shape(100, 100, 3)
dtypeuint8
size29.3 kB
min0
max255
\n", "\n", "
" ], "text/plain": [ "[[[ 3 3 3]\n", " [ 3 3 3]\n", " [ 3 3 3]\n", " ...\n", " [ 9 9 9]\n", " [ 6 6 6]\n", " [ 6 6 6]]\n", "\n", " [[ 3 3 3]\n", " [ 3 3 3]\n", " [ 0 0 0]\n", " ...\n", " [ 9 9 9]\n", " [12 12 12]\n", " [ 9 9 9]]\n", "\n", " [[ 6 6 6]\n", " [ 3 3 3]\n", " [ 3 3 3]\n", " ...\n", " [ 6 6 6]\n", " [ 9 9 9]\n", " [ 6 6 6]]\n", "\n", " ...\n", "\n", " [[ 6 6 6]\n", " [ 3 3 3]\n", " [ 6 6 6]\n", " ...\n", " [ 6 6 6]\n", " [ 6 6 6]\n", " [ 3 3 3]]\n", "\n", " [[ 6 6 6]\n", " [ 3 3 3]\n", " [ 3 3 3]\n", " ...\n", " [ 6 6 6]\n", " [ 6 6 6]\n", " [ 6 6 6]]\n", "\n", " [[ 3 3 3]\n", " [ 3 3 3]\n", " [ 6 6 6]\n", " ...\n", " [ 9 9 9]\n", " [ 6 6 6]\n", " [ 6 6 6]]]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stackview.add_bounding_boxes(image, bb)" ] }, { "cell_type": "code", "execution_count": null, "id": "8ab631d8-e2b2-47fd-96a9-22a32b3cfa7d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }