Build Multimodal Image Search with Alibaba Cloud Milvus and Qwen VL
This tutorial shows how to combine Alibaba Cloud Milvus vector search with the Qwen VL large model to extract image descriptions, generate multimodal embeddings, store them in Milvus, and perform fast multimodal searches such as text‑to‑image and image‑to‑text queries using Python.
This article demonstrates how to use Alibaba Cloud's vector search service (Milvus edition) together with the Qwen VL large model to extract image features, generate multimodal embeddings, and quickly implement multimodal search.
Prerequisites
Milvus instance created (see the quick‑create guide).
DashScope service enabled and an API‑KEY obtained (see the DashScope guide).
Step 01: Install Dependencies
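python3 -m pip install dashscope pymilvus==2.5.0
wget https://github.com/milvus-io/pymilvus-assets/releases/download/imagedata/reverse_image_search.zip
unzip -q -o reverse_image_search.zip
The example runs on Python 3.9. The code below also imports pandas and tqdm; install them as well if they are not already available in your environment.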
Step 02: Extract Image Descriptions and Vectorize
First, the example extracts a textual description for each image using the Qwen VL model, then converts both the description and the image into vectors via a multimodal embedding model.
import base64, csv, dashscope, os, pandas as pd, sys, time
from tqdm import tqdm
from pymilvus import (connections, FieldSchema, CollectionSchema, DataType, Collection, MilvusException, utility)
from http import HTTPStatus
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class FeatureExtractor:
    """Convert an image file or a text string into a multimodal embedding.

    Each call sends a single item to DashScope's ``multimodal-embedding-v1``
    model and returns the embedding of that item.
    """

    _MODEL = "multimodal-embedding-v1"
    # File extensions that differ from the registered image MIME subtype.
    # Without this, "photo.jpg" would be sent as the non-standard "image/jpg".
    _SUBTYPE_ALIASES = {"jpg": "jpeg", "jpe": "jpeg"}

    def __init__(self, DASHSCOPE_API_KEY):
        """Store the DashScope API key used for every embedding request."""
        self._api_key = DASHSCOPE_API_KEY

    @classmethod
    def _image_to_data_uri(cls, image_path):
        """Read *image_path* and return it as a ``data:image/...;base64,`` URI.

        The image subtype is derived from the lower-cased file extension,
        with common aliases (``jpg`` -> ``jpeg``) normalised.
        """
        _, ext = os.path.splitext(image_path)
        image_format = ext.lstrip(".").lower()
        image_format = cls._SUBTYPE_ALIASES.get(image_format, image_format)
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        return f"data:image/{image_format};base64,{base64_image}"

    def __call__(self, input_data, input_type):
        """Embed one image or one piece of text.

        Args:
            input_data: Path of a local image file when ``input_type`` is
                ``"image"``; the text itself when ``input_type`` is ``"text"``.
            input_type: Either ``"image"`` or ``"text"``.

        Returns:
            The ``embedding`` value of the first entry in the response's
            ``embeddings`` list (presumably a list of floats; confirm against
            the DashScope response schema).

        Raises:
            ValueError: If ``input_type`` is neither ``"image"`` nor ``"text"``.
            RuntimeError: If the service answers with a non-OK status code.
            Exception: Any other failure (for example an unreadable file) is
                logged and re-raised unchanged.
        """
        if input_type not in ("image", "text"):
            raise ValueError("Invalid input type. Must be 'image' or 'text'.")
        try:
            if input_type == "image":
                payload = [{"image": self._image_to_data_uri(input_data)}]
            else:
                payload = [{"text": input_data}]
            resp = dashscope.MultiModalEmbedding.call(
                model=self._MODEL, input=payload, api_key=self._api_key
            )
            if resp.status_code != HTTPStatus.OK:
                raise RuntimeError(
                    f"API call failed, status: {resp.status_code}, message: {resp.message}"
                )
            return resp.output["embeddings"][0]["embedding"]
        except Exception as e:
            # Log here so the failing input is visible, then let the caller decide.
            logger.error("Processing failed: %s", e)
            raise
class FeatureExtractorVL:
    """Ask the ``qwen-vl-plus`` model for a short description of an image."""

    _MODEL = "qwen-vl-plus"
    _SYSTEM_PROMPT = "You are a helpful assistant."
    # Chinese prompt: "First describe this picture in at most 50 characters,
    # then give 5 keywords."
    _USER_PROMPT = "先用50字内的文字描述这张图片,然后再给出5个关键词"

    def __init__(self, DASHSCOPE_API_KEY):
        """Store the DashScope API key used for every conversation request."""
        self._api_key = DASHSCOPE_API_KEY

    def __call__(self, input_data, input_type):
        """Return the model's textual description of one image.

        Args:
            input_data: Image reference, passed unchanged to the model as the
                ``image`` item of the user message.
            input_type: Must be ``"image"``; kept as a parameter so the call
                signature mirrors ``FeatureExtractor``.

        Returns:
            The ``text`` of the first content item of the first choice in the
            response.

        Raises:
            ValueError: If ``input_type`` is not ``"image"``.
            RuntimeError: If the service answers with a non-OK status code.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        if input_type not in ("image",):
            raise ValueError("Invalid input type. Must be 'image'.")
        try:
            payload = [
                {"role": "system", "content": [{"type": "text", "text": self._SYSTEM_PROMPT}]},
                {"role": "user", "content": [{"image": input_data}, {"text": self._USER_PROMPT}]},
            ]
            resp = dashscope.MultiModalConversation.call(
                model=self._MODEL, messages=payload, api_key=self._api_key
            )
            if resp.status_code != HTTPStatus.OK:
                raise RuntimeError(
                    f"API call failed, status: {resp.status_code}, message: {resp.message}"
                )
            return resp.output["choices"][0]["message"].content[0]["text"]
        except Exception as e:
            logger.error("Processing failed: %s", e)
            raise

# Step 03: Data Preparation
Initialize the Milvus client, create the collection if it does not exist, and insert the generated embeddings.
class MilvusClient:
    """Small wrapper around one Milvus collection of image/text embeddings.

    Constructing an instance connects to Milvus and creates the collection
    (with an index on both vector fields) when it does not exist yet.
    NOTE: this class is local to the tutorial; it is unrelated to
    ``pymilvus.MilvusClient``.
    """

    # Used for both index building and searching; the two must agree.
    _METRIC_TYPE = "L2"
    _VECTOR_FIELDS = ("image_embedding", "text_embedding")
    _OUTPUT_FIELDS = ("origin", "image_description")
    # Lower bound for the HNSW ``ef`` search parameter.
    _MIN_HNSW_EF = 10

    def __init__(self, MILVUS_TOKEN, MILVUS_HOST, MILVUS_PORT, INDEX, COLLECTION_NAME):
        """Connect to Milvus and make sure the collection exists.

        Args:
            MILVUS_TOKEN: Token passed to ``connections.connect``.
            MILVUS_HOST: Milvus host.
            MILVUS_PORT: Milvus port.
            INDEX: ``'IVF_FLAT'`` selects an IVF_FLAT index; any other value
                selects HNSW.
            COLLECTION_NAME: Name of the collection to create or open.
        """
        self._token = MILVUS_TOKEN
        self._host = MILVUS_HOST
        self._port = MILVUS_PORT
        self._index = INDEX
        self._collection_name = COLLECTION_NAME
        self._connect()
        self._create_collection_if_not_exists()

    def _connect(self):
        """Open the ``default`` connection; exit the process on failure."""
        try:
            connections.connect(alias="default", host=self._host, port=self._port, token=self._token)
            logger.info("Connected to Milvus successfully.")
        except Exception as e:
            logger.error("Failed to connect Milvus: %s", e)
            sys.exit(1)

    def _collection_exists(self):
        """Return True when the collection name is already known to Milvus."""
        return self._collection_name in utility.list_collections()

    def _create_collection_if_not_exists(self):
        """Create and index the collection, or open it if it already exists.

        Exits the process when creation or opening fails.
        """
        try:
            if not self._collection_exists():
                fields = [
                    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                    FieldSchema(name="origin", dtype=DataType.VARCHAR, max_length=512),
                    FieldSchema(name="image_description", dtype=DataType.VARCHAR, max_length=1024),
                    FieldSchema(name="image_embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),
                    FieldSchema(name="text_embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),
                ]
                schema = CollectionSchema(fields)
                self._collection = Collection(self._collection_name, schema)
                if self._index == 'IVF_FLAT':
                    self._create_ivf_index()
                else:
                    self._create_hnsw_index()
                logger.info("Collection created successfully.")
            else:
                self._collection = Collection(self._collection_name)
                logger.info("Collection already exists.")
        except Exception as e:
            logger.error("Failed to create or load collection: %s", e)
            sys.exit(1)

    def _create_index(self, index_type, build_params):
        """Build the same index on every vector field of the collection."""
        index_params = {
            "index_type": index_type,
            "params": build_params,
            "metric_type": self._METRIC_TYPE,
        }
        for vector_field in self._VECTOR_FIELDS:
            self._collection.create_index(vector_field, index_params)
        logger.info("Index created successfully.")

    def _create_ivf_index(self):
        """Index both vector fields with IVF_FLAT (nlist=1024)."""
        self._create_index("IVF_FLAT", {"nlist": 1024})

    def _create_hnsw_index(self):
        """Index both vector fields with HNSW (M=64, efConstruction=100)."""
        self._create_index("HNSW", {"M": 64, "efConstruction": 100})

    def insert(self, data):
        """Insert *data* into the collection and load the collection.

        Raises:
            MilvusException: Logged and re-raised when Milvus rejects the call.
        """
        try:
            self._collection.insert(data)
            self._collection.load()
            logger.info("Data inserted and loaded successfully.")
        except MilvusException as e:
            logger.error("Insert failed: %s", e)
            raise

    def _search_param(self, limit):
        """Return the search parameters matching the configured index type."""
        if self._index == 'IVF_FLAT':
            params = {"nprobe": 10}
        else:
            # HNSW requires ef >= top_k, so grow ef with the requested limit
            # instead of failing for limit > 10.
            params = {"ef": max(self._MIN_HNSW_EF, limit)}
        return {"metric_type": self._METRIC_TYPE, "params": params}

    def search(self, query_embedding, field, limit=3):
        """Run a vector search for a single query embedding.

        Args:
            query_embedding: The query vector.
            field: Name of the vector field to search.
            limit: Maximum number of hits to return.

        Returns:
            A list of dicts with the keys ``id``, ``distance``, ``origin`` and
            ``image_description``, or ``None`` when the search fails (the
            error is logged, not raised).
        """
        try:
            result = self._collection.search(
                data=[query_embedding],
                anns_field=field,
                param=self._search_param(limit),
                limit=limit,
                output_fields=list(self._OUTPUT_FIELDS),
            )
            # Read output fields through the documented entity accessor rather
            # than attribute access on the hit.
            return [
                {
                    "id": hit.id,
                    "distance": hit.distance,
                    "origin": hit.entity.get("origin"),
                    "image_description": hit.entity.get("image_description"),
                }
                for hit in result[0]
            ]
        except Exception as e:
            logger.error("Search failed: %s", e)
            return None
def load_image_embeddings(extractor, extractorVL, csv_path, max_images=200, delay_seconds=1):
    """Describe and embed the images listed in a CSV file.

    For every image path the description model is called first; the image and
    its description are then embedded separately. Images that fail at any
    step are logged and skipped.

    Args:
        extractor: Callable ``(input_data, input_type)`` returning an
            embedding; called with ``"image"`` and with ``"text"``.
        extractorVL: Callable ``(image_path, "image")`` returning a text
            description.
        csv_path: CSV file with a ``path`` column of image paths.
        max_images: Only the first ``max_images`` paths are processed;
            ``None`` processes all of them.
        delay_seconds: Pause after each image to control the API call rate.

    Returns:
        A list of dicts with the keys ``origin``, ``image_description``,
        ``image_embedding`` and ``text_embedding``, one per distinct path that
        was processed successfully.
    """
    image_paths = pd.read_csv(csv_path)["path"].tolist()
    if max_images is not None:
        image_paths = image_paths[:max_images]

    # Keyed by path so a path listed twice yields a single row.
    rows_by_path = {}
    for image_path in tqdm(image_paths, desc="Generating image embeddings"):
        try:
            description = extractorVL(image_path, "image")
            rows_by_path[image_path] = {
                "origin": image_path,
                "image_description": description,
                "image_embedding": extractor(image_path, "image"),
                "text_embedding": extractor(description, "text"),
            }
        except Exception as e:
            logger.warning("Failed processing %s: %s", image_path, e)
        # Throttle after failures too: a rate-limit error is exactly when
        # the next request should not be fired immediately.
        time.sleep(delay_seconds)
    return list(rows_by_path.values())

# After creating the collection, you can view its schema in the Attu console:
Step 04: Multimodal Search – Text to Image
Query the text "棕色的狗" ("brown dog"), embed it with the multimodal model, and search both the image_embedding and text_embedding fields.
# Embed the query text ("brown dog" in Chinese) with the multimodal model.
text_query = "棕色的狗"
text_embedding = extractor(text_query, "text")

# Text-to-image: compare the text embedding against the stored image embeddings.
text_results_1 = milvus_client.search(text_embedding, field='image_embedding')
logger.info(f"Text‑to‑image results: {text_results_1}")

# Text-to-text: compare it against the embeddings of the stored descriptions.
text_results_2 = milvus_client.search(text_embedding, field='text_embedding')
logger.info(f"Text‑to‑text results: {text_results_2}")

# Sample results (may vary due to model randomness):
{'id': 457336885198973657, 'distance': 1.3388, 'origin': './train/Rhodesian_ridgeback/n02087394_9675.JPEG', 'image_description': '一张小狗站在地毯上的照片。它有着棕色的毛发和蓝色的眼睛。关键词:小狗、地毯、眼睛、毛色、站立'}
Step 05: Multimodal Search – Image to Text
Use a test lion image as the query, embed it, and perform both image‑to‑image and image‑to‑text searches.
# Embed the query image with the multimodal model.
image_query_path = "./test/lion/n02129165_13728.JPEG"
image_embedding = extractor(image_query_path, "image")

# Image-to-image: compare the image embedding against the stored image embeddings.
image_results_1 = milvus_client.search(image_embedding, field='image_embedding')
logger.info(f"Image‑to‑image results: {image_results_1}")

# Image-to-text: compare it against the embeddings of the stored descriptions.
image_results_2 = milvus_client.search(image_embedding, field='text_embedding')
logger.info(f"Image‑to‑text results: {image_results_2}")

# Result screenshots:
The tutorial concludes with a full Python script that ties together Milvus client initialization, feature extraction, embedding insertion, and both text‑to‑image and image‑to‑image search examples.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Alibaba Cloud Big Data AI Platform
The Alibaba Cloud Big Data AI Platform builds on Alibaba’s leading cloud infrastructure, big‑data and AI engineering capabilities, scenario algorithms, and extensive industry experience to offer enterprises and developers a one‑stop, cloud‑native big‑data and AI capability suite. It boosts AI development efficiency, enables large‑scale AI deployment across industries, and drives business value.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
