Image Similarity Detection Methods: Hashing, Histograms, Feature Matching, BOW+K‑Means, and CNN‑Based Approaches
This article reviews common image similarity detection techniques—including hash-based methods (aHash, pHash, dHash), histogram comparison, feature matching with ORB and SIFT/SURF, bag‑of‑words with K‑Means, and CNN‑based VGG16 features—detailing their algorithms, Python implementations, performance characteristics, and practical considerations.
Background: Image‑based search is widely used for price comparison, plant identification, trademark infringement detection, etc. Similarity is measured by encoding each image into a numeric representation (fingerprint) and ranking database images by distance to the query fingerprint.
Hash Algorithms
Hashing creates a compact binary fingerprint for each image. Three common hashes are described:
def aHash(img):
    """Average hash: 8x8 grayscale thresholded at its mean -> 64-char bit string."""
    img = cv2.resize(img, (8, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Each pixel contributes 1 if it is above the mean intensity, else 0.
    np_mean = np.mean(gray)
    ahash_01 = (gray > np_mean) + 0
    ahash_list = ahash_01.reshape(1, -1)[0].tolist()
    return ''.join(str(x) for x in ahash_list)


def pHash(img):
    """Perceptual hash: DCT of a 32x32 grayscale image; the top-left 8x8
    low-frequency block is thresholded at its mean -> 64-char bit string."""
    img = cv2.resize(img, (32, 32))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    dct = cv2.dct(np.float32(gray))
    # Low-frequency coefficients carry most of the perceptual information.
    dct_roi = dct[0:8, 0:8]
    average = np.mean(dct_roi)  # fixed typo: was `avreage`
    phash_01 = (dct_roi > average) + 0
    phash_list = phash_01.reshape(1, -1)[0].tolist()
    return ''.join(str(x) for x in phash_list)


def dHash(img):
    """Difference hash: 9x8 grayscale, adjacent-column brightness comparison
    (8 rows x 8 column pairs) -> 64-char bit string."""
    img = cv2.resize(img, (9, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    bits = []
    for i in range(8):
        # True where column i is brighter than column i+1 (one bool per row).
        bits.append(gray[:, i] > gray[:, i + 1])
    bit_matrix = np.array(bits) + 0
    flat = bit_matrix.T.reshape(1, -1)[0].tolist()
    return ''.join(str(x) for x in flat)

# Similarity is finally computed by Hamming distance between two hash strings:
def hammingDist(hashstr1, hashstr2):
    """Return the Hamming distance between two equal-length hash strings.

    The distance is the number of positions at which the characters differ;
    0 means the hashes are identical.

    Raises:
        ValueError: if the two strings have different lengths.
    """
    if len(hashstr1) != len(hashstr2):
        # `assert` is stripped under `python -O`; validate explicitly instead.
        raise ValueError("hash strings must have equal length")
    # Generator avoids materializing an intermediate list.
    return sum(ch1 != ch2 for ch1, ch2 in zip(hashstr1, hashstr2))

# Histogram-Based Comparison
Single‑channel and three‑channel histograms are normalized and compared using a custom degree metric.
def calculate_single(img1, img2):
    """Compare two single-channel images via normalized 256-bin histograms.

    Returns a similarity degree in [0, 1]; 1 means identical histograms.
    """
    hist1 = cv2.calcHist([img1], [0], None, [256], [0.0, 255.0])
    hist1 = cv2.normalize(hist1, hist1, 0, 1, cv2.NORM_MINMAX, -1)
    hist2 = cv2.calcHist([img2], [0], None, [256], [0.0, 255.0])
    hist2 = cv2.normalize(hist2, hist2, 0, 1, cv2.NORM_MINMAX, -1)
    degree = 0
    for i in range(len(hist1)):
        if hist1[i] != hist2[i]:
            # Penalize each differing bin by its relative difference.
            degree += (1 - abs(hist1[i] - hist2[i]) / max(hist1[i], hist2[i]))
        else:
            degree += 1
    return degree / len(hist1)


def classify_hist_of_three(img1, img2, size=(256, 256)):
    """Average the per-channel histogram similarity of two BGR images.

    Both images are resized to `size` so their histograms are comparable.
    """
    image1 = cv2.resize(img1, size)
    image2 = cv2.resize(img2, size)
    # BUG FIX: split the *resized* images (the original split img1/img2 and
    # discarded the resize) and use the names defined above (the original
    # then read undefined `sub_img1`/`sub_img2`, a NameError).
    sub_image1 = cv2.split(image1)
    sub_image2 = cv2.split(image2)
    sub_data = 0
    for im1, im2 in zip(sub_image1, sub_image2):
        sub_data += calculate_single(im1, im2)
    return sub_data / 3

# Feature Extraction & Matching
ORB provides fast binary descriptors; SIFT/SURF give scale‑invariant keypoints.
def ORB_img_similarity(img1_path, img2_path):
    """ORB keypoint similarity between two image files.

    Returns the fraction of knn matches that pass Lowe's ratio test (0.8),
    or 0 when either image yields no descriptors or no matches are found.
    """
    orb = cv2.ORB_create()
    img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
    img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)
    kp1, des1 = orb.detectAndCompute(img1, None)
    kp2, des2 = orb.detectAndCompute(img2, None)
    # detectAndCompute returns None for featureless images; nothing can match.
    if des1 is None or des2 is None:
        return 0
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.knnMatch(des1, trainDescriptors=des2, k=2)
    if not matches:
        return 0  # avoid ZeroDivisionError below
    # Lowe's ratio test; knnMatch may return fewer than 2 neighbors per query,
    # so guard the pair length before unpacking.
    good = [pair[0] for pair in matches
            if len(pair) == 2 and pair[0].distance < 0.8 * pair[1].distance]
    return len(good) / len(matches)


def sift_similarity(img1_path, img2_path):
    """SIFT keypoint similarity between two image files using FLANN matching.

    Returns the fraction of knn matches passing Lowe's ratio test (0.8),
    or 0 when either image yields no descriptors or no matches are found.
    """
    sift = cv2.xfeatures2d.SIFT_create()
    FLANN_INDEX_KDTREE = 0
    indexParams = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    searchParams = dict(checks=50)
    flann = cv2.FlannBasedMatcher(indexParams, searchParams)
    # BUG FIX: read the images named by the parameters; the original ignored
    # them and referenced undefined `samplePath`/`queryImage` (NameError).
    img1 = cv2.imread(img1_path, 0)
    img2 = cv2.imread(img2_path, 0)
    kp1, des1 = sift.detectAndCompute(img1, None)
    kp2, des2 = sift.detectAndCompute(img2, None)
    if des1 is None or des2 is None:
        return 0
    matches = flann.knnMatch(des1, des2, k=2)
    if not matches:
        return 0
    good = [pair[0] for pair in matches
            if len(pair) == 2 and pair[0].distance < 0.8 * pair[1].distance]
    return len(good) / len(matches)

# Bag-of-Words + K-Means
Images are represented as visual word histograms built from SIFT descriptors clustered by K‑Means; TF‑IDF weighting and L2 normalization improve discrimination.
# Extract SIFT descriptors for every image in the directory.
# NOTE(review): `dir`, `count`, `sift_det`, `numWords` and `image_paths` are
# defined earlier in the original script — confirm they stay in scope.
des_list = []
filelist = os.listdir(dir)
trainNum = int(count / 3)  # NOTE(review): computed but never used below — confirm intent
for name in filelist:
    # os.path.join is portable; the original hard-coded a Windows '\\' separator.
    filename = os.path.join(dir, name)
    img = cv2.imread(filename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    kp, des = sift_det.detectAndCompute(gray, None)
    # BUG FIX: store the path that was actually read; the original appended
    # an undefined `image_path` (NameError).
    des_list.append((filename, des))
# Build vocabulary: stack all descriptors once (the original re-vstacked in a
# loop, which is O(n^2) in copies) and cluster them into visual words.
descriptors = np.vstack([d for _, d in des_list])
voc, variance = kmeans(descriptors, numWords, 1)
# Compute a visual-word occurrence histogram for each image.
im_features = np.zeros((len(image_paths), numWords), "float32")
for i in range(len(image_paths)):
    words, distance = vq(des_list[i][1], voc)
    for w in words:
        im_features[i][w] += 1
# TF-IDF weighting: down-weight visual words that occur in many images.
nbr_occurences = np.sum((im_features > 0) * 1, axis=0)
idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')
im_features = im_features * idf
# L2-normalize so dot products behave like cosine similarity.
im_features = preprocessing.normalize(im_features, norm='l2')
joblib.dump((im_features, image_paths, idf, numWords, voc), "bow.pkl", compress=3)

# CNN-Based Similarity (VGG16)
Features are extracted from the 7th fully‑connected layer of a VGG16 network; a sigmoid‑activated 128‑bit vector is used for binary retrieval, while the 4096‑dimensional vector serves for distance‑based re‑ranking.
# --- Offline: extract a VGG16 feature per database image and persist to HDF5 ---
database = 'dataset'
index = 'models/vgg_featureCNN.h5'
img_list = get_imlist(database)
features = []
names = []
model = VGGNet()
for i, img_path in enumerate(img_list):
    norm_feat = model.vgg_extract_feat(img_path)
    img_name = os.path.split(img_path)[1]
    features.append(norm_feat)
    names.append(img_name)
feats = np.array(features)
output = index
# Context manager closes the file even if create_dataset raises
# (the original called h5f.close() manually, leaking the handle on error).
with h5py.File(output, 'w') as h5f:
    h5f.create_dataset('dataset_features', data=feats)
    h5f.create_dataset('dataset_names', data=np.string_(names))

# --- Online: embed the query image and rank database images by score ---
model = VGGNet()
queryVec = model.vgg_extract_feat(imgs)
# Dot product against all stored features; assumes vgg_extract_feat returns
# L2-normalized vectors so this acts as cosine similarity — TODO confirm.
scores = np.dot(queryVec, feats.T)
rank_ID = np.argsort(scores)[::-1]  # descending: best match first
rank_score = scores[rank_ID]

# Evaluation & Conclusion
Experiments on a sample logo dataset show that VGG16‑based and SIFT‑based retrieval are the most accurate and stable, while simple histogram and hash methods can be fast but less reliable. The choice of method should depend on data characteristics, required speed, and robustness; combining complementary approaches often yields the best practical performance.
360 Quality & Efficiency
360 Quality & Efficiency focuses on seamlessly integrating quality and efficiency in R&D, sharing 360’s internal best practices with industry peers to foster collaboration among Chinese enterprises and drive greater efficiency value.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.