在实际工程应用中,经常会遇到需要计算相似度的场景。我在之前的一篇文章里已经总结记录了绝大多数数值型数据的相似度计算方法;图片或文本数据无法直接计算相似度,但可以先将其转化为数值型表示,再采用上述方法进行计算,这是其中一种思路。
本文主要针对图像数据的相似度计算进行说明。如果有读者需要了解文本相似度的计算方法,可以留言,我会另写文章介绍。
本文主要是总结记录了我在实际工作过程中所用到的图像数据相似度计算方法,具体的代码实现如下:
#!/usr/bin/env python
# encoding:utf-8
"""
__Author__:沂水寒城
Purpose: a collection of image similarity measures.

All functions expect images in OpenCV's native layout (``numpy`` arrays in
BGR channel order, as returned by ``cv2.imread``).
"""
from __future__ import division, print_function

import os
import cv2
import imutils
import argparse
import numpy as np
from matplotlib import pyplot as plt

try:
    # compare_ssim was deprecated in scikit-image 0.16 and removed in 0.18;
    # structural_similarity is its drop-in replacement.
    from skimage.metrics import structural_similarity as compare_ssim
except ImportError:
    from skimage.measure import compare_ssim


def _toGray(image):
    """Return a single-channel version of ``image``.

    Already-grayscale (2-D) input is returned unchanged so the similarity
    functions accept both colour and grayscale images.
    """
    if image.ndim == 2:
        return image
    if image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def _channelHist(image):
    """Return the 256-bin histogram of channel 0 of ``image``."""
    # The upper bound of an OpenCV histogram range is exclusive, so it must
    # be 256 (not 255) for pixels of value 255 to be counted.
    return cv2.calcHist([image], [0], None, [256], [0.0, 256.0])


def _histOverlap(hist1, hist2):
    """Return the mean per-bin overlap of two histograms as a float.

    For each bin the overlap is ``1 - |a - b| / max(a, b)``; bins where both
    counts are equal (including both empty) score 1.
    """
    hist1 = np.asarray(hist1, dtype=np.float64).ravel()
    hist2 = np.asarray(hist2, dtype=np.float64).ravel()
    larger = np.maximum(hist1, hist2)
    overlap = np.ones_like(larger)
    # Only divide where the larger count is non-zero; this skips the 0/0
    # case of two empty bins, which already counts as a perfect match.
    populated = larger > 0
    overlap[populated] = (
        1.0 - np.abs(hist1 - hist2)[populated] / larger[populated]
    )
    return float(overlap.mean())


def grayHistSim(image1, image2, size=(256, 256), show=True):
    """Use the overlap of the gray-level histograms as the similarity.

    Args:
        image1: First image (BGR or grayscale).
        image2: Second image (BGR or grayscale).
        size: ``(width, height)`` both images are resized to first.
        show: When true (the default), plot both histograms and block on
            ``plt.show()``.

    Returns:
        The histogram overlap as a float; 1.0 means identical histograms.
    """
    # Convert to grayscale explicitly: channel 0 of a BGR image is blue,
    # not intensity.
    gray1 = _toGray(cv2.resize(image1, size))
    gray2 = _toGray(cv2.resize(image2, size))
    hist1 = _channelHist(gray1)
    hist2 = _channelHist(gray2)
    if show:
        plt.plot(range(256), hist1, "r")
        plt.plot(range(256), hist2, "b")
        plt.show()
    degree = _histOverlap(hist1, hist2)
    print("grayHistSim: ", degree)
    return degree


def calculate(image1, image2):
    """Return the histogram overlap of two single-channel images.

    Args:
        image1: First single-channel image.
        image2: Second single-channel image.

    Returns:
        The histogram overlap as a float; 1.0 means identical histograms.
    """
    return _histOverlap(_channelHist(image1), _channelHist(image2))


def threeWayAvg(image1, image2, size=(256, 256)):
    """Resize, split into channels, and average the per-channel similarity.

    Args:
        image1: First image.
        image2: Second image.
        size: ``(width, height)`` both images are resized to first.

    Returns:
        The mean of the per-channel histogram overlaps as a float.
    """
    channels1 = cv2.split(cv2.resize(image1, size))
    channels2 = cv2.split(cv2.resize(image2, size))
    scores = [calculate(im1, im2) for im1, im2 in zip(channels1, channels2)]
    # Divide by the number of channels actually compared rather than a
    # hard-coded 3, so non-3-channel input is averaged correctly.
    sub_data = sum(scores) / len(scores)
    print("threeWayAvg: ", sub_data)
    return sub_data


def avgHash(image1, image2):
    """Compare two images with the average-hash (aHash) algorithm.

    Args:
        image1: First image (BGR or grayscale).
        image2: Second image (BGR or grayscale).

    Returns:
        The normalised Hamming distance between the two 64-bit hashes:
        0.0 for identical hashes, 1.0 when every bit differs.
    """
    gray1 = _toGray(cv2.resize(image1, (8, 8)))
    gray2 = _toGray(cv2.resize(image2, (8, 8)))
    res = hammingDistance(getHash(gray1), getHash(gray2))
    print("avgHash: ", res)
    return res


def perHash(image1, image2):
    """Compare two images with the perceptual-hash (pHash) algorithm.

    Args:
        image1: First image (BGR or grayscale).
        image2: Second image (BGR or grayscale).

    Returns:
        The normalised Hamming distance between the two 64-bit hashes:
        0.0 for identical hashes, 1.0 when every bit differs.
    """
    gray1 = _toGray(cv2.resize(image1, (32, 32)))
    gray2 = _toGray(cv2.resize(image2, (32, 32)))
    # cv2.dct needs floating-point input.
    dct1 = cv2.dct(np.float32(gray1))
    dct2 = cv2.dct(np.float32(gray2))
    # Keep only the top-left 8x8 block of DCT coefficients.
    res = hammingDistance(getHash(dct1[0:8, 0:8]), getHash(dct2[0:8, 0:8]))
    print("perHash: ", res)
    return res


def getHash(image):
    """Return the mean-threshold hash of a 2-D array.

    Args:
        image: 2-D array (a grayscale image or a block of DCT coefficients).

    Returns:
        A flat row-major list of ints: 1 where the element is strictly
        greater than the array mean, otherwise 0.
    """
    image = np.asarray(image)
    return (image > np.mean(image)).astype(int).ravel().tolist()


def hammingDistance(hash1, hash2):
    """Return the normalised Hamming distance between two hashes.

    Args:
        hash1: First hash, a sequence of bits.
        hash2: Second hash, a sequence of bits of the same length.

    Returns:
        The fraction of positions at which the hashes differ, in
        ``[0.0, 1.0]``. Two empty hashes have distance 0.0.

    Raises:
        ValueError: If the hashes have different lengths.
    """
    if len(hash1) != len(hash2):
        raise ValueError(
            "hash lengths differ: %d != %d" % (len(hash1), len(hash2))
        )
    if not hash1:
        return 0.0
    return sum(1 for a, b in zip(hash1, hash2) if a != b) / len(hash1)


def getPictures(file_path):
    """Collect the names of all files under a directory tree.

    Args:
        file_path: Root directory to walk.

    Returns:
        A list of file names (without directory components) from every
        directory under ``file_path``; empty if nothing is found.
    """
    all_files = []
    for root, dirs, files in os.walk(file_path):
        print(files)
        all_files.extend(files)
    return all_files


def yellowSimilarity(image1, image2, size=(256, 256)):
    """Measure similarity on the two channels that make up yellow.

    Yellow is ``#FFFF00``, i.e. red plus green, so only the red and green
    channels are compared.

    Args:
        image1: First image, at least 3 channels in BGR order.
        image2: Second image, at least 3 channels in BGR order.
        size: ``(width, height)`` both images are resized to first.

    Returns:
        The mean histogram overlap of the green and red channels as a float.
    """
    channels1 = cv2.split(cv2.resize(image1, size))
    channels2 = cv2.split(cv2.resize(image2, size))
    # OpenCV stores pixels as B, G, R, so green and red are indices 1 and 2
    # (indices 0 and 1 would compare blue and green instead).
    simRes = sum(calculate(channels1[i], channels2[i]) for i in (1, 2)) / 2
    print("yellowSimilarity: ", simRes)
    return simRes


def ssimSimilarity(image1, image2, size=(256, 256)):
    """Compute the structural similarity index (SSIM) of two images.

    Higher is more similar; the maximum is 1.

    Args:
        image1: First image (BGR or grayscale).
        image2: Second image (BGR or grayscale).
        size: ``(width, height)`` both images are resized to first.

    Returns:
        The SSIM score.
    """
    grayA = _toGray(cv2.resize(image1, size))
    grayB = _toGray(cv2.resize(image2, size))
    score = compare_ssim(grayA, grayB)
    print("SSIM: ", score)
    return score


def _loadImage(path):
    """Read an image from ``path``, raising ``IOError`` if it cannot be read."""
    image = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise IOError("cannot read image: %s" % path)
    return image


def main():
    """Run every similarity measure on the two sample images."""
    image1 = _loadImage("0.jpg")
    image2 = _loadImage("1.jpg")
    yellowSimilarity(image1, image2, size=(256, 256))
    threeWayAvg(image1, image2, size=(256, 256))
    grayHistSim(image1, image2, size=(256, 256))
    ssimSimilarity(image1, image2)
    avgHash(image1, image2)
    perHash(image1, image2)


if __name__ == "__main__":
    main()
我们选用了两张很有意思的图片
1.jpg
2.jpg
简单的测试结果输出如下: