在空间地理应用中,常常需要判断两个地址是否为同一个地址,以及计算两个坐标之间的直线距离。下面把这些常用算法进行封装。判断两个地址是否为同一地址的相似度算法采用余弦相似度:用向量空间中两个向量夹角的余弦值来衡量两个个体之间差异的大小。相比距离度量,余弦相似度更加注重两个向量在方向上的差异,而非距离或长度上的差异。
/**
 * Scores how similar two addresses are.
 *
 * <p>Both addresses are split into typed words by {@code WordSolrUtil.splitAddressWordNames}
 * (a map from integer word type to word). Words of type 4, 5 and 6 have their administrative
 * affixes replaced by one canonical suffix, words that match each other closely are recorded,
 * gaps between matched types are equalised according to {@link #UNION_SIMILAR}, and finally
 * the two re-assembled strings are compared with a character-count cosine similarity.
 */
public final class ScoreUtil {
    private static final Logger log = LoggerFactory.getLogger(ScoreUtil.class);

    /** Comma-separated affix words of township-level names (word type 4). */
    private static final String WORD_TYPE_4 = "乡,镇,街道办事处,街道办,街道";

    /** Comma-separated affix words of community / residents'-committee names (word type 5). */
    private static final String WORD_TYPE_5 = "社区,社区居民委员会,村民委员会,社区居村委,社区居委,居村委,居委会,村委会,委员会,居委,村委,委会,村";

    /** Comma-separated affix words of road names (word type 6). */
    private static final String WORD_TYPE_6 = "道路,路,大街,街,巷";

    /** Custom combinations of word types ('&'-separated, ascending) used to equalise addresses. */
    private static final String[] UNION_SIMILAR = { "1&2&4&6", "1&3&6", "1&3" };

    /** Matches one of the {@link #WORD_TYPE_4} words at the end of a word. */
    private static final java.util.regex.Pattern TYPE_4_SUFFIX = affixPattern(WORD_TYPE_4, true);

    /** Matches any occurrence of one of the {@link #WORD_TYPE_5} words. */
    private static final java.util.regex.Pattern TYPE_5_AFFIX = affixPattern(WORD_TYPE_5, false);

    /** Matches one of the {@link #WORD_TYPE_6} words at the end of a word. */
    private static final java.util.regex.Pattern TYPE_6_SUFFIX = affixPattern(WORD_TYPE_6, true);

    /** Minimum {@link #strSimilarMatch} value for two words of the same type to count as a hit. */
    private static final double WORD_MATCH_THRESHOLD = 0.9;

    /** Utility class; not instantiable. */
    private ScoreUtil() {
    }

    /**
     * Computes the similarity of two addresses.
     *
     * @param sourceAddr source address
     * @param standAddr  standard address
     * @return the cosine similarity of the normalised addresses multiplied by 100 and rounded;
     *         {@code 0} when either address yields no words
     * @throws Exception propagated from {@code WordSolrUtil.splitAddressWordNames}
     */
    public static long score(String sourceAddr, String standAddr) throws Exception {
        Map<Integer, String> sourceSplit = WordSolrUtil.splitAddressWordNames(sourceAddr);
        Map<Integer, String> standSplit = WordSolrUtil.splitAddressWordNames(standAddr);
        if (sourceSplit == null || standSplit == null || sourceSplit.isEmpty() || standSplit.isEmpty()) {
            return 0L;
        }
        // Work on private copies: the maps are rewritten below and the splitter's result
        // may be shared or unmodifiable (not visible from here).
        Map<Integer, String> sourceWords = new java.util.HashMap<Integer, String>(sourceSplit);
        Map<Integer, String> standWords = new java.util.HashMap<Integer, String>(standSplit);

        int sourceMaxType = Collections.max(sourceWords.keySet());
        int standMaxType = Collections.max(standWords.keySet());
        int maxType = Math.max(sourceMaxType, standMaxType);

        // Types whose words match on both sides, collected in ascending order.
        int[] hitBuffer = new int[Math.max(maxType, 0)];
        int hitCount = 0;
        for (int type = 1; type <= maxType; type++) {
            normalizeWord(sourceWords, type);
            normalizeWord(standWords, type);
            if (strSimilarMatch(sourceWords.get(type), standWords.get(type)) >= WORD_MATCH_THRESHOLD) {
                hitBuffer[hitCount++] = type;
            }
        }

        filterAddrToEquivalent(sourceWords, standWords, Arrays.copyOf(hitBuffer, hitCount));

        String sourceCompare = joinWords(sourceWords, maxType);
        String standCompare = joinWords(standWords, maxType);
        log.debug("源地址过滤后:" + sourceCompare);
        log.debug("标地址过滤后:" + standCompare);
        return Math.round(linearSpaceVectorMatch(sourceCompare, standCompare));
    }

    /**
     * Builds a pattern matching any of the comma-separated words as a whole word.
     *
     * <p>The words are joined as a regex alternation. Wrapping the raw list in {@code [...]}
     * would create a character class that matches a single character only, so a suffix such
     * as "街道办事处" would lose just its last character.
     *
     * @param commaSeparatedWords the affix words, separated by ','
     * @param anchorAtEnd         whether the match must end at the end of the input
     * @return the compiled pattern
     */
    private static java.util.regex.Pattern affixPattern(String commaSeparatedWords, boolean anchorAtEnd) {
        String[] words = commaSeparatedWords.split(",");
        // Longest first, so that a word always wins over its own prefix (e.g. "社区居委" over "社区").
        Arrays.sort(words, new Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                return right.length() - left.length();
            }
        });
        StringBuilder regex = new StringBuilder("(?:");
        for (int i = 0; i < words.length; i++) {
            if (i > 0) {
                regex.append('|');
            }
            regex.append(java.util.regex.Pattern.quote(words[i]));
        }
        regex.append(')');
        if (anchorAtEnd) {
            regex.append('$');
        }
        return java.util.regex.Pattern.compile(regex.toString());
    }

    /**
     * Replaces the affix of a type 4, 5 or 6 word with the canonical suffix of that type so
     * that differently written affixes compare equal. Blank or absent words and all other
     * types are left untouched (in particular no key is added to the map).
     *
     * @param words word type to word, modified in place
     * @param type  the word type to normalise
     */
    private static void normalizeWord(Map<Integer, String> words, int type) {
        String word = words.get(type);
        if (StringUtils.isBlank(word)) {
            return;
        }
        switch (type) {
        case 4:
            words.put(type, TYPE_4_SUFFIX.matcher(word).replaceFirst("") + "乡镇");
            break;
        case 5:
            // Every occurrence is removed, not only a trailing one.
            words.put(type, TYPE_5_AFFIX.matcher(word).replaceAll("") + "居村委");
            break;
        case 6:
            words.put(type, TYPE_6_SUFFIX.matcher(word).replaceFirst("") + "大道");
            break;
        default:
            break;
        }
    }

    /**
     * Equalises the two addresses according to {@link #UNION_SIMILAR}.
     *
     * <p>For each configured combination, in order: if the combination has at least as many
     * types as are currently matched, it applies when every matched type is part of it;
     * otherwise it applies when all of its types are matched, and it then replaces the matched
     * set for the rest of the loop. When a combination applies, every type lying between the
     * smallest and largest matched type that is not itself matched is made identical on both
     * sides: the source word is copied to the standard address if the source has that type,
     * else the standard word is copied to the source.
     *
     * @param sourceWords  words of the source address, modified in place
     * @param standWords   words of the standard address, modified in place
     * @param matchedTypes word types that matched on both sides, sorted ascending
     */
    private static void filterAddrToEquivalent(Map<Integer, String> sourceWords, Map<Integer, String> standWords,
            int[] matchedTypes) {
        int[] matched = matchedTypes;
        for (String union : UNION_SIMILAR) {
            int[] configured = parseTypes(union);
            boolean agrees;
            if (configured.length >= matched.length) {
                agrees = containsAll(configured, matched);
            } else {
                agrees = containsAll(matched, configured);
                if (agrees) {
                    matched = configured;
                }
            }
            if (!agrees || matched.length == 0) {
                continue;
            }
            // Fill in the unmatched types inside the matched range.
            int last = matched[matched.length - 1];
            for (int type = matched[0]; type <= last; type++) {
                if (Arrays.binarySearch(matched, type) >= 0) {
                    continue;
                }
                if (sourceWords.containsKey(type)) {
                    standWords.put(type, sourceWords.get(type));
                } else if (standWords.containsKey(type)) {
                    sourceWords.put(type, standWords.get(type));
                }
            }
        }
    }

    /**
     * Parses an '&'-separated list of word types into a numerically sorted array, skipping
     * blank tokens.
     *
     * @param joined e.g. {@code "1&3&6"}
     * @return the types in ascending numeric order
     */
    private static int[] parseTypes(String joined) {
        String[] tokens = joined.split("&");
        int[] types = new int[tokens.length];
        int count = 0;
        for (String token : tokens) {
            if (StringUtils.isNotBlank(token)) {
                types[count++] = Integer.parseInt(token);
            }
        }
        int[] sorted = Arrays.copyOf(types, count);
        Arrays.sort(sorted);
        return sorted;
    }

    /**
     * Tells whether every value of {@code needles} occurs in {@code sortedHaystack}.
     *
     * @param sortedHaystack values in ascending order
     * @param needles        values to look up
     * @return {@code true} if all needles are present (also when there are none)
     */
    private static boolean containsAll(int[] sortedHaystack, int[] needles) {
        for (int needle : needles) {
            if (Arrays.binarySearch(sortedHaystack, needle) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Concatenates the trimmed words of types {@code 1..maxType} in ascending type order;
     * missing types contribute nothing.
     *
     * @param words   word type to word
     * @param maxType highest type to include
     * @return the assembled comparison string
     */
    private static String joinWords(Map<Integer, String> words, int maxType) {
        StringBuilder joined = new StringBuilder();
        for (int type = 1; type <= maxType; type++) {
            joined.append(StringUtils.trimToEmpty(words.get(type)));
        }
        return joined.toString();
    }

    /**
     * Cosine similarity of two strings, each represented as a vector of per-character
     * occurrence counts over the union of their characters.
     *
     * @param source first string
     * @param target second string
     * @return the cosine multiplied by 100; {@code 0} if either string is empty
     */
    private static double linearSpaceVectorMatch(String source, String target) {
        Set<Character> alphabet = new HashSet<Character>();
        for (char c : source.toCharArray()) {
            alphabet.add(c);
        }
        for (char c : target.toCharArray()) {
            alphabet.add(c);
        }
        Character[] axes = alphabet.toArray(new Character[alphabet.size()]);
        int[] sourceVector = parseAddrToSpaceVector(axes, source);
        int[] targetVector = parseAddrToSpaceVector(axes, target);
        return cos(sourceVector, targetVector) * 100;
    }

    /**
     * Cosine of the angle between two vectors of equal length.
     *
     * @param point1 first vector
     * @param point2 second vector
     * @return dot product divided by the product of the norms, or {@code 0} when either
     *         vector has zero length (the quotient would otherwise be NaN)
     */
    private static double cos(int[] point1, int[] point2) {
        long dot = 0;
        long squares1 = 0;
        long squares2 = 0;
        for (int i = 0; i < point1.length; i++) {
            dot += (long) point1[i] * point2[i];
            squares1 += (long) point1[i] * point1[i];
        }
        for (int value : point2) {
            squares2 += (long) value * value;
        }
        if (squares1 == 0 || squares2 == 0) {
            return 0.0;
        }
        return dot / (Math.sqrt(squares1) * Math.sqrt(squares2));
    }

    /**
     * Converts a string into a count vector: element {@code i} is the number of times
     * {@code tag[i]} occurs in {@code str}.
     *
     * @param tag the characters defining the vector axes
     * @param str the string to count in
     * @return the occurrence counts, aligned with {@code tag}
     */
    private static int[] parseAddrToSpaceVector(Character[] tag, String str) {
        char[] chars = str.toCharArray();
        int[] counts = new int[tag.length];
        for (int axis = 0; axis < tag.length; axis++) {
            char wanted = tag[axis];
            for (char c : chars) {
                if (c == wanted) {
                    counts[axis]++;
                }
            }
        }
        return counts;
    }

    /**
     * String similarity based on edit distance (insert, delete and substitute each cost 1).
     *
     * @param compare first string
     * @param to      second string
     * @return {@code 1 - distance / max(length)}, a value in {@code [0, 1]};
     *         {@code 0} if either argument is blank
     */
    public static double strSimilarMatch(String compare, String to) {
        if (StringUtils.isBlank(compare) || StringUtils.isBlank(to)) {
            return 0;
        }
        int len1 = compare.length();
        int len2 = to.length();
        // Only the previous and the current row of the distance matrix are needed.
        int[] previous = new int[len2 + 1];
        int[] current = new int[len2 + 1];
        for (int j = 0; j <= len2; j++) {
            previous[j] = j;
        }
        for (int i = 1; i <= len1; i++) {
            current[0] = i;
            for (int j = 1; j <= len2; j++) {
                int substitution = compare.charAt(i - 1) == to.charAt(j - 1) ? 0 : 1;
                current[j] = Math.min(previous[j - 1] + substitution,
                        Math.min(current[j - 1] + 1, previous[j] + 1));
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return 1 - (double) previous[len2] / Math.max(len1, len2);
    }
}