Golang计算两个字符串的匹配度方法探讨

我有以下两个字符串,它们实际上表示相同的意思:

GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO

Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent

我有超过1000组这样的数据,手动处理简直是噩梦。我尝试了如下方法:

package main

import (
	"fmt"
	"strings"
)

// main counts how many (word of str1, word of str2) pairs are equal when
// compared case-insensitively, then prints that count together with the byte
// length of each string.
//
// Words are whitespace-separated tokens, so attached punctuation stays part of
// the token ("Gloves," and "GLOVES:" are different words).
func main() {
	str1 := "Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent"
	str2 := "GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO"

	// Split and lower-case str2 exactly once. Summing the per-word frequency
	// yields the same number of matching pairs as comparing every word of
	// str1 against every word of str2, without redoing the work per pair.
	freq := make(map[string]int)
	for _, w := range strings.Fields(str2) {
		freq[strings.ToLower(w)]++
	}

	cnt := 0
	for _, w := range strings.Fields(str1) {
		cnt += freq[strings.ToLower(w)]
	}
	fmt.Printf("str1 is: %d length, and str2 is: %d length, they have; %d common words.", len(str1), len(str2), cnt)
}

但匹配度非常低,我得到的结果是:

str1 is: 197 length, and str2 is: 358 length, they have; 29 common words.

我也尝试使用了Levenshtein距离,代码如下:

// Levenshtein Distance in Golang
package main
import "fmt"
 
// levenshtein returns the edit distance between str1 and str2, where each
// insertion, deletion or substitution of a single rune costs 1.
//
// Only one column of the distance table is kept: after processing the first k
// runes of str2, dist[y] is the distance between str1[:y] and str2[:k].
func levenshtein(str1, str2 []rune) int {
	rows := len(str1)
	dist := make([]int, rows+1)
	for y := range dist {
		dist[y] = y
	}

	for x, target := range str2 {
		// diag is the table entry up-and-left of the cell being filled.
		diag := x
		dist[0] = x + 1
		for y, source := range str1 {
			cost := 0
			if source != target {
				cost = 1
			}
			previous := dist[y+1]
			dist[y+1] = minimum(previous+1, dist[y]+1, diag+cost)
			diag = previous
		}
	}
	return dist[rows]
}
 
// minimum returns the smallest of the three integers a, b and c.
func minimum(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}
 
// main prints the Levenshtein distance between the two sample product
// descriptions.
func main() {
	const (
		first  = "Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent"
		second = "GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO"
	)

	// The distance is computed over runes rather than bytes.
	distance := levenshtein([]rune(first), []rune(second))
	fmt.Println("Distance between str1 and str2:", distance)
}

但计算出的编辑距离看起来非常大,我得到的结果是:

Distance between str1 and str2: 304

有什么办法可以改进吗?


更多关于Golang计算两个字符串的匹配度方法探讨的实战教程也可以访问 https://www.itying.com/category-94-b0.html

4 回复

我会定义一个结构体并编写一个自定义解析器。

更多关于Golang计算两个字符串的匹配度方法探讨的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html


有任何示例吗?

您可以查看任何 YAML、TOML 或 JSON 解析器的代码。这是一个简单的任务。例如,在您的结构体中放置一个整数字段 Size(int),在字符串中查找 “size: 7” 并将 size 设置为 7。这样匹配起来会更容易。

对于文本匹配度计算,推荐使用基于词频的余弦相似度算法,配合文本预处理可以获得更好的效果。以下是改进方案:

package main

import (
	"fmt"
	"math"
	"regexp"
	"strings"
)

// nonWordChars matches every character that is not a letter, a digit, an
// underscore or whitespace. It uses Unicode classes because Go's \w is
// ASCII-only and would treat letters such as "é" or "ß" as punctuation.
// Compiled once at package scope instead of on every call.
var nonWordChars = regexp.MustCompile(`[^\p{L}\p{N}_\s]`)

// preprocessText normalizes text for word-based comparison: it lower-cases the
// input, replaces punctuation with spaces and collapses runs of whitespace
// into single spaces. Underscores are kept, so "ORIGINAL_MNFR" stays one token.
// An empty or punctuation-only input yields "".
func preprocessText(text string) string {
	text = strings.ToLower(text)

	// Replace with a space rather than deleting so that "388/BS" becomes two
	// tokens instead of being glued together.
	text = nonWordChars.ReplaceAllString(text, " ")

	// Fields drops leading/trailing whitespace and splits on runs of it.
	return strings.Join(strings.Fields(text), " ")
}

// stopWords lists common English words that are ignored when counting term
// frequencies (extend as needed). It is built once at package scope rather
// than once per word.
var stopWords = map[string]struct{}{
	"a": {}, "an": {}, "the": {}, "and": {}, "or": {},
	"to": {}, "in": {}, "of": {}, "for": {}, "as": {},
}

// buildWordVector returns the term-frequency vector of text: a map from each
// whitespace-separated word to the number of times it occurs.
//
// Stop words and tokens that are a single byte long (e.g. "7") are skipped.
// The result is never nil; it is empty when no word qualifies.
func buildWordVector(text string) map[string]int {
	vector := make(map[string]int)
	for _, word := range strings.Fields(text) {
		if len(word) <= 1 {
			continue
		}
		if _, skip := stopWords[word]; skip {
			continue
		}
		vector[word]++
	}
	return vector
}

// cosineSimilarity returns the cosine of the angle between two term-frequency
// vectors: dot(vec1, vec2) / (|vec1| * |vec2|).
//
// For non-negative counts the result lies in [0, 1]: 1 for vectors pointing in
// the same direction, 0 when they share no word. It returns 0 when either
// vector is nil, empty or has zero magnitude, so the division is always safe.
func cosineSimilarity(vec1, vec2 map[string]int) float64 {
	if len(vec1) == 0 || len(vec2) == 0 {
		return 0.0
	}

	// Dot product over the words present in both vectors. Counts are
	// converted before multiplying so large counts cannot overflow int.
	dotProduct := 0.0
	for word, count1 := range vec1 {
		if count2, exists := vec2[word]; exists {
			dotProduct += float64(count1) * float64(count2)
		}
	}

	// Euclidean length of a vector. math.Sqrt is exact for any magnitude,
	// unlike a fixed number of Newton iterations.
	magnitude := func(vec map[string]int) float64 {
		sum := 0.0
		for _, count := range vec {
			c := float64(count)
			sum += c * c
		}
		return math.Sqrt(sum)
	}
	magnitude1 := magnitude(vec1)
	magnitude2 := magnitude(vec2)

	// Guard against division by zero (all counts zero).
	if magnitude1 == 0 || magnitude2 == 0 {
		return 0.0
	}

	return dotProduct / (magnitude1 * magnitude2)
}

// sqrt returns the square root of x.
//
// It delegates to math.Sqrt. A fixed ten Newton steps starting from 1 has not
// converged for large inputs (about 1296 for x = 1e6) and never reaches 0 for
// x = 0, which defeats callers that compare the result against zero.
// As with math.Sqrt, a negative x yields NaN.
func sqrt(x float64) float64 {
	return math.Sqrt(x)
}

// Patterns used by extractKeyInfo. They are applied to lower-cased text and
// compiled once at package scope instead of on every call.
var (
	materialRe = regexp.MustCompile(`(neoprene|rubber|nitrile|latex)`)
	sizeRe     = regexp.MustCompile(`size\s*:\s*(\d+)`)
	lengthRe   = regexp.MustCompile(`length\s*:\s*(\d+)\s*cm`)
	// The leading \b keeps "en <digits>" from matching inside a longer word
	// (e.g. the tail of "oxygen 5").
	standardRe     = regexp.MustCompile(`\b(?:bs\s*en\s*\d+|en\s*\d+)`)
	manufacturerRe = regexp.MustCompile(`(polyco|ansell|showa|kimberly)`)
)

// extractKeyInfo pulls the recognizable product attributes out of text and
// returns them as one space-separated, lower-case string.
//
// Features are emitted in a fixed order: every material keyword found, the
// first "size: N" as "sizeN", the first "length: N cm" as "lengthN", every
// "BS EN nnn" / "EN nnn" standard, and every known manufacturer name.
// It returns "" when nothing is recognized.
func extractKeyInfo(text string) string {
	text = strings.ToLower(text)

	var features []string

	// Materials: every occurrence is kept, duplicates included, so repeated
	// mentions weigh more in a later frequency vector.
	features = append(features, materialRe.FindAllString(text, -1)...)

	// Size and length: only the first occurrence is used.
	if match := sizeRe.FindStringSubmatch(text); match != nil {
		features = append(features, "size"+match[1])
	}
	if match := lengthRe.FindStringSubmatch(text); match != nil {
		features = append(features, "length"+match[1])
	}

	// Standards and manufacturers: every occurrence is kept.
	features = append(features, standardRe.FindAllString(text, -1)...)
	features = append(features, manufacturerRe.FindAllString(text, -1)...)

	return strings.Join(features, " ")
}

// main scores the two sample product descriptions with three measures and
// prints each score as a fraction and as a percentage.
func main() {
	const (
		str1 = "Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent"
		str2 = "GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO"
	)

	// The normalized texts are shared by approaches 1 and 3.
	clean1, clean2 := preprocessText(str1), preprocessText(str2)

	// Approach 1: cosine similarity over the word-frequency vectors.
	similarity := cosineSimilarity(buildWordVector(clean1), buildWordVector(clean2))
	fmt.Printf("余弦相似度: %.4f (%.2f%%)\n", similarity, similarity*100)

	// Approach 2: cosine similarity over the extracted key attributes only.
	features1 := buildWordVector(extractKeyInfo(str1))
	features2 := buildWordVector(extractKeyInfo(str2))
	keySimilarity := cosineSimilarity(features1, features2)
	fmt.Printf("关键信息相似度: %.4f (%.2f%%)\n", keySimilarity, keySimilarity*100)

	// Approach 3: Jaccard index over the sets of distinct words.
	jaccardSimilarity := jaccardIndex(clean1, clean2)
	fmt.Printf("Jaccard相似度: %.4f (%.2f%%)\n", jaccardSimilarity, jaccardSimilarity*100)
}

// jaccardIndex returns the Jaccard similarity of the two texts, treating each
// as the set of its distinct whitespace-separated words:
// |intersection| / |union|. It returns 0 when both texts contain no words.
func jaccardIndex(text1, text2 string) float64 {
	// uniqueWords collects the distinct words of a text.
	uniqueWords := func(text string) map[string]struct{} {
		seen := make(map[string]struct{})
		for _, w := range strings.Fields(text) {
			seen[w] = struct{}{}
		}
		return seen
	}
	left, right := uniqueWords(text1), uniqueWords(text2)

	// Size of the intersection.
	shared := 0
	for w := range left {
		if _, ok := right[w]; ok {
			shared++
		}
	}

	// Size of the union, by inclusion-exclusion.
	total := len(left) + len(right) - shared
	if total == 0 {
		return 0.0
	}
	return float64(shared) / float64(total)
}

对于1000组数据的批量处理,可以这样实现:

// StringPair holds two texts that are to be compared with each other.
type StringPair struct {
	Str1, Str2 string
}

// batchProcess scores every pair and returns the scores in input order, one
// per pair.
//
// Each score is a weighted blend: 0.7 times the cosine similarity of the
// preprocessed full texts plus 0.3 times the similarity of the extracted key
// attributes. Adjust the weights as needed.
func batchProcess(pairs []StringPair) []float64 {
	const (
		textWeight = 0.7
		keyWeight  = 0.3
	)

	scores := make([]float64, 0, len(pairs))
	for _, p := range pairs {
		fullText := cosineSimilarity(
			buildWordVector(preprocessText(p.Str1)),
			buildWordVector(preprocessText(p.Str2)),
		)
		keyInfo := keyInfoSimilarity(p.Str1, p.Str2)

		scores = append(scores, textWeight*fullText+keyWeight*keyInfo)
	}
	return scores
}

func keyInfoSimilarity(str1, str2 string) float64 {
	key1 := extractKeyInfo(str1)
	key2 := extractKeyInfo(str2)
	
	vec1 := buildWordVector(key1)
	vec2 := buildWordVector(key2)
	
	return cosineSimilarity(vec1, vec2)
}

这个方案通过以下改进提高匹配准确度:

  1. 文本预处理标准化
  2. 使用余弦相似度考虑词频权重
  3. 提取关键信息(尺寸、材料、标准等)进行重点匹配
  4. 结合多种相似度算法
  5. 过滤停用词减少噪声

对于你的示例数据,这个方案应该能给出更合理的匹配度评分。

回到顶部