Golang计算两个字符串的匹配度方法探讨
我有以下两个字符串,它们实际上表示相同的意思:
GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO
和
Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent
我有超过1000组这样的数据,手动处理简直是噩梦。我尝试了如下方法:
package main
import (
"fmt"
"strings"
)
// main counts, case-insensitively, how many (word of str1, word of str2)
// pairs are equal and prints that count together with the byte lengths of
// both strings. Words are split on whitespace only, so attached punctuation
// (for example "Gloves," versus "GLOVES:") is part of the word.
func main() {
	var str1 = "Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent"
	var str2 = "GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO"

	// Tally every lower-cased word of str2 once, so each word of str1 needs a
	// single map lookup instead of a fresh split and rescan of str2.
	occurrences := make(map[string]int)
	for _, word := range strings.Fields(str2) {
		occurrences[strings.ToLower(word)]++
	}

	// A word of str1 that occurs k times in str2 contributes k to the total,
	// which is the same number a comparison of every word pair produces.
	cnt := 0
	for _, word := range strings.Fields(str1) {
		cnt += occurrences[strings.ToLower(word)]
	}

	fmt.Printf("str1 is: %d length, and str2 is: %d length, they have; %d common words.", len(str1), len(str2), cnt)
}
但匹配度非常低,我得到的结果是:
str1 is: 197 length, and str2 is: 358 length, they have; 29 common words.
我也尝试使用了Levenshtein距离,代码如下:
// Levenshtein Distance in Golang
package main
import "fmt"
// levenshtein returns the edit distance between str1 and str2: the minimum
// number of single-rune insertions, deletions and substitutions that turn one
// into the other. The comparison is exact, so runes that differ only in case
// count as different.
//
// Only one column of the dynamic-programming table is kept in memory; it has
// len(str1)+1 entries and is overwritten once per rune of str2.
func levenshtein(str1, str2 []rune) int {
	rows := len(str1)
	cols := len(str2)

	// Distance from each prefix of str1 to the empty prefix of str2.
	dist := make([]int, rows+1)
	for i := range dist {
		dist[i] = i
	}

	for j := 1; j <= cols; j++ {
		// diag is the table cell up and to the left of the cell being filled;
		// for the first row that is the previous column's top entry, j-1.
		diag := j - 1
		dist[0] = j
		for i := 1; i <= rows; i++ {
			// dist[i] still holds the previous column's value; remember it
			// because it becomes the diagonal for the next row.
			previous := dist[i]
			cost := 0
			if str1[i-1] != str2[j-1] {
				cost = 1
			}
			dist[i] = minimum(previous+1, dist[i-1]+1, diag+cost)
			diag = previous
		}
	}
	return dist[rows]
}
// minimum returns the smallest of a, b and c.
func minimum(a, b, c int) int {
	smallest := c
	if a < smallest {
		smallest = a
	}
	if b < smallest {
		smallest = b
	}
	return smallest
}
// main prints the Levenshtein distance between the two sample product
// descriptions, compared rune by rune without any normalisation.
func main() {
	first := []rune("Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent")
	second := []rune("GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO")

	distance := levenshtein(first, second)
	fmt.Println("Distance between str1 and str2:", distance)
}
但它们之间的距离看起来非常大,我得到的结果是:
Distance between str1 and str2: 304
有什么办法可以改进吗?
更多关于Golang计算两个字符串的匹配度方法探讨的实战教程也可以访问 https://www.itying.com/category-94-b0.html
4 回复
我会定义一个结构体并编写一个自定义解析器。
更多关于Golang计算两个字符串的匹配度方法探讨的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html
有任何示例吗?
您可以查看任何 YAML、TOML 或 JSON 解析器的代码。这是一个简单的任务。例如,在您的结构体中放置一个整数字段 Size(int),在字符串中查找 “size: 7” 并将 size 设置为 7。这样匹配起来会更容易。
对于文本匹配度计算,推荐使用基于词频的余弦相似度算法,配合文本预处理可以获得更好的效果。以下是改进方案:
package main
import (
"fmt"
"regexp"
"strings"
)
// nonWordChars matches every character that is not a letter, a combining
// mark, a digit, an underscore or whitespace. The Unicode classes are spelled
// out because \w in Go's regexp syntax covers ASCII only, which would strip
// letters such as "é" or "ß" as if they were punctuation. It is compiled once
// at package scope rather than on every call.
var nonWordChars = regexp.MustCompile(`[^\p{L}\p{M}\p{N}_\s]`)

// preprocessText normalises text for word-level comparison: it lower-cases
// the input, replaces every punctuation or symbol character with a space, and
// collapses runs of whitespace into single spaces with no leading or trailing
// space. An empty or all-punctuation input yields "".
func preprocessText(text string) string {
	text = strings.ToLower(text)
	// Replace with a space rather than deleting, so "388/BS" splits into two
	// words instead of fusing into one.
	text = nonWordChars.ReplaceAllString(text, " ")
	// Fields + Join trims the ends and squeezes repeated whitespace.
	return strings.Join(strings.Fields(text), " ")
}
// isStopWord reports whether word is one of the common English function
// words that buildWordVector leaves out. The match is exact, so callers are
// expected to pass lower-cased words. Extend the list as needed.
func isStopWord(word string) bool {
	switch word {
	case "a", "an", "the", "and", "or",
		"to", "in", "of", "for", "as":
		return true
	}
	return false
}

// buildWordVector splits text on whitespace and returns a map from each kept
// word to the number of times it occurs. Stop words and words whose length is
// a single byte (for example "7") are not counted. The text is used as given;
// no case folding or punctuation removal happens here.
func buildWordVector(text string) map[string]int {
	vector := make(map[string]int)
	for _, word := range strings.Fields(text) {
		if len(word) <= 1 || isStopWord(word) {
			continue
		}
		vector[word]++
	}
	return vector
}
// cosineSimilarity returns the cosine of the angle between two word-count
// vectors: their dot product divided by the product of their Euclidean
// lengths. Words missing from a map count as zero. If either length comes out
// as zero the result is 0 rather than a division by zero.
func cosineSimilarity(vec1, vec2 map[string]int) float64 {
	// Only words present in both vectors contribute to the dot product.
	var dot float64
	for term, n1 := range vec1 {
		n2, shared := vec2[term]
		if !shared {
			continue
		}
		dot += float64(n1 * n2)
	}

	// norm is the Euclidean length of a count vector.
	norm := func(vec map[string]int) float64 {
		var sumSquares float64
		for _, n := range vec {
			sumSquares += float64(n * n)
		}
		return sqrt(sumSquares)
	}

	norm1 := norm(vec1)
	norm2 := norm(vec2)
	if norm1 == 0 || norm2 == 0 {
		return 0.0
	}
	return dot / (norm1 * norm2)
}
// sqrt returns the square root of x, computed with Newton's method and
// iterated until the estimate stops improving. It is a dependency-free
// stand-in for math.Sqrt from the standard library.
//
// Zero and negative inputs return 0, so a caller that tests the result
// against 0 (as a zero-length-vector guard does) sees an exact zero.
func sqrt(x float64) float64 {
	if x <= 0 {
		return 0
	}

	// Start at or above the true root: x itself when x >= 1, otherwise 1.
	// From such a start every Newton step moves the estimate downwards, so
	// "the estimate no longer decreases" is a reliable stopping test.
	z := x
	if z < 1 {
		z = 1
	}
	for {
		next := (z + x/z) / 2
		// Written as !(next < z) rather than next >= z on purpose: the
		// negated form is also true when next is NaN (x is +Inf or NaN),
		// which ends the loop instead of spinning forever.
		if !(next < z) {
			return z
		}
		z = next
	}
}
// Patterns used by extractKeyInfo, compiled once at package scope instead of
// on every call. All of them are applied to lower-cased text and none is
// anchored to word boundaries, so they also match inside longer words.
var (
	materialRegex     = regexp.MustCompile(`(neoprene|rubber|nitrile|latex)`)
	sizeRegex         = regexp.MustCompile(`size\s*:\s*(\d+)`)
	lengthRegex       = regexp.MustCompile(`length\s*:\s*(\d+)\s*cm`)
	standardRegex     = regexp.MustCompile(`(bs\s*en\s*\d+|en\s*\d+)`)
	manufacturerRegex = regexp.MustCompile(`(polyco|ansell|showa|kimberly)`)
)

// extractKeyInfo lower-cases text and returns a space-separated list of the
// recognised features it contains, in this fixed order:
//
//   - every material keyword found
//   - "size<N>" for the first "size: N"
//   - "length<N>" for the first "length: N cm"
//   - every standard reference found ("bs en 388", "en 374", ...)
//   - every manufacturer keyword found
//
// Repeated matches are kept, so a keyword that occurs twice appears twice.
// If nothing is recognised the result is "".
func extractKeyInfo(text string) string {
	text = strings.ToLower(text)

	var features []string

	// FindAllString returns nil when nothing matches, and appending a nil
	// slice is a no-op, so no explicit nil checks are needed here.
	features = append(features, materialRegex.FindAllString(text, -1)...)

	if match := sizeRegex.FindStringSubmatch(text); match != nil {
		features = append(features, "size"+match[1])
	}
	if match := lengthRegex.FindStringSubmatch(text); match != nil {
		features = append(features, "length"+match[1])
	}

	features = append(features, standardRegex.FindAllString(text, -1)...)
	features = append(features, manufacturerRegex.FindAllString(text, -1)...)

	return strings.Join(features, " ")
}
// main scores the two sample product descriptions with three measures and
// prints each score as a fraction and as a percentage: cosine similarity of
// the word counts, cosine similarity of the extracted key features, and the
// Jaccard index of the word sets.
func main() {
	str1 := "Neoprene Rubber Chemical Resistant Gloves, Size: 7; Length: 32 cm; Standard: BS EN 388; Resistant to wide range of Chemicals such as Ethylene Oxide. Make: Polyco, Model: Duraprene III or Equivalent"
	str2 := "GLOVES: LENGTH: 32 CM MATERIAL: NEOPRENE RUBBER FREE FLOW TEXT: RESISTANT TO WIDE RANGE OF GLOVES, TYPE: CHEMICAL RESISTANT, SIZE: 7, MATERIAL: NEOPRENE RUBBER, STANDARD: BS EN 388/BS EN 374, FFT: RESISTANT TO WIDE RANGE OF CHEMICALS SUCH AS ETHYLENE OXIDE IDEAL FOR LONG TERM HEAVY WORK IN CHEMICAL ENVIRONMENT MANUFACTURER REFERENCES: ORIGINAL_MNFR: POLYCO"

	// The normalised texts feed both the cosine and the Jaccard measure.
	processed1 := preprocessText(str1)
	processed2 := preprocessText(str2)

	// Method 1: cosine similarity over word counts.
	similarity := cosineSimilarity(
		buildWordVector(processed1),
		buildWordVector(processed2),
	)
	fmt.Printf("余弦相似度: %.4f (%.2f%%)\n", similarity, similarity*100)

	// Method 2: cosine similarity over the extracted key features only.
	keySimilarity := cosineSimilarity(
		buildWordVector(extractKeyInfo(str1)),
		buildWordVector(extractKeyInfo(str2)),
	)
	fmt.Printf("关键信息相似度: %.4f (%.2f%%)\n", keySimilarity, keySimilarity*100)

	// Method 3: Jaccard index over the sets of distinct words.
	jaccardSimilarity := jaccardIndex(processed1, processed2)
	fmt.Printf("Jaccard相似度: %.4f (%.2f%%)\n", jaccardSimilarity, jaccardSimilarity*100)
}
// jaccardIndex returns the Jaccard similarity of the two texts' word sets:
// the number of distinct whitespace-separated words they share divided by the
// number of distinct words in either. Two texts without any words score 0.
func jaccardIndex(text1, text2 string) float64 {
	// wordSet collects the distinct words of a text.
	wordSet := func(text string) map[string]struct{} {
		set := make(map[string]struct{})
		for _, w := range strings.Fields(text) {
			set[w] = struct{}{}
		}
		return set
	}

	left := wordSet(text1)
	right := wordSet(text2)

	shared := 0
	for w := range left {
		if _, ok := right[w]; ok {
			shared++
		}
	}

	// |A ∪ B| = |A| + |B| - |A ∩ B|
	total := len(left) + len(right) - shared
	if total == 0 {
		return 0.0
	}
	return float64(shared) / float64(total)
}
对于1000组数据的批量处理,可以这样实现:
// StringPair holds two strings that are to be compared with each other.
type StringPair struct {
	Str1, Str2 string
}
// batchProcess scores every pair and returns the scores in input order. Each
// score is a weighted blend: 0.7 times the cosine similarity of the
// preprocessed word counts plus 0.3 times the key-feature similarity. Adjust
// the two weights to change how much the extracted features matter.
func batchProcess(pairs []StringPair) []float64 {
	const (
		cosineWeight = 0.7
		keyWeight    = 0.3
	)

	scores := make([]float64, 0, len(pairs))
	for _, p := range pairs {
		cosineSim := cosineSimilarity(
			buildWordVector(preprocessText(p.Str1)),
			buildWordVector(preprocessText(p.Str2)),
		)
		keySim := keyInfoSimilarity(p.Str1, p.Str2)

		scores = append(scores, cosineWeight*cosineSim+keyWeight*keySim)
	}
	return scores
}
func keyInfoSimilarity(str1, str2 string) float64 {
key1 := extractKeyInfo(str1)
key2 := extractKeyInfo(str2)
vec1 := buildWordVector(key1)
vec2 := buildWordVector(key2)
return cosineSimilarity(vec1, vec2)
}
这个方案通过以下改进提高匹配准确度:
- 文本预处理标准化
- 使用余弦相似度考虑词频权重
- 提取关键信息(尺寸、材料、标准等)进行重点匹配
- 结合多种相似度算法
- 过滤停用词减少噪声
对于你的示例数据,这个方案应该能给出更合理的匹配度评分。

