Golang字符串相似度检测与告警实现方案

大家好，我想知道是否有办法做到这一点：给定一个输入字符串（例如 this is col）和一个包含以下内容的文本文件：

this is cool (a string)
not good (another string)
beautiful (another string)
etc.

我希望能找出文件中所有与输入字符串“相似”的字符串。在这个例子中，它应当返回第一个字符串 this is cool，类似于 Linux 中常见的 “x was not found, did you mean y?” 这样的提示。谢谢！

phonegap100 1楼

莱文斯坦距离？ https://www.google.com/search?q=Levenshtein+golang

更多关于Golang字符串相似度检测与告警实现方案的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html

nodeper 2楼

在Golang中实现字符串相似度检测与告警，可以使用编辑距离（Levenshtein距离）算法配合阈值判断。以下是具体实现方案：

package main

import (
	"fmt"
	"io/ioutil"
	"strings"
)

// levenshteinDistance returns the Levenshtein edit distance between s1 and
// s2: the minimum number of single-character insertions, deletions and
// substitutions needed to turn s1 into s2.
//
// The strings are compared rune by rune rather than byte by byte, so a
// multi-byte UTF-8 character (for example a CJK character) counts as one
// edit instead of one edit per differing byte. Bytes that are not valid
// UTF-8 are each converted to U+FFFD by the []rune conversion.
func levenshteinDistance(s1, s2 string) int {
	r1, r2 := []rune(s1), []rune(s2)
	m, n := len(r1), len(r2)

	// dp[i][j] is the distance between the first i runes of s1 and the first
	// j runes of s2.
	dp := make([][]int, m+1)
	for i := range dp {
		dp[i] = make([]int, n+1)
		dp[i][0] = i // i deletions reduce a prefix of s1 to the empty string
	}
	for j := range dp[0] {
		dp[0][j] = j // j insertions build a prefix of s2 from the empty string
	}

	for i := 1; i <= m; i++ {
		for j := 1; j <= n; j++ {
			cost := 0
			if r1[i-1] != r2[j-1] {
				cost = 1
			}
			dp[i][j] = min(
				dp[i-1][j]+1,      // deletion
				dp[i][j-1]+1,      // insertion
				dp[i-1][j-1]+cost, // substitution (free when the runes match)
			)
		}
	}
	return dp[m][n]
}

// min returns the smallest of a, b and c.
func min(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}

// similarity returns how alike s1 and s2 are as a percentage in [0, 100]:
// 100 means identical (two empty strings included) and 0 means every
// character position has to be edited.
//
// The score is 1 - distance/length of the longer string. The length is
// counted in runes so that it uses the same unit as levenshteinDistance;
// mixing rune distances with byte lengths would skew the score for
// non-ASCII text.
func similarity(s1, s2 string) float64 {
	distance := levenshteinDistance(s1, s2)
	maxLen := max(len([]rune(s1)), len([]rune(s2)))
	if maxLen == 0 {
		// Both strings are empty; avoid dividing by zero.
		return 100.0
	}
	return (1 - float64(distance)/float64(maxLen)) * 100
}

// max returns the larger of a and b.
func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// findSimilarStrings filters candidates down to those whose similarity to
// input is greater than or equal to threshold. Matches keep the order they
// had in candidates; the result is nil when nothing qualifies.
func findSimilarStrings(input string, candidates []string, threshold float64) []string {
	var matches []string
	for i := 0; i < len(candidates); i++ {
		current := candidates[i]
		score := similarity(input, current)
		if score >= threshold {
			matches = append(matches, current)
		}
	}
	return matches
}

// main is a small demo: it loads candidate strings from strings.txt (one per
// line), compares each against a fixed input and prints the candidates whose
// similarity reaches the threshold. It panics if the file cannot be read.
func main() {
	// Load the candidate strings from the file.
	content, err := ioutil.ReadFile("strings.txt")
	if err != nil {
		panic(err)
	}

	// Clean each line individually: TrimSpace removes the "\r" left behind by
	// Windows (CRLF) line endings, and blank lines are dropped so they are
	// not compared as bogus empty candidates.
	var candidates []string
	for _, line := range strings.Split(string(content), "\n") {
		if line = strings.TrimSpace(line); line != "" {
			candidates = append(candidates, line)
		}
	}

	input := "this is col"
	threshold := 70.0 // similarity threshold in percent; tune as needed

	similarStrings := findSimilarStrings(input, candidates, threshold)

	fmt.Printf("输入: %s\n", input)
	fmt.Printf("相似字符串(阈值%.1f%%):\n", threshold)
	for _, s := range similarStrings {
		// findSimilarStrings returns only the strings, so the score is
		// recomputed here for display.
		sim := similarity(input, s)
		fmt.Printf("- %s (相似度: %.1f%%)\n", s, sim)
	}
}

对于更高效的实现，可以使用优化的算法版本：

// levenshteinDistanceOptimized returns the Levenshtein edit distance between
// s1 and s2 while keeping only two rows of the dynamic-programming table, so
// its working memory is proportional to the shorter string instead of the
// product of both lengths.
//
// Like any text-level edit distance it works on runes, not bytes: a
// multi-byte UTF-8 character counts as a single edit. Bytes that are not
// valid UTF-8 are each converted to U+FFFD by the []rune conversion.
func levenshteinDistanceOptimized(s1, s2 string) int {
	long, short := []rune(s1), []rune(s2)
	// The distance is symmetric, so put the shorter string on the row axis
	// to keep the two rows as small as possible.
	if len(long) < len(short) {
		long, short = short, long
	}

	prev := make([]int, len(short)+1) // row i-1 of the table
	curr := make([]int, len(short)+1) // row i, being filled in
	for j := range prev {
		prev[j] = j // distance from the empty prefix is j insertions
	}

	for i := 1; i <= len(long); i++ {
		curr[0] = i // i deletions reach the empty prefix
		for j := 1; j <= len(short); j++ {
			// Start from the substitution cost (free when the runes match),
			// then see whether a deletion or an insertion is cheaper. The
			// minimum is computed inline so this function needs no helper.
			best := prev[j-1]
			if long[i-1] != short[j-1] {
				best++
			}
			if del := prev[j] + 1; del < best {
				best = del
			}
			if ins := curr[j-1] + 1; ins < best {
				best = ins
			}
			curr[j] = best
		}
		// The finished row becomes the previous row; the old previous row is
		// recycled as scratch space for the next iteration.
		prev, curr = curr, prev
	}
	return prev[len(short)]
}

如果需要处理大量字符串，可以并行计算：

import "sync"

// findSimilarStringsParallel returns every candidate whose similarity to
// input is at least threshold, scoring the candidates concurrently in
// fixed-size chunks (one goroutine per chunk).
//
// Matches are returned in the same order as they appear in candidates, so the
// output is deterministic and identical to a sequential scan. The result is
// nil when nothing matches.
func findSimilarStringsParallel(input string, candidates []string, threshold float64) []string {
	const chunkSize = 100

	// One result slot per chunk. Each goroutine writes only to its own slot,
	// so no mutex is needed, and merging the slots in index order afterwards
	// preserves candidate order regardless of which goroutine finishes first.
	numChunks := (len(candidates) + chunkSize - 1) / chunkSize
	perChunk := make([][]string, numChunks)

	var wg sync.WaitGroup
	for c := 0; c < numChunks; c++ {
		start := c * chunkSize
		end := start + chunkSize
		if end > len(candidates) {
			end = len(candidates)
		}

		wg.Add(1)
		// The slot index and chunk are passed as arguments so that each
		// goroutine gets its own copies.
		go func(slot int, chunk []string) {
			defer wg.Done()
			var local []string
			for _, candidate := range chunk {
				if similarity(input, candidate) >= threshold {
					local = append(local, candidate)
				}
			}
			perChunk[slot] = local
		}(c, candidates[start:end])
	}
	wg.Wait()

	var results []string
	for _, local := range perChunk {
		results = append(results, local...)
	}
	return results
}

对于实际应用场景，可以封装成完整的检测服务：

// SimilarityDetector bundles a list of candidate strings with the similarity
// threshold used when matching an input against them.
type SimilarityDetector struct {
	// candidates are the strings each input is compared against.
	candidates []string
	// threshold is the minimum similarity score a candidate must reach to be
	// reported by FindSimilar.
	threshold  float64
}

// NewSimilarityDetector builds a SimilarityDetector from the file at
// filePath, which is expected to hold one candidate string per line, and the
// given similarity threshold.
//
// Each line is stripped of surrounding whitespace (which also removes the
// "\r" of Windows line endings) and blank lines are skipped, so an empty file
// yields a detector with no candidates. If the file cannot be read, the
// returned error wraps the underlying error and names the path.
func NewSimilarityDetector(filePath string, threshold float64) (*SimilarityDetector, error) {
	content, err := ioutil.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("reading candidates from %q: %w", filePath, err)
	}

	lines := strings.Split(string(content), "\n")
	candidates := make([]string, 0, len(lines))
	for _, line := range lines {
		if line = strings.TrimSpace(line); line != "" {
			candidates = append(candidates, line)
		}
	}

	return &SimilarityDetector{
		candidates: candidates,
		threshold:  threshold,
	}, nil
}

// FindSimilar scores input against every candidate held by the detector and
// returns one SimilarityResult (candidate plus score) for each candidate
// whose score is greater than or equal to the detector's threshold. Results
// follow candidate order; the slice is nil when nothing qualifies.
func (sd *SimilarityDetector) FindSimilar(input string) []SimilarityResult {
	var matches []SimilarityResult
	for i := range sd.candidates {
		text := sd.candidates[i]
		score := similarity(input, text)
		if score >= sd.threshold {
			hit := SimilarityResult{String: text, Similarity: score}
			matches = append(matches, hit)
		}
	}
	return matches
}

// SimilarityResult pairs a matching candidate string with the similarity
// score computed for it.
type SimilarityResult struct {
	// String is the candidate that matched.
	String     string
	// Similarity is the score the candidate received against the input.
	Similarity float64
}

// main is a usage example: it builds a detector from strings.txt with a 70%
// threshold, looks up a fixed input and, when there are matches, prints a
// "did you mean" style list. It prints nothing when there are no matches and
// panics if the detector cannot be created.
func main() {
	detector, err := NewSimilarityDetector("strings.txt", 70.0)
	if err != nil {
		panic(err)
	}

	input := "this is col"
	matches := detector.FindSimilar(input)
	if len(matches) == 0 {
		return
	}

	fmt.Printf("未找到 '%s'，您是否想找：\n", input)
	for i := range matches {
		fmt.Printf("- %s (相似度: %.1f%%)\n", matches[i].String, matches[i].Similarity)
	}
}

这个实现方案提供了：

Levenshtein距离算法计算字符串相似度
可配置的相似度阈值
优化版本减少内存使用
并行处理支持大量字符串
完整的服务封装

可以根据实际需求调整阈值参数；通常 70-80% 的相似度阈值是一个不错的起点，能较好地平衡精确率和召回率。