Golang字符串相似度检测与告警实现方案
Golang字符串相似度检测与告警实现方案
大家好,
我想知道是否有任何方法,给定一个输入字符串如 this is col 和一个包含以下内容的文本文件:
this is cool (a string)
not good (another string)
beautiful (another string)
etc.
能够找出所有与输入字符串“相似”的字符串,因此在这个例子中,它将返回第一个字符串 this is cool,类似于在 Linux 中你会看到 x was not found did you mean y? 这样的提示。
谢谢
2 回复
更多关于Golang字符串相似度检测与告警实现方案的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html
在Golang中实现字符串相似度检测与告警,可以使用编辑距离(Levenshtein距离)算法配合阈值判断。以下是具体实现方案:
package main
import (
"fmt"
"io/ioutil"
"strings"
)
// levenshteinDistance returns the Levenshtein edit distance between s1 and
// s2: the minimum number of single-character insertions, deletions and
// substitutions needed to turn one string into the other.
//
// The strings are compared rune by rune, not byte by byte, so a multi-byte
// UTF-8 character (an accented letter, a CJK character, ...) counts as one
// edit rather than several. Invalid UTF-8 bytes are each decoded as U+FFFD
// by the []rune conversion.
func levenshteinDistance(s1, s2 string) int {
	return runeDistance([]rune(s1), []rune(s2))
}

// runeDistance computes the Levenshtein distance between two rune slices
// with a full (len(a)+1) x (len(b)+1) dynamic-programming table.
func runeDistance(a, b []rune) int {
	m, n := len(a), len(b)

	// dp[i][j] is the distance between the first i runes of a and the first
	// j runes of b.
	dp := make([][]int, m+1)
	for i := range dp {
		dp[i] = make([]int, n+1)
		dp[i][0] = i // turning i runes into nothing takes i deletions
	}
	for j := range dp[0] {
		dp[0][j] = j // turning nothing into j runes takes j insertions
	}

	for i := 1; i <= m; i++ {
		for j := 1; j <= n; j++ {
			cost := 0
			if a[i-1] != b[j-1] {
				cost = 1
			}
			dp[i][j] = min(
				dp[i-1][j]+1,      // deletion
				dp[i][j-1]+1,      // insertion
				dp[i-1][j-1]+cost, // substitution (free when the runes match)
			)
		}
	}
	return dp[m][n]
}

// min returns the smallest of a, b and c.
func min(a, b, c int) int {
	if a < b && a < c {
		return a
	}
	if b < c {
		return b
	}
	return c
}

// similarity returns how alike s1 and s2 are as a percentage in [0, 100]:
// 100 means identical (two empty strings included), 0 means every character
// position has to be edited.
//
// The score is 1 - distance/maxLen, where both the edit distance and maxLen
// are measured in runes so that the ratio stays meaningful for non-ASCII
// text.
func similarity(s1, s2 string) float64 {
	r1, r2 := []rune(s1), []rune(s2)
	maxLen := max(len(r1), len(r2))
	if maxLen == 0 {
		// Both strings are empty; avoid dividing by zero.
		return 100.0
	}
	distance := runeDistance(r1, r2)
	return (1 - float64(distance)/float64(maxLen)) * 100
}

// max returns the larger of a and b.
func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}
// findSimilarStrings returns, in their original order, the candidates whose
// similarity to input is at least threshold (a percentage, as produced by
// similarity). The result is nil when nothing qualifies.
func findSimilarStrings(input string, candidates []string, threshold float64) []string {
	var matches []string
	for idx := 0; idx < len(candidates); idx++ {
		current := candidates[idx]
		score := similarity(input, current)
		if score >= threshold {
			matches = append(matches, current)
		}
	}
	return matches
}
// main reads candidate strings from strings.txt (one per line), then prints
// every candidate whose similarity to a sample input reaches the threshold.
// It panics if the file cannot be read.
func main() {
	// Read the candidate strings from the file. ioutil.ReadFile is kept
	// because io/ioutil is what this file imports; since Go 1.16 it simply
	// calls os.ReadFile.
	content, err := ioutil.ReadFile("strings.txt")
	if err != nil {
		panic(err)
	}

	// Split into lines. Files with Windows line endings leave a trailing
	// "\r" on each line, which would skew the similarity score and garble
	// the output, so strip it. Empty lines are not useful candidates.
	var candidates []string
	for _, line := range strings.Split(strings.TrimSpace(string(content)), "\n") {
		line = strings.TrimSuffix(line, "\r")
		if line == "" {
			continue
		}
		candidates = append(candidates, line)
	}

	input := "this is col"
	threshold := 70.0 // similarity threshold in percent; tune as needed

	similarStrings := findSimilarStrings(input, candidates, threshold)

	fmt.Printf("输入: %s\n", input)
	fmt.Printf("相似字符串(阈值%.1f%%):\n", threshold)
	for _, s := range similarStrings {
		// findSimilarStrings returns only the strings, so the score is
		// recomputed here for display.
		sim := similarity(input, s)
		fmt.Printf("- %s (相似度: %.1f%%)\n", s, sim)
	}
}
如果需要降低内存占用,可以使用只保留两行DP数组的优化版本:时间复杂度不变,空间复杂度从 O(m×n) 降到 O(min(m, n)):
// levenshteinDistanceOptimized returns the Levenshtein edit distance between
// s1 and s2 while keeping only two rows of the dynamic-programming table in
// memory, each as long as the shorter input plus one.
//
// The strings are compared rune by rune, not byte by byte, so a multi-byte
// UTF-8 character counts as a single edit. Invalid UTF-8 bytes are each
// decoded as U+FFFD by the []rune conversion.
func levenshteinDistanceOptimized(s1, s2 string) int {
	long, short := []rune(s1), []rune(s2)
	// The distance is symmetric, so swap to make the rows as short as
	// possible.
	if len(long) < len(short) {
		long, short = short, long
	}

	// prev is the row for the previous rune of long, curr the row being
	// filled in. Row 0 is the cost of building each prefix of short from
	// nothing.
	prev := make([]int, len(short)+1)
	curr := make([]int, len(short)+1)
	for j := range prev {
		prev[j] = j
	}

	for i := 1; i <= len(long); i++ {
		curr[0] = i // deleting the first i runes of long
		for j := 1; j <= len(short); j++ {
			cost := 0
			if long[i-1] != short[j-1] {
				cost = 1
			}
			// Smallest of substitution, deletion and insertion, computed
			// inline so this function needs no helper.
			best := prev[j-1] + cost
			if del := prev[j] + 1; del < best {
				best = del
			}
			if ins := curr[j-1] + 1; ins < best {
				best = ins
			}
			curr[j] = best
		}
		// The row just filled becomes the previous row; the old previous
		// row is reused as scratch space.
		prev, curr = curr, prev
	}
	return prev[len(short)]
}
如果需要处理大量字符串,可以并行计算:
import "sync"
// findSimilarStringsParallel returns the candidates whose similarity to
// input is at least threshold, scoring the candidates concurrently in
// chunks of 100 (one goroutine per chunk).
//
// The result keeps the candidates in their original order, exactly like the
// sequential findSimilarStrings: each goroutine writes its matches into a
// slot reserved for its chunk, and the slots are concatenated in chunk order
// once all goroutines have finished. The result is nil when nothing
// qualifies.
func findSimilarStringsParallel(input string, candidates []string, threshold float64) []string {
	const chunkSize = 100

	chunkCount := (len(candidates) + chunkSize - 1) / chunkSize
	// One slot per chunk. Every goroutine writes only its own element, so
	// no mutex is needed.
	perChunk := make([][]string, chunkCount)

	var wg sync.WaitGroup
	for c := 0; c < chunkCount; c++ {
		start := c * chunkSize
		end := start + chunkSize
		if end > len(candidates) {
			end = len(candidates)
		}

		wg.Add(1)
		go func(slot int, chunk []string) {
			defer wg.Done()
			var localResults []string
			for _, candidate := range chunk {
				if similarity(input, candidate) >= threshold {
					localResults = append(localResults, candidate)
				}
			}
			perChunk[slot] = localResults
		}(c, candidates[start:end])
	}
	wg.Wait()

	var results []string
	for _, part := range perChunk {
		results = append(results, part...)
	}
	return results
}
对于实际应用场景,可以封装成完整的检测服务:
// SimilarityDetector bundles a list of candidate strings with the minimum
// similarity score a candidate must reach to be reported by FindSimilar.
type SimilarityDetector struct {
// candidates are the strings every input is compared against.
candidates []string
// threshold is the lowest similarity score (compared with >=) that still
// counts as a match.
threshold float64
}
// NewSimilarityDetector builds a SimilarityDetector whose candidates are the
// non-empty lines of the file at filePath and whose match threshold is
// threshold.
//
// Both "\n" and "\r\n" line endings are accepted. If the file cannot be read
// the read error is returned unchanged together with a nil detector.
func NewSimilarityDetector(filePath string, threshold float64) (*SimilarityDetector, error) {
	content, err := ioutil.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	// Splitting on "\n" alone would leave a trailing "\r" on every line of a
	// file with Windows line endings, which skews the similarity score, so
	// strip it. Empty lines (including the single empty string produced by
	// an empty file) are not useful candidates and are skipped.
	var candidates []string
	for _, line := range strings.Split(strings.TrimSpace(string(content)), "\n") {
		line = strings.TrimSuffix(line, "\r")
		if line == "" {
			continue
		}
		candidates = append(candidates, line)
	}

	return &SimilarityDetector{
		candidates: candidates,
		threshold:  threshold,
	}, nil
}
// FindSimilar scores input against every candidate held by the detector and
// returns, in candidate order, those whose similarity is at least the
// detector's threshold, each paired with its score. The result is nil when
// no candidate qualifies.
func (sd *SimilarityDetector) FindSimilar(input string) []SimilarityResult {
	var matches []SimilarityResult
	for idx := 0; idx < len(sd.candidates); idx++ {
		current := sd.candidates[idx]
		score := similarity(input, current)
		if score >= sd.threshold {
			match := SimilarityResult{String: current, Similarity: score}
			matches = append(matches, match)
		}
	}
	return matches
}
// SimilarityResult is one match reported by SimilarityDetector.FindSimilar.
type SimilarityResult struct {
// String is the candidate that matched.
String string
// Similarity is the score returned by similarity for the input and String.
Similarity float64
}
// Usage example: main builds a detector from strings.txt with a threshold of
// 70 and, when at least one candidate matches the sample input, prints a
// "did you mean" style list of suggestions. It prints nothing when there is
// no match and panics if the detector cannot be created.
func main() {
	const query = "this is col"

	detector, err := NewSimilarityDetector("strings.txt", 70.0)
	if err != nil {
		panic(err)
	}

	suggestions := detector.FindSimilar(query)
	if len(suggestions) == 0 {
		return
	}

	fmt.Printf("未找到 '%s',您是否想找:\n", query)
	for i := range suggestions {
		fmt.Printf("- %s (相似度: %.1f%%)\n", suggestions[i].String, suggestions[i].Similarity)
	}
}
这个实现方案提供了:
- Levenshtein距离算法计算字符串相似度
- 可配置的相似度阈值
- 优化版本减少内存使用
- 并行处理支持大量字符串
- 完整的服务封装
可以根据实际需求调整阈值参数,通常70-80%的相似度阈值能有效平衡准确性和召回率。

