Migreate code

willzhen · willzhen · commit ca79c84bda73 · 2023-04-24T17:21:56.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,4 @@
 
 # Go workspace file
 go.work
+go.sum
diff --git a/aho_automaton.go b/aho_automaton.go
@@ -0,0 +1,102 @@
+package text
+
+import "github.com/eapache/queue"
+
+// TrieNode is a single node of the Aho-Corasick automaton's trie.
+type TrieNode struct {
+	value rune // rune this node was created for (the root holds a placeholder)
+	next  map[rune]*TrieNode // child nodes keyed by the next rune
+	fail  *TrieNode // failure link assigned by BuildAcTrie; left nil on the root
+	emit  string // pattern that ends at this node, "" when none does
+}
+
+// newNode returns a trie node for ch with an empty child map, no failure link and no pattern.
+func newNode(ch rune) (node *TrieNode) {
+	return &TrieNode{value: ch, next: map[rune]*TrieNode{}}
+}
+
+// AcTrie is an Aho-Corasick automaton used to match a set of patterns against a text.
+type AcTrie struct {
+	root *TrieNode // entry node of the trie; set by BuildAcTrie
+}
+
+// Search scans s and returns every pattern occurrence: list[k] is the matched pattern and index[k] is the rune offset just past its last rune.
+func (ac *AcTrie) Search(s string) (list []string, index []int) {
+	node := ac.root
+	for i, c := range []rune(s) { // i counts runes, not bytes
+		matched := true
+		for { // follow failure links until some node has a transition on c
+			_, ok := node.next[c]
+			if ok {
+				break
+			}
+			if node.fail == nil { // no failure link left to follow: restart from the root
+				matched = false
+				node = ac.root
+				break
+			}
+			node = node.fail
+		}
+		if !matched {
+			continue
+		}
+		node = node.next[c]
+		p := node
+		for p != nil { // report the pattern ending here and any found along the failure chain
+			if p.emit != "" {
+				list = append(list, p.emit)
+				index = append(index, i+1)
+			}
+			p = p.fail
+		}
+	}
+	return list, index
+}
+
+// BuildAcTrie builds an Aho-Corasick automaton from words: it inserts each word into a trie, then assigns every node's failure link.
+func BuildAcTrie(words []string) (acTrie *AcTrie) {
+	acTrie = new(AcTrie)
+	acTrie.root = newNode(rune('r')) // the root's rune is only a placeholder
+	for _, word := range words {
+		node := acTrie.root
+		for _, ch := range []rune(word) {
+			if _, ok := node.next[ch]; !ok {
+				node.next[ch] = newNode(ch)
+			}
+			node = node.next[ch]
+		}
+		node.emit = word // mark the node where this word ends
+	}
+	queue := queue.New() // NOTE: this local shadows the imported queue package from here on
+	queue.Add([]*TrieNode{acTrie.root, nil}) // each queue entry is a {node, parent} pair
+	for queue.Length() > 0 {
+		nodeParent := queue.Remove().([]*TrieNode)
+		curr, parent := nodeParent[0], nodeParent[1]
+		for _, sub := range curr.next {
+			queue.Add([]*TrieNode{sub, curr})
+		}
+		if parent == nil { // the root keeps a nil failure link
+			continue
+		}
+		if parent == acTrie.root { // direct children of the root fall back to the root
+			curr.fail = acTrie.root
+		} else {
+			fail := parent.fail
+			for fail != nil { // walk the parent's failure chain for a node with a transition on curr.value
+				_, ok := fail.next[curr.value]
+				if ok {
+					break
+				}
+				fail = fail.fail
+			}
+			if fail != nil {
+				curr.fail = fail.next[curr.value]
+			} else {
+				curr.fail = acTrie.root
+			}
+		}
+	}
+	return acTrie
+}
diff --git a/edit_distance.go b/edit_distance.go
@@ -0,0 +1,63 @@
+package text
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b > a {
+		return b
+	}
+	return a
+}
+
+// Levenshtein returns the edit distance between word1 and word2: the minimum
+// number of single-rune insertions, deletions and substitutions that turn one
+// into the other. If either input is empty, the result is the other's length.
+func Levenshtein(word1, word2 []rune) int {
+	if len(word1) == 0 || len(word2) == 0 {
+		return max(len(word1), len(word2))
+	}
+	// row[j] is the distance between the processed prefix of word1 and
+	// word2[:j]; it starts as the distances from the empty prefix.
+	row := make([]int, len(word2)+1)
+	for j := range row {
+		row[j] = j
+	}
+	cost := 0
+	for i, r1 := range word1 {
+		diag := i // previous row's value at column 0
+		row[0] = i + 1
+		for j, r2 := range word2 {
+			if r1 == r2 {
+				cost = diag
+			} else {
+				cost = 1 + min(diag, min(row[j], row[j+1]))
+			}
+			// Remember the old cell as the next diagonal before overwriting it.
+			diag, row[j+1] = row[j+1], cost
+		}
+	}
+	return cost
+}
+
+// TextSim returns the similarity of str1 and str2, computed as one minus their
+// Levenshtein distance divided by the rune length of the longer string.
+// Two empty strings have similarity 1.
+func TextSim(str1, str2 string) float32 {
+	// Compare runes rather than bytes so a multi-byte character counts once.
+	r1, r2 := []rune(str1), []rune(str2)
+	longest := max(len(r1), len(r2))
+	if longest == 0 {
+		return 1.0
+	}
+	return 1.0 - float32(Levenshtein(r1, r2))/float32(longest)
+}
diff --git a/go.mod b/go.mod
@@ -0,0 +1,5 @@
+module github.com/memory-overflow/go-text-algorithm
+
+go 1.16
+
+require github.com/eapache/queue v1.1.0
diff --git a/readme.md b/readme.md
@@ -0,0 +1,91 @@
+- [text 模块](#text-模块)
+  - [SliceSame](#slicesame)
+  - [Aho-Corasick automaton](#aho-corasick-automaton)
+  - [计算文本编辑距离](#计算文本编辑距离)
+  - [计算文本相似度](#计算文本相似度)
+
+# text 模块
+golang 里面的 strings 库已经有了很多丰富的字符串处理功能，但是都是偏向于基础处理。
+
+text模块提供了一些字符串处理相关的算法能力。
+
+## SliceSame
+- SliceSame——判断两个字符串数字是否相同。
+
+example: [TestSliceSmae](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L29)
+```go
+import (
+  "testing"
+
+  "github.com/memory-overflow/go-common-library/text"
+)
+
+func TestSliceSmae(t *testing.T) {
+	a := []string{"3", "2", "1"}
+	same := text.SliceSame(a, a)
+	t.Logf("is same: %v", same)
+  // test can not change order of a
+	t.Log(a)
+}
+```
+
+## Aho-Corasick automaton
+ac 自动机是一种多模式串的匹配算法。
+
+一个常见的例子就是给出 n 个单词，再给出一段包含 m 个字符的文章，让你找出有多少个单词在文章里出现过。
+
+比较容易想到的做法是，调用 n 次 `strings.Contains(s, xxx)`。假设 n 个单词平局长度为 k, 这样处理的算法时间复杂度为 O(n * k * m)。而使用 ac 自动机可以加速上述过程，整体算法时间复杂度只需要 O(n*k + m)。
+
+example: [TestActrie](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L9)
+```go
+import (
+  "testing"
+
+  "github.com/memory-overflow/go-common-library/text"
+)
+
+func TestActrie(t *testing.T) {
+  // 在字符串 "哈哈哈哈23434dfgdd" 中找出所有 "哈哈哈", "234"，"dfg" 出现的位置。
+  // 使用模式串构建一个 ac 自动机
+  ac := text.BuildAcTrie([]string{"哈哈哈", "234", "dfg"})
+  // 匹配母串
+  list, index := ac.Search("哈哈哈哈23434dfgdd")
+  for i, l := range list {
+    t.Log(l, index[i])
+  }
+}
+```
+
+## 计算文本编辑距离
+编辑距离(Edit Distance)：是一个度量两个字符序列之间差异的字符串度量标准，两个单词之间的编辑距离是将一个单词转换为另一个单词所需的单字符编辑（插入、删除或替换）的最小数量。一般来说，编辑距离越小，两个串的相似度越大。
+
+example: [TestLevenshtein](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L24)
+```go
+import (
+  "testing"
+
+  "github.com/memory-overflow/go-common-library/text"
+)
+
+func TestLevenshtein(t *testing.T) {
+	dist := text.Levenshtein([]rune("编辑距离测试"), []rune("测试一下距离"))
+	t.Logf("dist: %d", dist)
+}
+```
+
+## 计算文本相似度
+通过编辑距离，计算两个文本的相似度。
+
+example: [TestTextSim](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L17)
+```go
+import (
+  "testing"
+
+  "github.com/memory-overflow/go-common-library/text"
+)
+
+func TestTextSim(t *testing.T) {
+	sim := text.TextSim("编辑距离测试", "测试一下距离")
+  t.Logf("sim: %f", sim)
+}
+```
diff --git a/slice_same.go b/slice_same.go
@@ -0,0 +1,21 @@
+package text
+
+import "sort"
+
+// SliceSame reports whether a and b contain the same strings with the same
+// multiplicity, regardless of order. Neither input slice is modified.
+func SliceSame(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	// copy transfers only min(len(dst), len(src)) elements, so the destinations
+	// must be sized up front; copying into empty slices would compare nothing.
+	sortedA, sortedB := make([]string, len(a)), make([]string, len(b))
+	copy(sortedA, a)
+	copy(sortedB, b)
+	// Sort the copies so the callers' slices keep their original order.
+	sort.Strings(sortedA)
+	sort.Strings(sortedB)
+	for i := range sortedA {
+		if sortedA[i] != sortedB[i] {
+			return false
+		}
+	}
+	return true
+}
diff --git a/text_test.go b/text_test.go
@@ -0,0 +1,35 @@
+package text_test
+
+import (
+	"testing"
+
+	"github.com/memory-overflow/go-text-algorithm"
+)
+
+// TestActrie builds an automaton from three patterns and logs each match with its end offset.
+func TestActrie(t *testing.T) {
+	patterns := []string{"哈哈哈", "234", "dfg"}
+	matches, ends := text.BuildAcTrie(patterns).Search("哈哈哈哈23434dfgdd")
+	for k := range matches {
+		t.Log(matches[k], ends[k])
+	}
+}
+
+// TestTextSim expects TextSim to return exactly 0 for the sample pair.
+func TestTextSim(t *testing.T) {
+	if got := text.TextSim("编辑距离测试", "测试一下距离"); got != 0 {
+		t.Error("Failed")
+	}
+}
+
+// TestLevenshtein logs the edit distance of the sample pair; it asserts nothing.
+func TestLevenshtein(t *testing.T) {
+	src, dst := []rune("编辑距离测试"), []rune("测试一下距离")
+	t.Logf("dist: %d", text.Levenshtein(src, dst))
+}
+
+// TestSliceSmae logs the result of comparing a slice with itself; it asserts nothing.
+func TestSliceSmae(t *testing.T) {
+	a := []string{"3", "2", "1"}
+	t.Logf("is same: %v", text.SliceSame(a, a))
+	// a is logged afterwards so its order can be inspected: SliceSame must not reorder it.
+	t.Log(a)
+}

Original file line number	Diff line number	Diff line change
`@@ -19,3 +19,4 @@`
`19`	`19`
`20`	`20`	`# Go workspace file`
`21`	`21`	`go.work`
	`22`	`+go.sum`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +module github.com/memory-overflow/go-text-algorithm
++
 +go 1.16
++
 +require github.com/eapache/queue v1.1.0