gohttp/vendor/github.com/signalsciences/ac/acascii/ac.go

481 lines
9.9 KiB
Go
Raw Permalink Normal View History

2023-12-21 18:36:51 +08:00
// Package acascii provides an implementation of the Aho-Corasick string
// matching algorithm for dictionaries made of 7-bit ASCII bytes. Throughout
// this code []byte is referred to as a blice.
//
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
//
// Copyright (c) 2013 CloudFlare, Inc.
//
// Originally from https://github.com/cloudflare/ahocorasick
package acascii
import (
"container/list"
"errors"
)
// maxchar is the number of distinct byte values a trie node can branch on
// (0..127). It sizes the child and fails arrays of every node, and any byte
// value >= maxchar is considered non-ASCII by this package.
const maxchar = 128
// ErrNotASCII is returned by the trie builders when a dictionary entry
// contains a byte whose value is >= maxchar.
var ErrNotASCII = errors.New("non-ASCII input")
// node is one state of the matching automaton: a trie node reached by a
// specific byte path from the root, plus the fallback links filled in by
// buildTrie and buildTrieString.
type node struct {
	root bool // true if this is the root

	output bool // True means this node represents a blice that should
	// be output when matching

	b string // The byte path from the root to this node

	index int // Length in bytes of the dictionary entry that ends at
	// this node (the builders store len(entry) here). Despite the name
	// it is not a position in the dictionary; FindAll and FindAllString
	// use it to slice the hit out of the input. Only meaningful if
	// output is true.

	counter int // Set to the value of the Matcher.counter when a
	// match is output to prevent duplicate output

	// The use of fixed size arrays is space-inefficient but fast for
	// lookups.

	child [maxchar]*node // A non-nil entry in this array means that the
	// index represents a byte value which can be
	// appended to the current node. Blices in the
	// trie are built up byte by byte through these
	// child node pointers.

	fails [maxchar]*node // Precomputed fallback per byte value: the first
	// node on the fail chain (starting at this node itself) that has a
	// child for that byte, or the root if none does.

	suffix *node // As assigned by the builders: the node for the longest
	// strict suffix of b that is itself a dictionary entry (output is
	// true), or the root when there is none. Never assigned on the root
	// itself, where it stays nil.

	fail *node // As assigned by the builders: the node for the longest
	// strict suffix of b that exists anywhere in the trie, or the root
	// when there is none. Called fail because it is used to fall back in
	// the trie when a match fails. Never assigned on the root itself,
	// where it stays nil.
}
// Matcher holds the compiled automaton for a dictionary of blices and is
// what the Find*/Match* methods search with.
//
// FindAll and FindAllString write to counter and to the counter field of
// matched nodes without any synchronization, so concurrent calls to those
// two methods on the same Matcher would race.
type Matcher struct {
	counter int // Incremented once per FindAll/FindAllString call, and
	// compared against node.counter to prevent output of multiple
	// matches of the same string within one call

	trie []node // preallocated block of memory containing all the
	// nodes; trimmed to the used length once building finishes

	extent int // offset into trie that is currently free, i.e. the
	// number of nodes handed out by getFreeNode so far

	root *node // Points to trie[0]
}
// findBlice walks the trie from trie[0] along the bytes of b and returns the
// node the walk ends on. An empty b yields trie[0] itself; if any byte has no
// matching child the result is nil.
func (m *Matcher) findBlice(b string) *node {
	cur := &m.trie[0]
	for i := 0; i < len(b); i++ {
		if cur == nil {
			// An earlier byte had no child, so b is not in the trie.
			return nil
		}
		cur = cur.child[int(b[i])]
	}
	return cur
}
// getFreeNode hands out the next unused node from the preallocated m.trie
// pool and advances m.extent past it. The very first node handed out is
// recorded as m.root and flagged as the root.
func (m *Matcher) getFreeNode() *node {
	slot := m.extent
	m.extent = slot + 1
	if slot == 0 {
		m.root = &m.trie[0]
		m.root.root = true
	}
	return &m.trie[slot]
}
// buildTrie builds the automaton from a dictionary of blices.
//
// Each entry is converted to a string exactly once and the work is handed to
// buildTrieString, so both builders share a single implementation. The
// conversion copies the bytes, so the trie does not alias the caller's
// slices.
//
// It returns whatever buildTrieString returns, which is ErrNotASCII when an
// entry contains a byte >= maxchar.
func (m *Matcher) buildTrie(dictionary [][]byte) error {
	words := make([]string, len(dictionary))
	for i, blice := range dictionary {
		words[i] = string(blice)
	}
	return m.buildTrieString(words)
}
// buildTrieString builds the automaton from a dictionary of strings.
//
// It runs in three phases: insert every entry into the trie, walk the trie
// breadth-first to assign each node's fail and suffix links, then precompute
// the per-byte fails table of every node. It returns ErrNotASCII as soon as
// it meets a byte >= maxchar, leaving the Matcher partially built.
func (m *Matcher) buildTrieString(dictionary []string) error {
	// Worst case is one node per dictionary byte (no shared prefixes), plus
	// one slot for the root.
	capacity := 1
	for i := range dictionary {
		capacity += len(dictionary[i])
	}
	m.trie = make([]node, capacity)

	// The first node handed out becomes the root. The return value is not
	// needed because getFreeNode records it in m.root.
	m.getFreeNode()

	// Phase 1: insert each entry, creating child nodes on demand.
	for _, word := range dictionary {
		cur := m.root
		for pos := 0; pos < len(word); pos++ {
			ch := int(word[pos])
			if ch >= maxchar {
				return ErrNotASCII
			}

			next := cur.child[ch]
			if next == nil {
				next = m.getFreeNode()
				cur.child[ch] = next
				next.b = word[:pos+1]
				next.suffix = m.root

				// Nodes directly below the root have no strict suffix
				// other than the empty one, so they fail to the root.
				if pos == 0 {
					next.fail = m.root
				}
			}
			cur = next
		}

		// cur is now the node that spells the whole entry.
		cur.output = true
		cur.index = len(word)
	}

	// Phase 2: breadth-first walk assigning fail and suffix links.
	var queue list.List
	queue.PushBack(m.root)
	for queue.Len() > 0 {
		parent := queue.Remove(queue.Front()).(*node)

		for _, kid := range parent.child[:] {
			if kid == nil {
				continue
			}
			queue.PushBack(kid)

			// fail: the longest strict suffix of kid.b that is in the trie.
			for start := 1; start < len(kid.b); start++ {
				kid.fail = m.findBlice(kid.b[start:])
				if kid.fail != nil {
					break
				}
			}
			if kid.fail == nil {
				kid.fail = m.root
			}

			// suffix: the longest strict suffix of kid.b that is itself a
			// dictionary entry. It stays at the root when there is none.
			for start := 1; start < len(kid.b); start++ {
				if hit := m.findBlice(kid.b[start:]); hit != nil && hit.output {
					kid.suffix = hit
					break
				}
			}
		}
	}

	// Phase 3: for every node and byte, record where to resume when that
	// byte has no child here: the first node on the fail chain that has
	// such a child, or the root.
	for i := 0; i < m.extent; i++ {
		state := &m.trie[i]
		for ch := 0; ch < maxchar; ch++ {
			target := state
			for !target.root && target.child[ch] == nil {
				target = target.fail
			}
			state.fails[ch] = target
		}
	}

	// Drop the slots that shared prefixes left unused.
	m.trie = m.trie[:m.extent]
	return nil
}
// Compile builds a Matcher for a dictionary given as a list of []byte. If
// building the trie fails it returns a nil Matcher together with the error.
func Compile(dictionary [][]byte) (*Matcher, error) {
	matcher := &Matcher{}
	err := matcher.buildTrie(dictionary)
	if err != nil {
		return nil, err
	}
	return matcher, nil
}
// MustCompile is like Compile but panics with the error instead of
// returning it.
func MustCompile(dictionary [][]byte) *Matcher {
	matcher, err := Compile(dictionary)
	if err == nil {
		return matcher
	}
	panic(err)
}
// CompileString builds a Matcher for a dictionary given as a list of
// strings (a convenience over Compile that makes initialization easy). If
// building the trie fails it returns a nil Matcher together with the error.
func CompileString(dictionary []string) (*Matcher, error) {
	matcher := &Matcher{}
	err := matcher.buildTrieString(dictionary)
	if err != nil {
		return nil, err
	}
	return matcher, nil
}
// MustCompileString is like CompileString but panics with the error instead
// of returning it.
func MustCompileString(dictionary []string) *Matcher {
	matcher, err := CompileString(dictionary)
	if err == nil {
		return matcher
	}
	panic(err)
}
// FindAll scans in and returns the dictionary entries that occur in it.
// Each entry is reported at most once per call, in the order its first
// reported occurrence ends. The returned blices are sub-slices of in, not
// copies. The result is nil when nothing matches.
//
// Input bytes >= maxchar are looked up as byte 0.
// NOTE(review): this means a dictionary that contains NUL bytes can match
// non-ASCII input bytes — confirm whether that is intended.
//
// FindAll increments m.counter and stamps matched nodes with it, so it
// mutates the Matcher.
func (m *Matcher) FindAll(in []byte) [][]byte {
	m.counter++
	stamp := m.counter

	var hits [][]byte
	state := m.root
	for pos := 0; pos < len(in); pos++ {
		ch := int(in[pos])
		if ch >= maxchar {
			ch = 0
		}

		// No edge for this byte here: jump to the precomputed fallback.
		if state.child[ch] == nil && !state.root {
			state = state.fails[ch]
		}

		next := state.child[ch]
		if next == nil {
			continue
		}
		state = next

		// Every match found at this position ends just after pos.
		end := pos + 1
		if next.output && next.counter != stamp {
			hits = append(hits, in[end-next.index:end])
			next.counter = stamp
		}

		// Shorter dictionary entries that are suffixes of the current path
		// end here as well.
		for link := next.suffix; !link.root; link = link.suffix {
			if link.counter == stamp {
				// Already reported during this call, and so was everything
				// further up its suffix chain.
				break
			}
			hits = append(hits, in[end-link.index:end])
			link.counter = stamp
		}
	}
	return hits
}
// FindAllString scans in and returns, as strings, the dictionary entries
// that occur in it. Each entry is reported at most once per call, in the
// order its first reported occurrence ends. The result is nil when nothing
// matches.
//
// Input bytes >= maxchar are looked up as byte 0.
// NOTE(review): this means a dictionary that contains NUL bytes can match
// non-ASCII input bytes — confirm whether that is intended.
//
// FindAllString increments m.counter and stamps matched nodes with it, so it
// mutates the Matcher.
func (m *Matcher) FindAllString(in string) []string {
	m.counter++

	var hits []string
	state := m.root
	for pos, raw := range []byte(in) {
		ch := int(raw)
		if ch >= maxchar {
			ch = 0
		}

		// No edge for this byte here: jump to the precomputed fallback.
		if !state.root && state.child[ch] == nil {
			state = state.fails[ch]
		}

		matched := state.child[ch]
		if matched == nil {
			continue
		}
		state = matched

		// Every match found at this position ends just after pos.
		end := pos + 1
		if matched.output && matched.counter != m.counter {
			hits = append(hits, in[end-matched.index:end])
			matched.counter = m.counter
		}

		// Walk the chain of shorter dictionary entries ending here too.
		for matched = matched.suffix; !matched.root; matched = matched.suffix {
			if matched.counter == m.counter {
				// Already reported during this call, and so was everything
				// further up its suffix chain.
				break
			}
			hits = append(hits, in[end-matched.index:end])
			matched.counter = m.counter
		}
	}
	return hits
}
// Match reports whether in contains at least one dictionary entry. It stops
// at the first hit and does not modify the Matcher.
//
// Input bytes >= maxchar are looked up as byte 0, the same as in
// MatchString. The bound must be >=: the child and fails arrays have exactly
// maxchar elements, so an index equal to maxchar (input byte 0x80) would
// panic with an out-of-range error.
// NOTE(review): this lookup means a dictionary that contains NUL bytes can
// match non-ASCII input bytes — confirm whether that is intended.
func (m *Matcher) Match(in []byte) bool {
	n := m.root
	for _, b := range in {
		c := int(b)
		if c >= maxchar {
			c = 0
		}

		// No edge for this byte here: jump to the precomputed fallback.
		if !n.root && n.child[c] == nil {
			n = n.fails[c]
		}

		next := n.child[c]
		if next == nil {
			continue
		}
		n = next

		// Either this node ends a dictionary entry, or a shorter entry
		// that is a suffix of the current path ends here.
		if n.output || !n.suffix.root {
			return true
		}
	}
	return false
}
// MatchString reports whether in contains at least one dictionary entry. It
// stops at the first hit and does not modify the Matcher.
//
// Input bytes >= maxchar are looked up as byte 0.
// NOTE(review): this means a dictionary that contains NUL bytes can match
// non-ASCII input bytes — confirm whether that is intended.
func (m *Matcher) MatchString(in string) bool {
	state := m.root
	for _, raw := range []byte(in) {
		ch := int(raw)
		if ch >= maxchar {
			ch = 0
		}

		// No edge for this byte here: jump to the precomputed fallback.
		if state.child[ch] == nil && !state.root {
			state = state.fails[ch]
		}

		if next := state.child[ch]; next != nil {
			state = next
			// Either this node ends a dictionary entry, or a shorter entry
			// that is a suffix of the current path ends here.
			if next.output || !next.suffix.root {
				return true
			}
		}
	}
	return false
}