481 lines
9.9 KiB
Go
481 lines
9.9 KiB
Go
// Package ac provides an implementation of the Aho-Corasick string matching
|
|
// algorithm. Throughout this code []byte is referred to
|
|
// as a blice.
|
|
//
|
|
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
|
|
//
|
|
// Copyright (c) 2013 CloudFlare, Inc.
|
|
//
|
|
// Originally from https://github.com/cloudflare/ahocorasick
|
|
package acascii
|
|
|
|
import (
|
|
"container/list"
|
|
"errors"
|
|
)
|
|
|
|
const maxchar = 128
|
|
|
|
// ErrNotASCII is returned when the dictionary input is not ASCII
|
|
var ErrNotASCII = errors.New("non-ASCII input")
|
|
|
|
// A node in the trie structure used to implement Aho-Corasick
|
|
type node struct {
|
|
root bool // true if this is the root
|
|
|
|
output bool // True means this node represents a blice that should
|
|
// be output when matching
|
|
|
|
b string // The path at this node
|
|
|
|
index int // index into original dictionary if output is true
|
|
|
|
counter int // Set to the value of the Matcher.counter when a
|
|
// match is output to prevent duplicate output
|
|
|
|
// The use of fixed size arrays is space-inefficient but fast for
|
|
// lookups.
|
|
|
|
child [maxchar]*node // A non-nil entry in this array means that the
|
|
// index represents a byte value which can be
|
|
// appended to the current node. Blices in the
|
|
// trie are built up byte by byte through these
|
|
// child node pointers.
|
|
|
|
fails [maxchar]*node // Where to fail to (by following the fail
|
|
// pointers) for each possible byte
|
|
|
|
suffix *node // Pointer to the longest possible strict suffix of
|
|
// this node
|
|
|
|
fail *node // Pointer to the next node which is in the dictionary
|
|
// which can be reached from here following suffixes. Called fail
|
|
// because it is used to fallback in the trie when a match fails.
|
|
}
|
|
|
|
// Matcher contains a list of blices to match against
|
|
type Matcher struct {
|
|
counter int // Counts the number of matches done, and is used to
|
|
// prevent output of multiple matches of the same string
|
|
trie []node // preallocated block of memory containing all the
|
|
// nodes
|
|
extent int // offset into trie that is currently free
|
|
root *node // Points to trie[0]
|
|
}
|
|
|
|
// findBlice looks for a blice in the trie starting from the root and
|
|
// returns a pointer to the node representing the end of the blice. If
|
|
// the blice is not found it returns nil.
|
|
func (m *Matcher) findBlice(b string) *node {
|
|
n := &m.trie[0]
|
|
|
|
for n != nil && len(b) > 0 {
|
|
n = n.child[int(b[0])]
|
|
b = b[1:]
|
|
}
|
|
|
|
return n
|
|
}
|
|
|
|
// getFreeNode: gets a free node structure from the Matcher's trie
|
|
// pool and updates the extent to point to the next free node.
|
|
func (m *Matcher) getFreeNode() *node {
|
|
m.extent++
|
|
|
|
if m.extent == 1 {
|
|
m.root = &m.trie[0]
|
|
m.root.root = true
|
|
}
|
|
|
|
return &m.trie[m.extent-1]
|
|
}
|
|
|
|
// buildTrie builds the fundamental trie structure from a set of
|
|
// blices.
|
|
func (m *Matcher) buildTrie(dictionary [][]byte) error {
|
|
|
|
// Work out the maximum size for the trie (all dictionary entries
|
|
// are distinct plus the root). This is used to preallocate memory
|
|
// for it.
|
|
|
|
max := 1
|
|
for _, blice := range dictionary {
|
|
max += len(blice)
|
|
}
|
|
m.trie = make([]node, max)
|
|
|
|
// Calling this an ignoring its argument simply allocated
|
|
// m.trie[0] which will be the root element
|
|
|
|
m.getFreeNode()
|
|
|
|
// This loop builds the nodes in the trie by following through
|
|
// each dictionary entry building the children pointers.
|
|
|
|
for _, blice := range dictionary {
|
|
n := m.root
|
|
for i, b := range blice {
|
|
idx := int(b)
|
|
if idx >= maxchar {
|
|
return ErrNotASCII
|
|
}
|
|
c := n.child[idx]
|
|
|
|
if c == nil {
|
|
c = m.getFreeNode()
|
|
n.child[idx] = c
|
|
c.b = string(blice[0 : i+1])
|
|
|
|
// Nodes directly under the root node will have the
|
|
// root as their fail point as there are no suffixes
|
|
// possible.
|
|
|
|
if i == 0 {
|
|
c.fail = m.root
|
|
}
|
|
|
|
c.suffix = m.root
|
|
}
|
|
|
|
n = c
|
|
}
|
|
|
|
// The last value of n points to the node representing a
|
|
// dictionary entry
|
|
|
|
n.output = true
|
|
n.index = len(blice)
|
|
}
|
|
|
|
l := new(list.List)
|
|
l.PushBack(m.root)
|
|
|
|
for l.Len() > 0 {
|
|
n := l.Remove(l.Front()).(*node)
|
|
|
|
for i := 0; i < maxchar; i++ {
|
|
c := n.child[i]
|
|
if c != nil {
|
|
l.PushBack(c)
|
|
|
|
for j := 1; j < len(c.b); j++ {
|
|
c.fail = m.findBlice(c.b[j:])
|
|
if c.fail != nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
if c.fail == nil {
|
|
c.fail = m.root
|
|
}
|
|
|
|
for j := 1; j < len(c.b); j++ {
|
|
s := m.findBlice(c.b[j:])
|
|
if s != nil && s.output {
|
|
c.suffix = s
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := 0; i < m.extent; i++ {
|
|
for c := 0; c < maxchar; c++ {
|
|
n := &m.trie[i]
|
|
for n.child[c] == nil && !n.root {
|
|
n = n.fail
|
|
}
|
|
|
|
m.trie[i].fails[c] = n
|
|
}
|
|
}
|
|
|
|
m.trie = m.trie[:m.extent]
|
|
return nil
|
|
}
|
|
|
|
// buildTrieString builds the fundamental trie structure from a []string
|
|
func (m *Matcher) buildTrieString(dictionary []string) error {
|
|
|
|
// Work out the maximum size for the trie (all dictionary entries
|
|
// are distinct plus the root). This is used to preallocate memory
|
|
// for it.
|
|
|
|
max := 1
|
|
for _, blice := range dictionary {
|
|
max += len(blice)
|
|
|
|
}
|
|
m.trie = make([]node, max)
|
|
|
|
// Calling this an ignoring its argument simply allocated
|
|
// m.trie[0] which will be the root element
|
|
|
|
m.getFreeNode()
|
|
|
|
// This loop builds the nodes in the trie by following through
|
|
// each dictionary entry building the children pointers.
|
|
|
|
for _, blice := range dictionary {
|
|
n := m.root
|
|
for i := 0; i < len(blice); i++ {
|
|
index := int(blice[i])
|
|
if index >= maxchar {
|
|
return ErrNotASCII
|
|
}
|
|
b := int(blice[i])
|
|
c := n.child[b]
|
|
if c == nil {
|
|
c = m.getFreeNode()
|
|
n.child[b] = c
|
|
c.b = blice[0 : i+1]
|
|
|
|
// Nodes directly under the root node will have the
|
|
// root as their fail point as there are no suffixes
|
|
// possible.
|
|
|
|
if i == 0 {
|
|
c.fail = m.root
|
|
}
|
|
|
|
c.suffix = m.root
|
|
}
|
|
|
|
n = c
|
|
}
|
|
|
|
// The last value of n points to the node representing a
|
|
// dictionary entry
|
|
|
|
n.output = true
|
|
n.index = len(blice)
|
|
}
|
|
|
|
l := new(list.List)
|
|
l.PushBack(m.root)
|
|
|
|
for l.Len() > 0 {
|
|
n := l.Remove(l.Front()).(*node)
|
|
|
|
for i := 0; i < maxchar; i++ {
|
|
c := n.child[i]
|
|
if c != nil {
|
|
l.PushBack(c)
|
|
|
|
for j := 1; j < len(c.b); j++ {
|
|
c.fail = m.findBlice(c.b[j:])
|
|
if c.fail != nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
if c.fail == nil {
|
|
c.fail = m.root
|
|
}
|
|
|
|
for j := 1; j < len(c.b); j++ {
|
|
s := m.findBlice(c.b[j:])
|
|
if s != nil && s.output {
|
|
c.suffix = s
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := 0; i < m.extent; i++ {
|
|
for c := 0; c < maxchar; c++ {
|
|
n := &m.trie[i]
|
|
for n.child[c] == nil && !n.root {
|
|
n = n.fail
|
|
}
|
|
|
|
m.trie[i].fails[c] = n
|
|
}
|
|
}
|
|
|
|
m.trie = m.trie[:m.extent]
|
|
return nil
|
|
}
|
|
|
|
// Compile creates a new Matcher using a list of []byte
|
|
func Compile(dictionary [][]byte) (*Matcher, error) {
|
|
m := new(Matcher)
|
|
if err := m.buildTrie(dictionary); err != nil {
|
|
return nil, err
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
// MustCompile returns a Matcher or panics
|
|
func MustCompile(dictionary [][]byte) *Matcher {
|
|
m, err := Compile(dictionary)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return m
|
|
}
|
|
|
|
// CompileString creates a new Matcher used to match against a set
|
|
// of strings (this is a helper to make initialization easy)
|
|
func CompileString(dictionary []string) (*Matcher, error) {
|
|
m := new(Matcher)
|
|
if err := m.buildTrieString(dictionary); err != nil {
|
|
return nil, err
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
// MustCompileString returns a Matcher or panics
|
|
func MustCompileString(dictionary []string) *Matcher {
|
|
m, err := CompileString(dictionary)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return m
|
|
}
|
|
|
|
// FindAll searches in for blices and returns all the blices found
|
|
// in the original dictionary
|
|
func (m *Matcher) FindAll(in []byte) [][]byte {
|
|
m.counter++
|
|
var hits [][]byte
|
|
|
|
n := m.root
|
|
|
|
for idx, b := range in {
|
|
c := int(b)
|
|
if c >= maxchar {
|
|
c = 0
|
|
}
|
|
if !n.root && n.child[c] == nil {
|
|
n = n.fails[c]
|
|
}
|
|
|
|
if n.child[c] != nil {
|
|
f := n.child[c]
|
|
n = f
|
|
|
|
if f.output && f.counter != m.counter {
|
|
hits = append(hits, in[idx-f.index+1:idx+1])
|
|
f.counter = m.counter
|
|
}
|
|
|
|
for !f.suffix.root {
|
|
f = f.suffix
|
|
if f.counter != m.counter {
|
|
hits = append(hits, in[idx-f.index+1:idx+1])
|
|
f.counter = m.counter
|
|
} else {
|
|
// There's no point working our way up the
|
|
// suffixes if it's been done before for this call
|
|
// to Match. The matches are already in hits.
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return hits
|
|
}
|
|
|
|
// FindAllString searches in for blices and returns all the blices (as strings) found as
|
|
// in the original dictionary
|
|
func (m *Matcher) FindAllString(in string) []string {
|
|
m.counter++
|
|
var hits []string
|
|
|
|
n := m.root
|
|
slen := len(in)
|
|
for idx := 0; idx < slen; idx++ {
|
|
c := int(in[idx])
|
|
if c >= maxchar {
|
|
c = 0
|
|
}
|
|
if !n.root && n.child[c] == nil {
|
|
n = n.fails[c]
|
|
}
|
|
|
|
if n.child[c] != nil {
|
|
f := n.child[c]
|
|
n = f
|
|
|
|
if f.output && f.counter != m.counter {
|
|
hits = append(hits, in[idx-f.index+1:idx+1])
|
|
f.counter = m.counter
|
|
}
|
|
|
|
for !f.suffix.root {
|
|
f = f.suffix
|
|
if f.counter != m.counter {
|
|
hits = append(hits, in[idx-f.index+1:idx+1])
|
|
f.counter = m.counter
|
|
} else {
|
|
// There's no point working our way up the
|
|
// suffixes if it's been done before for this call
|
|
// to Match. The matches are already in hits.
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return hits
|
|
}
|
|
|
|
// Match returns true if the input slice contains any subslices
|
|
func (m *Matcher) Match(in []byte) bool {
|
|
n := m.root
|
|
for _, b := range in {
|
|
c := int(b)
|
|
if c > maxchar {
|
|
c = 0
|
|
}
|
|
if !n.root && n.child[c] == nil {
|
|
n = n.fails[c]
|
|
}
|
|
|
|
if n.child[c] != nil {
|
|
n = n.child[c]
|
|
|
|
if n.output {
|
|
return true
|
|
}
|
|
|
|
for !n.suffix.root {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// MatchString returns true if the input slice contains any subslices
|
|
func (m *Matcher) MatchString(in string) bool {
|
|
n := m.root
|
|
slen := len(in)
|
|
for idx := 0; idx < slen; idx++ {
|
|
c := int(in[idx])
|
|
if c >= maxchar {
|
|
c = 0
|
|
}
|
|
if !n.root && n.child[c] == nil {
|
|
n = n.fails[c]
|
|
}
|
|
if n.child[c] != nil {
|
|
n = n.child[c]
|
|
|
|
if n.output {
|
|
return true
|
|
}
|
|
|
|
for !n.suffix.root {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|