The Aho–Corasick Automaton in Haskell
It is common that one might want to match different strings against one single text of length . One can of course apply the KMP algorithm individually, and result an algorithm that runs in time.
Faster algorithms are known. The idea is to build an finite state transducer that can output which strings is the suffix of the string it read. The Aho-Corasick automaton is a compressed version of such transducer, as the size does not depend on the size of the alphabet.
import Control.Arrow (first)
import Data.Function (on)
import Data.List (lookup, partition)
import Data.Maybe (fromMaybe, Maybe (..))
import Data.Monoid (All (..), Monoid, getAll, mappend, mconcat,
mempty)
data Automaton a b = Node {delta :: a -> Automaton a b,
output :: b
}
equivalentClasses :: (a->a->Bool)->[a]->[[a]]
= foldl parts []
equivalentClasses eq where parts [] a = [[a]]
:xs) a
parts (x| eq (head x) a = (a:x):xs
| otherwise = x:parts xs a
buildAutomaton :: (Monoid b,Eq a) => [([a],b)] -> Automaton a b
= automaton
buildAutomaton xs where automaton = build (const automaton) xs mempty
build :: (Monoid b,Eq a)=> (a -> Automaton a b) -> [([a],b)] -> b -> Automaton a b
= node
build trans xs out where node = Node (\x->fromMaybe (trans x) (lookup x table)) out
= map transPair $ equivalentClasses (on (==) (head . fst)) xs
table = (a, build (delta (trans a)) ys out)
transPair xs where a = head $ fst $ head xs
= partition (not . null . fst) $ map (first tail) xs
(ys,zs) = mappend (mconcat $ map snd zs) (output $ trans a)
out
match :: Eq a => Automaton a b -> [a] -> [b]
= map output $ scanl delta a xs
match a xs
match' :: Eq a => [[a]] -> [a] -> [[[a]]]
= match (buildAutomaton $ map (\x-> (x,[x])) pat)
match' pat
isInfixOf' :: Eq a => [a] -> [a] -> Bool
= getAll $ mconcat $ match (buildAutomaton [(xs, All True)]) ys isInfixOf' xs ys