The Aho–Corasick Automaton in Haskell
It is common that one might want to match different strings against one single text of length . One can of course apply the KMP algorithm individually, and result an algorithm that runs in time.
Faster algorithms are known. The idea is to build an finite state transducer that can output which strings is the suffix of the string it read. The Aho-Corasick automaton is a compressed version of such transducer, as the size does not depend on the size of the alphabet.
import Control.Arrow (first)
import Data.Function (on)
import Data.List (lookup, partition)
import Data.Maybe (fromMaybe, Maybe (..))
import Data.Monoid (All (..), Monoid, getAll, mappend, mconcat,
mempty)
data Automaton a b = Node {delta :: a -> Automaton a b,
output :: b
}
equivalentClasses :: (a->a->Bool)->[a]->[[a]]
equivalentClasses eq = foldl parts []
where parts [] a = [[a]]
parts (x:xs) a
| eq (head x) a = (a:x):xs
| otherwise = x:parts xs a
buildAutomaton :: (Monoid b,Eq a) => [([a],b)] -> Automaton a b
buildAutomaton xs = automaton
where automaton = build (const automaton) xs mempty
build :: (Monoid b,Eq a)=> (a -> Automaton a b) -> [([a],b)] -> b -> Automaton a b
build trans xs out = node
where node = Node (\x->fromMaybe (trans x) (lookup x table)) out
table = map transPair $ equivalentClasses (on (==) (head . fst)) xs
transPair xs = (a, build (delta (trans a)) ys out)
where a = head $ fst $ head xs
(ys,zs) = partition (not . null . fst) $ map (first tail) xs
out = mappend (mconcat $ map snd zs) (output $ trans a)
match :: Eq a => Automaton a b -> [a] -> [b]
match a xs = map output $ scanl delta a xs
match' :: Eq a => [[a]] -> [a] -> [[[a]]]
match' pat = match (buildAutomaton $ map (\x-> (x,[x])) pat)
isInfixOf' :: Eq a => [a] -> [a] -> Bool
isInfixOf' xs ys = getAll $ mconcat $ match (buildAutomaton [(xs, All True)]) ys