{- This code implements a Monoid for string matching; i.e. a data structure, MatchIdxs, which has the result of a string match function plus a little extra information for monoidally combining things together; the purpose of course is to parallelize string matching computations. This is the original version I wrote which uses type level strings to guarantee that only computations done with the same target string are combinable. The invariants I'd like to prove are in the comments on the MatchIdxs constructors. I'd be interested in a way to do it directly with this code (though it seems that would require extending LH), or doing it in a version of this code where the constructors have an extra argument with the value of the target-- however that formulation seems to raise issues about forcing those values to be the same. I have a version of the code lying around which tries this approach, but I couldn't get it to work. I chose to send this version first so you have the chance to see where I started; I could send you my version with the explicit terms if you like. If this goes well, then there are a couple of further things to think about: 1) I have another Monoid construction for actual splitting, but I don't think LH can handle the invariants as they require an isInfixOf operation; but I could be wrong about what LH can do. 2) It would be interesting to think about whether one can encode correctness, rather than just some invariants. By correctness I mean both that the monoid laws are satisfied, and that there is a homomorphic-like property to the effect of: match (x <> y) == match x <> match y Let me know if you have any questions about the code, or need more comments/explanation. -} {- LIQUID "--diff" @-} {-@ LIQUID "--scrape-used-imports" @-} {-@ LIQUID "--short-names" @-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} module MatchIdxs where import qualified Data.ByteString as BS -- RJ import qualified Data.ByteString.Search as BS import Data.Char -- RJ import Data.List.Split(chunksOf) import Data.Monoid import Data.Proxy import Debug.Trace import GHC.TypeLits import Language.Haskell.Liquid.Prelude (liquidAssert) import Prelude hiding (head, min, max) junk = BS.head traceMsg msg x = trace (msg ++ show x) x {-@ chunksBS :: Int -> b:BS.ByteString -> [BS.ByteString] / [(bLength b)] @-} chunksBS n' xs | BS.null xs = [] | otherwise = x : chunksBS n xs' where (x,xs') = BS.splitAt (liquidAssert (n > 0) n) xs n = max 1 n' bsToString :: BS.ByteString -> String bsToString = map (chr . fromIntegral) . BS.unpack stringToBS :: String -> BS.ByteString stringToBS = BS.pack . map (fromIntegral . ord) -- | get the (proxied and existentially boxed) type level Symbol for a bytestring someSymbolValBS :: BS.ByteString -> SomeSymbol someSymbolValBS = someSymbolVal . bsToString -- | get the bytestring corresponding to a type level Symbol mkTarg :: forall t . KnownSymbol t => Proxy t -> BS.ByteString mkTarg = stringToBS . symbolVal -- | Naive specification of string matching (from Bird) {-@ indicesSpec :: t:ByteStringNE -> s:BS.ByteString -> [OkPos t s] @-} -- indicesSpec targ = map ((BS.length targ -) . BS.length) . filter (targ `BS.isSuffixOf`) . BS.inits indicesSpec targ s = [ BS.length s' - BS.length targ | s' <- BS.inits s , targ `BS.isSuffixOf` s' ] indicesSpec :: BS.ByteString -> BS.ByteString -> [Int] -- | Datatype to name string matching algorithms; will use it's lifted -- version put choice in type. data Alg = BM -- ^ Boyer-Moore from stringsearch package | Spec -- ^ Naive spec {-@ indices :: Alg -> t:ByteStringNE -> s:BS.ByteString -> [OkPos t s] @-} indices BM = assumeIndices -- RJ BS.indices indices Spec = indicesSpec -- | Monoid -- -- 'MatchIdxs alg targ' denotes the result of running string matching -- algorithm 'alg' search for target 'targ' in some input. In addition -- to the match indices, information needed to combine this result -- with similar results on input to the left and right are also -- included. -- -- We'd like to prove that the invariants in the comments hold (|x| -- denotes the length of x). data MatchIdxs = Small { targ :: BS.ByteString , bs :: BS.ByteString } | MatchIdxs { targ :: BS.ByteString , input :: Int , left :: BS.ByteString , matches :: [Int] , right :: BS.ByteString } deriving (Eq, Show) {-@ data MatchIdxs = Small { targ :: ByteStringNE , bs :: {b:BS.ByteString | bLength b < bLength targ} } | MatchIdxs { targ :: ByteStringNE , input :: {i : Int | i >= bLength targ} , left :: {l : BS.ByteString | bLength l == bLength targ - 1} , matches :: [{v : Int | v <= input - bLength targ}] , right :: {r : BS.ByteString | bLength r == bLength targ - 1} } @-} matchIdxsIs :: MatchIdxs -> [Int] matchIdxsIs (Small _ _) = [] matchIdxsIs (MatchIdxs _ _ _ is _) = is -- | create a 'MatchIdxs' {-@ myIndices :: Alg -> t:ByteStringNE -> BS.ByteString -> MatchIdxsT t @-} myIndices alg t bs | BS.length bs > fringeLen = let right1 = BS.drop (BS.length bs - fringeLen) bs in MatchIdxs t (BS.length bs) left is right1 | otherwise = Small t bs where is = indices alg t bs fringeLen = BS.length t - 1 left = BS.take fringeLen bs -- right1 = BS.drop (BS.length bs - fringeLen) bs -- ISSUE: get contextual output with --diff -- ISSUE: why does lazyvar right1 not work? it drops the output type on right1! {- lazyvar right1 -} {-@ type OkPos Targ Str = {v:Nat | v <= bLength Str - bLength Targ} @-} {-@ type ByteStringNE = {v:BS.ByteString | bLength v > 0 } @-} {-@ type ByteStringN N = {v:BS.ByteString | bLength v == N} @-} {-@ type MatchIdxsT T = {v:MatchIdxs | targ v == T} @-} {-@ assume BS.isSuffixOf :: targ:_ -> s:_ -> {v:_ | v => (bLength targ <= bLength s) } @-} {-@ assume BS.length :: b:BS.ByteString -> {v:Nat | v == bLength b} @-} {-@ assume BS.empty :: {v:BS.ByteString | bLength v == 0} @-} {-@ assume BS.take :: n:Nat -> b:BS.ByteString -> ByteStringN {min n (bLength b)} @-} {-@ assume BS.drop :: n:Nat -> b:{BS.ByteString | n <= bLength b} -> ByteStringN {bLength b - n} @-} {-@ assume BS.inits :: b:BS.ByteString -> [{v:BS.ByteString | bLength v <= bLength b}] @-} {-@ assume BS.append :: b1:BS.ByteString -> b2:BS.ByteString -> ByteStringN {bLength b1 + bLength b2} @-} {-@ assume BS.null :: b:BS.ByteString -> {v:Bool | v <=> (bLength b == 0)} @-} {-@ assume BS.splitAt :: n:Nat -> b:BS.ByteString -> (ByteStringN {min n (bLength b)}, ByteStringN {max 0 (bLength b - n)}) @-} {-@ assume BS.head :: BS.ByteString -> Data.Word.Word8 @-} {-@ measure target @-} target :: MatchIdxs -> BS.ByteString target (Small t _) = t target (MatchIdxs t _ _ _ _) = t {-@ inline min @-} min :: Int -> Int -> Int min x y = if x <= y then x else y {-@ inline max @-} max :: Int -> Int -> Int max x y = if x <= y then y else x -- RJ instance (KnownSymbol t, StringMatch alg) => Monoid (MatchIdxs alg t) where {-@ mmempty :: t:ByteStringNE -> MatchIdxsT t @-} mmempty t = Small t BS.empty {-@ mmconcat :: (Foldable t) => Alg -> tg:ByteStringNE -> t (MatchIdxsT tg) -> (MatchIdxsT tg) @-} mmconcat alg t = foldr (mmappend alg t) (mmempty t) {-@ qualif BB(v:Int, n:Int, d:Int, b:BS.ByteString): v <= (n + d) - bLength b @-} {-@ mmappend :: Alg -> t:ByteStringNE -> MatchIdxsT t -> MatchIdxsT t -> MatchIdxsT t @-} mmappend alg t mx my = let fringeLen = BS.length t - 1 idxFun = indices alg t in case (mx, my) of (Small tx x, Small _ y) -> myIndices alg tx (x <> y) (Small tx x, MatchIdxs _ yLen ly iy rt) -> MatchIdxs tx xyLen lt is rt where xyLen = xLen + yLen xLen = BS.length x xly = BS.append x ly lt = BS.take fringeLen xly is = idxFun xly ++ map (+ xLen) iy (MatchIdxs tx xLen lt ix rx, Small ty y) -> MatchIdxs tx xyLen lt (ix ++ is) rt where xyLen = xLen + yLen yLen = BS.length y is = map (+ (xLen - fringeLen)) (idxFun rxy) rt = BS.drop (BS.length rxy - fringeLen) rxy rxy = BS.append rx y (MatchIdxs tx xLen lt ix rx, MatchIdxs ty yLen ly iy rt) -> MatchIdxs tx xyLen lt (ix ++ is) rt where xyLen = xLen + yLen is = ixy ++ map (+ xLen) iy ixy = map (+ (xLen - fringeLen)) $ idxFun (BS.append rx ly) -- | Example applications -- -- The bufLen and chunkSz arguments are there to exercise the monoid, -- though they also foreshadow a parallel implementation. {-@ indicesBS' :: Alg -> Int -> Int -> ByteStringNE -> BS.ByteString -> [Int] @-} indicesBS' alg bufLen chunkSz t bs = let si = mmconcat alg t . map (mmconcat alg t) . chunksOf bufLen . map (myIndices alg t) $ chunksBS chunkSz bs in matchIdxsIs si {-@ indicesBS, indicesNaive :: Int -> Int -> ByteStringNE -> BS.ByteString -> [Int] @-} indicesBS = indicesBS' BM -- RJ (Proxy :: Proxy BM) indicesNaive = indicesBS' Spec -- RJ (Proxy :: Proxy Spec) {-@ isInfixOfBS :: Int -> Int -> ByteStringNE -> BS.ByteString -> Bool @-} isInfixOfBS bufLen chunkSz t = not . null . indicesBS bufLen chunkSz t ------------------------------------------------------------------------------------------ {-@ invariant {v:BS.ByteString | 0 <= bLength v } @-} {-@ measure bLength :: BS.ByteString -> Int @-} {-@ type LNat N = {v:Nat | v < N} @-} chunksOf :: Int -> [a] -> [[a]] chunksOf = undefined {-@ assumeIndices :: t:ByteStringNE -> s:BS.ByteString -> [OkPos t s] @-} assumeIndices :: BS.ByteString -> BS.ByteString -> [Int] assumeIndices = undefined