> {-# LANGUAGE ForeignFunctionInterface, ScopedTypeVariables, EmptyDataDecls, BangPatterns #-} > module Data.PerfectHash ( PerfectHash, fromList, lookup, lookupByIndex ) where > import Array > import Data.Array.IO > import Foreign > import Foreign.C.String > import Foreign.C.Types > import Foreign.Marshal.Array > import Prelude hiding (lookup) > import System.IO.Unsafe > import qualified Data.ByteString.Char8 as S > import qualified Data.ByteString.Unsafe as Unsafe > import Data.Array.Storable > import Data.Digest.CRC32 (crc32) > import Data.Digest.Adler32 (adler32) > import Control.Monad(guard) Arguably the FFI stuff should be in a separate file, but let's keep it simple for the moment. > foreign import ccall unsafe "cmph.h cmph_search" c_cmph_search :: Ptr ForeignHash -> CString -> CInt -> CULong > foreign import ccall unsafe "stub.h build_hash" c_build_hash :: Ptr CString -> CInt -> IO (Ptr ForeignHash) > foreign import ccall unsafe "string.h strdup" c_strdup :: CString -> IO CString > foreign import ccall unsafe "string.h strncmp" c_strncmp :: CString -> CString -> CInt -> IO CInt standard idiom for an opaque type > data ForeignHash > data PerfectHash a = PerfectHash { store :: !(Array.Array Word32 (a,CString)), > hashFunc :: !(S.ByteString -> Word32) } pretty certain this could be improved > use_hash a str = {-# SCC "use_hash" #-} unsafePerformIO $ Unsafe.unsafeUseAsCStringLen str > (\(cstr,i) -> return (fromIntegral $ c_cmph_search a cstr (fromIntegral i))) > raw_hashfunc hash cstr len = fromIntegral $ c_cmph_search hash cstr len This could do with being broken up a little, probably > fromList :: Show a => [(S.ByteString, a)] -> PerfectHash a > fromList ls = unsafePerformIO $ do > let len = length ls > arr <- newArray_ (0, len-1) > -- we make one pass over ls, then throw it away > cstr_ptrs <- mapM (\(i,(bs, val)) -> > S.useAsCStringLen bs $ \(cstr,len) -> do > newPtr <- c_strdup cstr > writeArray arr i newPtr > return (newPtr,fromIntegral len,val)) > (zip [0..] ls) > cmph <- withStorableArray arr $ \ptr -> c_build_hash ptr (fromIntegral len) > let bounds :: (Word32, Word32) = (fromIntegral 0, fromIntegral len - 1) > arr <- newArray_ bounds :: IO (IOArray Word32 a) > checksums <- newArray_ bounds :: IO (IOArray Word32 Word32) > let hashFunc = use_hash cmph > mapM_ (\(cstr,len,val) -> do > let index = raw_hashfunc cmph cstr len > writeArray arr index (val,cstr)) $ cstr_ptrs > i_arr <- freeze arr > return PerfectHash { store = i_arr, > hashFunc = use_hash cmph } > -- > crc = crc32 -- > crc = adler32 > lookup :: PerfectHash a -> S.ByteString -> Maybe a > lookup !hash !bs > | check = Just e > | otherwise = Nothing > where index = {-# SCC "hash_only" #-} hashFunc hash bs > (!low, !high) = Array.bounds arr > !arr = store hash > (e, str) = arr ! index > !check = {-# SCC "check" #-} low <= index && high >= index && > {-# SCC "bs_check" #-} unsafePerformIO $ Unsafe.unsafeUseAsCStringLen bs (\(cstr,len) -> > c_strncmp str cstr (fromIntegral len) >>= \res -> return (res == 0)) sometimes it's convenient to have direct access, but this should probably be seen as a slightly devious use case. > lookupByIndex :: PerfectHash a -> Word32 -> Maybe S.ByteString > lookupByIndex hash index = do > guard $ low <= index && high >= index > return $ unsafePerformIO $ S.packCString $ snd $ arr ! index > where (!low, !high) = bounds arr > arr = store hash