{-# LANGUAGE BangPatterns, NamedFieldPuns #-} module Data.SearchEngine.SearchIndex ( SearchIndex, Term, TermId, DocId, emptySearchIndex, insertDoc, deleteDoc, docCount, lookupTerm, lookupTermsByPrefix, lookupTermId, lookupDocId, lookupDocKey, getTerm, getDocKey, invariant, ) where import Data.SearchEngine.DocIdSet (DocIdSet, DocId) import qualified Data.SearchEngine.DocIdSet as DocIdSet import Data.SearchEngine.DocTermIds (DocTermIds, TermId, vecIndexIx, vecCreateIx) import qualified Data.SearchEngine.DocTermIds as DocTermIds import Data.SearchEngine.DocFeatVals (DocFeatVals) import qualified Data.SearchEngine.DocFeatVals as DocFeatVals import Data.Ix (Ix) import qualified Data.Ix as Ix import Data.Map (Map) import qualified Data.Map as Map import Data.IntMap (IntMap) import qualified Data.IntMap as IntMap import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import Data.List (foldl') import Control.Exception (assert) -- | Terms are short strings, usually whole words. -- type Term = Text -- | The search index is essentially a many-to-many mapping between documents -- and terms. Each document contains many terms and each term occurs in many -- documents. It is a bidirectional mapping as we need to support lookups in -- both directions. -- -- Documents are identified by a key (in Ord) while terms are text values. -- Inside the index however we assign compact numeric ids to both documents and -- terms. The advantage of this is a much more compact in-memory representation -- and the disadvantage is greater complexity. In particular it means we have -- to manage bidirectional mappings between document keys and ids, and between -- terms and term ids. -- -- So the mappings we maintain can be depicted as: -- -- > Term <-- 1:1 --> TermId -- > \ ^ -- > \ | -- > 1:many many:many -- > \ | -- > \-> v -- > DocKey <-- 1:1 --> DocId -- -- For efficiency, these details are exposed in the interface. In particular -- the mapping from TermId to many DocIds is exposed via a 'DocIdSet', -- and the mapping from DocIds to TermIds is exposed via 'DocTermIds'. -- -- The main reason we need to keep the DocId -> TermId is to allow for -- efficient incremental updates. -- data SearchIndex key field feature = SearchIndex { -- the indexes termMap :: !(Map Term TermInfo), termIdMap :: !(IntMap TermIdInfo), docIdMap :: !(IntMap (DocInfo key field feature)), docKeyMap :: !(Map key DocId), -- auto-increment key counters nextTermId :: TermId, nextDocId :: DocId } deriving Show data TermInfo = TermInfo !TermId !DocIdSet deriving Show data TermIdInfo = TermIdInfo !Term !DocIdSet deriving (Show, Eq) data DocInfo key field feature = DocInfo !key !(DocTermIds field) !(DocFeatVals feature) deriving Show ----------------------- -- SearchIndex basics -- emptySearchIndex :: SearchIndex key field feature emptySearchIndex = SearchIndex Map.empty IntMap.empty IntMap.empty Map.empty minBound minBound checkInvariant :: (Ord key, Ix field, Bounded field) => SearchIndex key field feature -> SearchIndex key field feature checkInvariant si = assert (invariant si) si invariant :: (Ord key, Ix field, Bounded field) => SearchIndex key field feature -> Bool invariant SearchIndex{termMap, termIdMap, docKeyMap, docIdMap} = and [ IntMap.lookup (fromEnum termId) termIdMap == Just (TermIdInfo term docidset) | (term, (TermInfo termId docidset)) <- Map.assocs termMap ] && and [ case Map.lookup term termMap of Just (TermInfo termId' docidset') -> toEnum termId == termId' && docidset == docidset' Nothing -> False | (termId, (TermIdInfo term docidset)) <- IntMap.assocs termIdMap ] && and [ case IntMap.lookup (fromEnum docId) docIdMap of Just (DocInfo docKey' _ _) -> docKey == docKey' Nothing -> False | (docKey, docId) <- Map.assocs docKeyMap ] && and [ Map.lookup docKey docKeyMap == Just (toEnum docId) | (docId, DocInfo docKey _ _) <- IntMap.assocs docIdMap ] && and [ DocIdSet.invariant docIdSet | (_term, (TermInfo _ docIdSet)) <- Map.assocs termMap ] && and [ any (\field -> DocTermIds.fieldTermCount docterms field termId > 0) fields | (_term, (TermInfo termId docIdSet)) <- Map.assocs termMap , docId <- DocIdSet.toList docIdSet , let DocInfo _ docterms _ = docIdMap IntMap.! fromEnum docId ] && and [ IntMap.member (fromEnum termid) termIdMap | (_docId, DocInfo _ docTerms _) <- IntMap.assocs docIdMap , field <- fields , termid <- DocTermIds.fieldElems docTerms field ] where fields = Ix.range (minBound, maxBound) ------------------- -- Lookups -- docCount :: SearchIndex key field feature -> Int docCount SearchIndex{docIdMap} = IntMap.size docIdMap lookupTerm :: SearchIndex key field feature -> Term -> Maybe (TermId, DocIdSet) lookupTerm SearchIndex{termMap} term = case Map.lookup term termMap of Nothing -> Nothing Just (TermInfo termid docidset) -> Just (termid, docidset) lookupTermsByPrefix :: SearchIndex key field feature -> Term -> [(TermId, DocIdSet)] lookupTermsByPrefix SearchIndex{termMap} term = [ (termid, docidset) | (TermInfo termid docidset) <- lookupPrefix term termMap ] lookupTermId :: SearchIndex key field feature -> TermId -> DocIdSet lookupTermId SearchIndex{termIdMap} termid = case IntMap.lookup (fromEnum termid) termIdMap of Nothing -> error $ "lookupTermId: not found " ++ show termid Just (TermIdInfo _ docidset) -> docidset lookupDocId :: SearchIndex key field feature -> DocId -> (key, DocTermIds field, DocFeatVals feature) lookupDocId SearchIndex{docIdMap} docid = case IntMap.lookup (fromEnum docid) docIdMap of Nothing -> errNotFound Just (DocInfo key doctermids docfeatvals) -> (key, doctermids, docfeatvals) where errNotFound = error $ "lookupDocId: not found " ++ show docid lookupDocKey :: Ord key => SearchIndex key field feature -> key -> Maybe (DocTermIds field) lookupDocKey SearchIndex{docKeyMap, docIdMap} key = do case Map.lookup key docKeyMap of Nothing -> Nothing Just docid -> case IntMap.lookup (fromEnum docid) docIdMap of Nothing -> error "lookupDocKey: internal error" Just (DocInfo _key doctermids _) -> Just doctermids getTerm :: SearchIndex key field feature -> TermId -> Term getTerm SearchIndex{termIdMap} termId = case termIdMap IntMap.! fromEnum termId of TermIdInfo term _ -> term getTermId :: SearchIndex key field feature -> Term -> TermId getTermId SearchIndex{termMap} term = case termMap Map.! term of TermInfo termid _ -> termid getDocKey :: SearchIndex key field feature -> DocId -> key getDocKey SearchIndex{docIdMap} docid = case docIdMap IntMap.! fromEnum docid of DocInfo dockey _ _ -> dockey getDocTermIds :: SearchIndex key field feature -> DocId -> DocTermIds field getDocTermIds SearchIndex{docIdMap} docid = case docIdMap IntMap.! fromEnum docid of DocInfo _ doctermids _ -> doctermids -------------------- -- Insert & delete -- -- Procedure for adding a new doc... -- (key, field -> [Term]) -- alloc docid for key -- add term occurences for docid (include rev map for termid) -- construct indexdoc now that we have all the term -> termid entries -- insert indexdoc -- Procedure for updating a doc... -- (key, field -> [Term]) -- find docid for key -- lookup old terms for docid (using termid rev map) -- calc term occurrences to add, term occurrences to delete -- add new term occurrences, delete old term occurrences -- construct indexdoc now that we have all the term -> termid entries -- insert indexdoc -- Procedure for deleting a doc... -- (key, field -> [Term]) -- find docid for key -- lookup old terms for docid (using termid rev map) -- delete old term occurrences -- delete indexdoc -- | This is the representation for documents to be added to the index. -- Documents may -- type DocTerms field = field -> [Term] type DocFeatureValues feature = feature -> Float insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => key -> DocTerms field -> DocFeatureValues feature -> SearchIndex key field feature -> SearchIndex key field feature insertDoc key userDocTerms userDocFeats si@SearchIndex{docKeyMap} | Just docid <- Map.lookup key docKeyMap = -- Some older version of the doc is already present in the index, -- So we keep its docid. Now have to update the doc itself -- and update the terms by removing old ones and adding new ones. let oldTermsIds = getDocTermIds si docid userDocTerms' = memoiseDocTerms userDocTerms newTerms = docTermSet userDocTerms' oldTerms = docTermIdsTermSet si oldTermsIds -- We optimise for the typical case of significant overlap between -- the terms in the old and new versions of the document. delTerms = oldTerms `Set.difference` newTerms addTerms = newTerms `Set.difference` oldTerms -- Note: adding the doc relies on all the terms being in the termMap -- already, so we first add all the term occurences for the docid. in checkInvariant . insertDocIdToDocEntry docid key userDocTerms' userDocFeats . insertTermToDocIdEntries (Set.toList addTerms) docid . deleteTermToDocIdEntries (Set.toList delTerms) docid $ si | otherwise = -- We're dealing with a new doc, so allocate a docid for the key let (si', docid) = allocFreshDocId si userDocTerms' = memoiseDocTerms userDocTerms addTerms = docTermSet userDocTerms' -- Note: adding the doc relies on all the terms being in the termMap -- already, so we first add all the term occurences for the docid. in checkInvariant . insertDocIdToDocEntry docid key userDocTerms' userDocFeats . insertDocKeyToIdEntry key docid . insertTermToDocIdEntries (Set.toList addTerms) docid $ si' deleteDoc :: (Ord key, Ix field, Bounded field) => key -> SearchIndex key field feature -> SearchIndex key field feature deleteDoc key si@SearchIndex{docKeyMap} | Just docid <- Map.lookup key docKeyMap = let oldTermsIds = getDocTermIds si docid oldTerms = docTermIdsTermSet si oldTermsIds in checkInvariant . deleteDocEntry docid key . deleteTermToDocIdEntries (Set.toList oldTerms) docid $ si | otherwise = si ---------------------------------- -- Insert & delete support utils -- memoiseDocTerms :: (Ix field, Bounded field) => DocTerms field -> DocTerms field memoiseDocTerms docTermsFn = \field -> vecIndexIx vec field where vec = vecCreateIx docTermsFn docTermSet :: (Bounded t, Ix t) => DocTerms t -> Set.Set Term docTermSet docterms = Set.unions [ Set.fromList (docterms field) | field <- Ix.range (minBound, maxBound) ] docTermIdsTermSet :: (Bounded field, Ix field) => SearchIndex key field feature -> DocTermIds field -> Set.Set Term docTermIdsTermSet si doctermids = Set.unions [ Set.fromList terms | field <- Ix.range (minBound, maxBound) , let termids = DocTermIds.fieldElems doctermids field terms = map (getTerm si) termids ] -- -- The Term <-> DocId mapping -- -- | Add an entry into the 'Term' to 'DocId' mapping. insertTermToDocIdEntry :: Term -> DocId -> SearchIndex key field feature -> SearchIndex key field feature insertTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap, nextTermId} = case Map.lookup term termMap of Nothing -> let docIdSet' = DocIdSet.singleton docid !termInfo' = TermInfo nextTermId docIdSet' !termIdInfo' = TermIdInfo term docIdSet' in si { termMap = Map.insert term termInfo' termMap , termIdMap = IntMap.insert (fromEnum nextTermId) termIdInfo' termIdMap , nextTermId = succ nextTermId } Just (TermInfo termId docIdSet) -> let docIdSet' = DocIdSet.insert docid docIdSet !termInfo' = TermInfo termId docIdSet' !termIdInfo' = TermIdInfo term docIdSet' in si { termMap = Map.insert term termInfo' termMap , termIdMap = IntMap.insert (fromEnum termId) termIdInfo' termIdMap } -- | Add multiple entries into the 'Term' to 'DocId' mapping: many terms that -- map to the same document. insertTermToDocIdEntries :: [Term] -> DocId -> SearchIndex key field feature -> SearchIndex key field feature insertTermToDocIdEntries terms !docid si = foldl' (\si' term -> insertTermToDocIdEntry term docid si') si terms -- | Delete an entry from the 'Term' to 'DocId' mapping. deleteTermToDocIdEntry :: Term -> DocId -> SearchIndex key field feature -> SearchIndex key field feature deleteTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap} = case Map.lookup term termMap of Nothing -> si Just (TermInfo termId docIdSet) -> let docIdSet' = DocIdSet.delete docid docIdSet !termInfo' = TermInfo termId docIdSet' !termIdInfo' = TermIdInfo term docIdSet' in if DocIdSet.null docIdSet' then si { termMap = Map.delete term termMap , termIdMap = IntMap.delete (fromEnum termId) termIdMap } else si { termMap = Map.insert term termInfo' termMap , termIdMap = IntMap.insert (fromEnum termId) termIdInfo' termIdMap } -- | Delete multiple entries from the 'Term' to 'DocId' mapping: many terms -- that map to the same document. deleteTermToDocIdEntries :: [Term] -> DocId -> SearchIndex key field feature -> SearchIndex key field feature deleteTermToDocIdEntries terms !docid si = foldl' (\si' term -> deleteTermToDocIdEntry term docid si') si terms -- -- The DocId <-> Doc mapping -- allocFreshDocId :: SearchIndex key field feature -> (SearchIndex key field feature, DocId) allocFreshDocId si@SearchIndex{nextDocId} = let !si' = si { nextDocId = succ nextDocId } in (si', nextDocId) insertDocKeyToIdEntry :: Ord key => key -> DocId -> SearchIndex key field feature -> SearchIndex key field feature insertDocKeyToIdEntry dockey !docid si@SearchIndex{docKeyMap} = si { docKeyMap = Map.insert dockey docid docKeyMap } insertDocIdToDocEntry :: (Ix field, Bounded field, Ix feature, Bounded feature) => DocId -> key -> DocTerms field -> DocFeatureValues feature -> SearchIndex key field feature -> SearchIndex key field feature insertDocIdToDocEntry !docid dockey userdocterms userdocfeats si@SearchIndex{docIdMap} = let doctermids = DocTermIds.create (map (getTermId si) . userdocterms) docfeatvals= DocFeatVals.create userdocfeats !docinfo = DocInfo dockey doctermids docfeatvals in si { docIdMap = IntMap.insert (fromEnum docid) docinfo docIdMap } deleteDocEntry :: Ord key => DocId -> key -> SearchIndex key field feature -> SearchIndex key field feature deleteDocEntry docid key si@SearchIndex{docIdMap, docKeyMap} = si { docIdMap = IntMap.delete (fromEnum docid) docIdMap , docKeyMap = Map.delete key docKeyMap } -- -- Data.Map utils -- -- Data.Map does not support prefix lookups directly (unlike a trie) -- but we can implement it reasonably efficiently using split: -- | Lookup values for a range of keys (inclusive lower bound and exclusive -- upper bound) -- lookupRange :: Ord k => (k, k) -> Map k v -> [v] lookupRange (lb, ub) m = let (_, mv, gt) = Map.splitLookup lb m (between, _) = Map.split ub gt in case mv of Just v -> v : Map.elems between Nothing -> Map.elems between lookupPrefix :: Text -> Map Text v -> [v] lookupPrefix t _ | T.null t = [] lookupPrefix t m = lookupRange (t, prefixUpperBound t) m prefixUpperBound :: Text -> Text prefixUpperBound = succLast . T.dropWhileEnd (== maxBound) where succLast t = T.init t `T.snoc` succ (T.last t)